bump
[glibc.git] / posix / regexec.c
blob021fcf4060fee1ac5c9c1a969238562672d222fc
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65 static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71 static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81 static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 #ifdef RE_ENABLE_I18N
88 static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183 # ifdef _LIBC
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186 # endif /* _LIBC */
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192 static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
199 string STRING.
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg, string, nmatch, pmatch, eflags)
214 const regex_t *__restrict preg;
215 const char *__restrict string;
216 size_t nmatch;
217 regmatch_t pmatch[];
218 int eflags;
220 reg_errcode_t err;
221 int start, length;
222 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
224 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
225 return REG_BADPAT;
227 if (eflags & REG_STARTEND)
229 start = pmatch[0].rm_so;
230 length = pmatch[0].rm_eo;
232 else
234 start = 0;
235 length = strlen (string);
238 __libc_lock_lock (dfa->lock);
239 if (preg->no_sub)
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, 0, NULL, eflags);
242 else
243 err = re_search_internal (preg, string, length, start, length - start,
244 length, nmatch, pmatch, eflags);
245 __libc_lock_unlock (dfa->lock);
246 return err != REG_NOERROR;
249 #ifdef _LIBC
250 # include <shlib-compat.h>
251 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
253 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
254 __typeof__ (__regexec) __compat_regexec;
257 attribute_compat_text_section
258 __compat_regexec (const regex_t *__restrict preg,
259 const char *__restrict string, size_t nmatch,
260 regmatch_t pmatch[], int eflags)
262 return regexec (preg, string, nmatch, pmatch,
263 eflags & (REG_NOTBOL | REG_NOTEOL));
265 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
266 # endif
267 #endif
269 /* Entry points for GNU code. */
271 /* re_match, re_search, re_match_2, re_search_2
273 The former two functions operate on STRING with length LENGTH,
274 while the later two operate on concatenation of STRING1 and STRING2
275 with lengths LENGTH1 and LENGTH2, respectively.
277 re_match() matches the compiled pattern in BUFP against the string,
278 starting at index START.
280 re_search() first tries matching at index START, then it tries to match
281 starting from index START + 1, and so on. The last start position tried
282 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
283 way as re_match().)
285 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
286 the first STOP characters of the concatenation of the strings should be
287 concerned.
289 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
290 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
291 computed relative to the concatenation, not relative to the individual
292 strings.)
294 On success, re_match* functions return the length of the match, re_search*
295 return the position of the start of the match. Return value -1 means no
296 match was found and -2 indicates an internal error. */
299 re_match (bufp, string, length, start, regs)
300 struct re_pattern_buffer *bufp;
301 const char *string;
302 int length, start;
303 struct re_registers *regs;
305 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
307 #ifdef _LIBC
308 weak_alias (__re_match, re_match)
309 #endif
312 re_search (bufp, string, length, start, range, regs)
313 struct re_pattern_buffer *bufp;
314 const char *string;
315 int length, start, range;
316 struct re_registers *regs;
318 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
320 #ifdef _LIBC
321 weak_alias (__re_search, re_search)
322 #endif
325 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
326 struct re_pattern_buffer *bufp;
327 const char *string1, *string2;
328 int length1, length2, start, stop;
329 struct re_registers *regs;
331 return re_search_2_stub (bufp, string1, length1, string2, length2,
332 start, 0, regs, stop, 1);
334 #ifdef _LIBC
335 weak_alias (__re_match_2, re_match_2)
336 #endif
339 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
340 struct re_pattern_buffer *bufp;
341 const char *string1, *string2;
342 int length1, length2, start, range, stop;
343 struct re_registers *regs;
345 return re_search_2_stub (bufp, string1, length1, string2, length2,
346 start, range, regs, stop, 0);
348 #ifdef _LIBC
349 weak_alias (__re_search_2, re_search_2)
350 #endif
352 static int
353 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
354 stop, ret_len)
355 struct re_pattern_buffer *bufp;
356 const char *string1, *string2;
357 int length1, length2, start, range, stop, ret_len;
358 struct re_registers *regs;
360 const char *str;
361 int rval;
362 int len = length1 + length2;
363 int free_str = 0;
365 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
366 return -2;
368 /* Concatenate the strings. */
369 if (length2 > 0)
370 if (length1 > 0)
372 char *s = re_malloc (char, len);
374 if (BE (s == NULL, 0))
375 return -2;
376 memcpy (s, string1, length1);
377 memcpy (s + length1, string2, length2);
378 str = s;
379 free_str = 1;
381 else
382 str = string2;
383 else
384 str = string1;
386 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
387 ret_len);
388 if (free_str)
389 re_free ((char *) str);
390 return rval;
393 /* The parameters have the same meaning as those of re_search.
394 Additional parameters:
395 If RET_LEN is nonzero the length of the match is returned (re_match style);
396 otherwise the position of the match is returned. */
398 static int
399 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
400 struct re_pattern_buffer *bufp;
401 const char *string;
402 int length, start, range, stop, ret_len;
403 struct re_registers *regs;
405 reg_errcode_t result;
406 regmatch_t *pmatch;
407 int nregs, rval;
408 int eflags = 0;
409 re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
411 /* Check for out-of-range. */
412 if (BE (start < 0 || start > length, 0))
413 return -1;
414 if (BE (start + range > length, 0))
415 range = length - start;
416 else if (BE (start + range < 0, 0))
417 range = -start;
419 __libc_lock_lock (dfa->lock);
421 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
422 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
424 /* Compile fastmap if we haven't yet. */
425 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
426 re_compile_fastmap (bufp);
428 if (BE (bufp->no_sub, 0))
429 regs = NULL;
431 /* We need at least 1 register. */
432 if (regs == NULL)
433 nregs = 1;
434 else if (BE (bufp->regs_allocated == REGS_FIXED &&
435 regs->num_regs < bufp->re_nsub + 1, 0))
437 nregs = regs->num_regs;
438 if (BE (nregs < 1, 0))
440 /* Nothing can be copied to regs. */
441 regs = NULL;
442 nregs = 1;
445 else
446 nregs = bufp->re_nsub + 1;
447 pmatch = re_malloc (regmatch_t, nregs);
448 if (BE (pmatch == NULL, 0))
450 rval = -2;
451 goto out;
454 result = re_search_internal (bufp, string, length, start, range, stop,
455 nregs, pmatch, eflags);
457 rval = 0;
459 /* I hope we needn't fill ther regs with -1's when no match was found. */
460 if (result != REG_NOERROR)
461 rval = -1;
462 else if (regs != NULL)
464 /* If caller wants register contents data back, copy them. */
465 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
466 bufp->regs_allocated);
467 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
468 rval = -2;
471 if (BE (rval == 0, 1))
473 if (ret_len)
475 assert (pmatch[0].rm_so == start);
476 rval = pmatch[0].rm_eo - start;
478 else
479 rval = pmatch[0].rm_so;
481 re_free (pmatch);
482 out:
483 __libc_lock_unlock (dfa->lock);
484 return rval;
487 static unsigned
488 re_copy_regs (regs, pmatch, nregs, regs_allocated)
489 struct re_registers *regs;
490 regmatch_t *pmatch;
491 int nregs, regs_allocated;
493 int rval = REGS_REALLOCATE;
494 int i;
495 int need_regs = nregs + 1;
496 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
497 uses. */
499 /* Have the register data arrays been allocated? */
500 if (regs_allocated == REGS_UNALLOCATED)
501 { /* No. So allocate them with malloc. */
502 regs->start = re_malloc (regoff_t, need_regs);
503 regs->end = re_malloc (regoff_t, need_regs);
504 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
505 return REGS_UNALLOCATED;
506 regs->num_regs = need_regs;
508 else if (regs_allocated == REGS_REALLOCATE)
509 { /* Yes. If we need more elements than were already
510 allocated, reallocate them. If we need fewer, just
511 leave it alone. */
512 if (BE (need_regs > regs->num_regs, 0))
514 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
515 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
516 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
517 return REGS_UNALLOCATED;
518 regs->start = new_start;
519 regs->end = new_end;
520 regs->num_regs = need_regs;
523 else
525 assert (regs_allocated == REGS_FIXED);
526 /* This function may not be called with REGS_FIXED and nregs too big. */
527 assert (regs->num_regs >= nregs);
528 rval = REGS_FIXED;
531 /* Copy the regs. */
532 for (i = 0; i < nregs; ++i)
534 regs->start[i] = pmatch[i].rm_so;
535 regs->end[i] = pmatch[i].rm_eo;
537 for ( ; i < regs->num_regs; ++i)
538 regs->start[i] = regs->end[i] = -1;
540 return rval;
543 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
544 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
545 this memory for recording register information. STARTS and ENDS
546 must be allocated using the malloc library routine, and must each
547 be at least NUM_REGS * sizeof (regoff_t) bytes long.
549 If NUM_REGS == 0, then subsequent matches should allocate their own
550 register data.
552 Unless this function is called, the first search or match using
553 PATTERN_BUFFER will allocate its own register data, without
554 freeing the old data. */
556 void
557 re_set_registers (bufp, regs, num_regs, starts, ends)
558 struct re_pattern_buffer *bufp;
559 struct re_registers *regs;
560 unsigned num_regs;
561 regoff_t *starts, *ends;
563 if (num_regs)
565 bufp->regs_allocated = REGS_REALLOCATE;
566 regs->num_regs = num_regs;
567 regs->start = starts;
568 regs->end = ends;
570 else
572 bufp->regs_allocated = REGS_UNALLOCATED;
573 regs->num_regs = 0;
574 regs->start = regs->end = (regoff_t *) 0;
577 #ifdef _LIBC
578 weak_alias (__re_set_registers, re_set_registers)
579 #endif
581 /* Entry points compatible with 4.2 BSD regex library. We don't define
582 them unless specifically requested. */
584 #if defined _REGEX_RE_COMP || defined _LIBC
586 # ifdef _LIBC
587 weak_function
588 # endif
589 re_exec (s)
590 const char *s;
592 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
594 #endif /* _REGEX_RE_COMP */
596 /* Internal entry point. */
598 /* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
601 with re_search.
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
607 static reg_errcode_t
608 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
609 eflags)
610 const regex_t *preg;
611 const char *string;
612 int length, start, range, stop, eflags;
613 size_t nmatch;
614 regmatch_t pmatch[];
616 reg_errcode_t err;
617 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
618 int left_lim, right_lim, incr;
619 int fl_longest_match, match_first, match_kind, match_last = -1;
620 int sb, ch;
621 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
622 re_match_context_t mctx = { .dfa = dfa };
623 #else
624 re_match_context_t mctx;
625 #endif
626 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
627 && range && !preg->can_be_null) ? preg->fastmap : NULL;
628 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
630 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
631 memset (&mctx, '\0', sizeof (re_match_context_t));
632 mctx.dfa = dfa;
633 #endif
635 /* Check if the DFA haven't been compiled. */
636 if (BE (preg->used == 0 || dfa->init_state == NULL
637 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
638 || dfa->init_state_begbuf == NULL, 0))
639 return REG_NOMATCH;
641 #ifdef DEBUG
642 /* We assume front-end functions already check them. */
643 assert (start + range >= 0 && start + range <= length);
644 #endif
646 /* If initial states with non-begbuf contexts have no elements,
647 the regex must be anchored. If preg->newline_anchor is set,
648 we'll never use init_state_nl, so do not check it. */
649 if (dfa->init_state->nodes.nelem == 0
650 && dfa->init_state_word->nodes.nelem == 0
651 && (dfa->init_state_nl->nodes.nelem == 0
652 || !preg->newline_anchor))
654 if (start != 0 && start + range != 0)
655 return REG_NOMATCH;
656 start = range = 0;
659 /* We must check the longest matching, if nmatch > 0. */
660 fl_longest_match = (nmatch != 0 || dfa->nbackref);
662 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
663 preg->translate, preg->syntax & RE_ICASE, dfa);
664 if (BE (err != REG_NOERROR, 0))
665 goto free_return;
666 mctx.input.stop = stop;
667 mctx.input.raw_stop = stop;
668 mctx.input.newline_anchor = preg->newline_anchor;
670 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
671 if (BE (err != REG_NOERROR, 0))
672 goto free_return;
674 /* We will log all the DFA states through which the dfa pass,
675 if nmatch > 1, or this dfa has "multibyte node", which is a
676 back-reference or a node which can accept multibyte character or
677 multi character collating element. */
678 if (nmatch > 1 || dfa->has_mb_node)
680 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
681 if (BE (mctx.state_log == NULL, 0))
683 err = REG_ESPACE;
684 goto free_return;
687 else
688 mctx.state_log = NULL;
690 match_first = start;
691 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
692 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
694 /* Check incrementally whether of not the input string match. */
695 incr = (range < 0) ? -1 : 1;
696 left_lim = (range < 0) ? start + range : start;
697 right_lim = (range < 0) ? start : start + range;
698 sb = dfa->mb_cur_max == 1;
699 match_kind =
700 (fastmap
701 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
702 | (range >= 0 ? 2 : 0)
703 | (t != NULL ? 1 : 0))
704 : 8);
706 for (;; match_first += incr)
708 err = REG_NOMATCH;
709 if (match_first < left_lim || right_lim < match_first)
710 goto free_return;
712 /* Advance as rapidly as possible through the string, until we
713 find a plausible place to start matching. This may be done
714 with varying efficiency, so there are various possibilities:
715 only the most common of them are specialized, in order to
716 save on code size. We use a switch statement for speed. */
717 switch (match_kind)
719 case 8:
720 /* No fastmap. */
721 break;
723 case 7:
724 /* Fastmap with single-byte translation, match forward. */
725 while (BE (match_first < right_lim, 1)
726 && !fastmap[t[(unsigned char) string[match_first]]])
727 ++match_first;
728 goto forward_match_found_start_or_reached_end;
730 case 6:
731 /* Fastmap without translation, match forward. */
732 while (BE (match_first < right_lim, 1)
733 && !fastmap[(unsigned char) string[match_first]])
734 ++match_first;
736 forward_match_found_start_or_reached_end:
737 if (BE (match_first == right_lim, 0))
739 ch = match_first >= length
740 ? 0 : (unsigned char) string[match_first];
741 if (!fastmap[t ? t[ch] : ch])
742 goto free_return;
744 break;
746 case 4:
747 case 5:
748 /* Fastmap without multi-byte translation, match backwards. */
749 while (match_first >= left_lim)
751 ch = match_first >= length
752 ? 0 : (unsigned char) string[match_first];
753 if (fastmap[t ? t[ch] : ch])
754 break;
755 --match_first;
757 if (match_first < left_lim)
758 goto free_return;
759 break;
761 default:
762 /* In this case, we can't determine easily the current byte,
763 since it might be a component byte of a multibyte
764 character. Then we use the constructed buffer instead. */
765 for (;;)
767 /* If MATCH_FIRST is out of the valid range, reconstruct the
768 buffers. */
769 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
770 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
772 err = re_string_reconstruct (&mctx.input, match_first,
773 eflags);
774 if (BE (err != REG_NOERROR, 0))
775 goto free_return;
777 offset = match_first - mctx.input.raw_mbs_idx;
779 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
780 Note that MATCH_FIRST must not be smaller than 0. */
781 ch = (match_first >= length
782 ? 0 : re_string_byte_at (&mctx.input, offset));
783 if (fastmap[ch])
784 break;
785 match_first += incr;
786 if (match_first < left_lim || match_first > right_lim)
788 err = REG_NOMATCH;
789 goto free_return;
792 break;
795 /* Reconstruct the buffers so that the matcher can assume that
796 the matching starts from the beginning of the buffer. */
797 err = re_string_reconstruct (&mctx.input, match_first, eflags);
798 if (BE (err != REG_NOERROR, 0))
799 goto free_return;
801 #ifdef RE_ENABLE_I18N
802 /* Don't consider this char as a possible match start if it part,
803 yet isn't the head, of a multibyte character. */
804 if (!sb && !re_string_first_byte (&mctx.input, 0))
805 continue;
806 #endif
808 /* It seems to be appropriate one, then use the matcher. */
809 /* We assume that the matching starts from 0. */
810 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
811 match_last = check_matching (&mctx, fl_longest_match,
812 range >= 0 ? &match_first : NULL);
813 if (match_last != -1)
815 if (BE (match_last == -2, 0))
817 err = REG_ESPACE;
818 goto free_return;
820 else
822 mctx.match_last = match_last;
823 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
825 re_dfastate_t *pstate = mctx.state_log[match_last];
826 mctx.last_node = check_halt_state_context (&mctx, pstate,
827 match_last);
829 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
830 || dfa->nbackref)
832 err = prune_impossible_nodes (&mctx);
833 if (err == REG_NOERROR)
834 break;
835 if (BE (err != REG_NOMATCH, 0))
836 goto free_return;
837 match_last = -1;
839 else
840 break; /* We found a match. */
844 match_ctx_clean (&mctx);
847 #ifdef DEBUG
848 assert (match_last != -1);
849 assert (err == REG_NOERROR);
850 #endif
852 /* Set pmatch[] if we need. */
853 if (nmatch > 0)
855 int reg_idx;
857 /* Initialize registers. */
858 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
859 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
861 /* Set the points where matching start/end. */
862 pmatch[0].rm_so = 0;
863 pmatch[0].rm_eo = mctx.match_last;
865 if (!preg->no_sub && nmatch > 1)
867 err = set_regs (preg, &mctx, nmatch, pmatch,
868 dfa->has_plural_match && dfa->nbackref > 0);
869 if (BE (err != REG_NOERROR, 0))
870 goto free_return;
873 /* At last, add the offset to the each registers, since we slided
874 the buffers so that we could assume that the matching starts
875 from 0. */
876 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
877 if (pmatch[reg_idx].rm_so != -1)
879 #ifdef RE_ENABLE_I18N
880 if (BE (mctx.input.offsets_needed != 0, 0))
882 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
883 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
884 else
885 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
886 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
887 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
888 else
889 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
891 #else
892 assert (mctx.input.offsets_needed == 0);
893 #endif
894 pmatch[reg_idx].rm_so += match_first;
895 pmatch[reg_idx].rm_eo += match_first;
898 if (dfa->subexp_map)
899 for (reg_idx = 0;
900 reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
901 reg_idx++)
902 if (dfa->subexp_map[reg_idx] != reg_idx)
904 pmatch[reg_idx + 1].rm_so
905 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
906 pmatch[reg_idx + 1].rm_eo
907 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
911 free_return:
912 re_free (mctx.state_log);
913 if (dfa->nbackref)
914 match_ctx_free (&mctx);
915 re_string_destruct (&mctx.input);
916 return err;
919 static reg_errcode_t
920 prune_impossible_nodes (mctx)
921 re_match_context_t *mctx;
923 re_dfa_t *const dfa = mctx->dfa;
924 int halt_node, match_last;
925 reg_errcode_t ret;
926 re_dfastate_t **sifted_states;
927 re_dfastate_t **lim_states = NULL;
928 re_sift_context_t sctx;
929 #ifdef DEBUG
930 assert (mctx->state_log != NULL);
931 #endif
932 match_last = mctx->match_last;
933 halt_node = mctx->last_node;
934 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
935 if (BE (sifted_states == NULL, 0))
937 ret = REG_ESPACE;
938 goto free_return;
940 if (dfa->nbackref)
942 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
943 if (BE (lim_states == NULL, 0))
945 ret = REG_ESPACE;
946 goto free_return;
948 while (1)
950 memset (lim_states, '\0',
951 sizeof (re_dfastate_t *) * (match_last + 1));
952 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
953 match_last);
954 ret = sift_states_backward (mctx, &sctx);
955 re_node_set_free (&sctx.limits);
956 if (BE (ret != REG_NOERROR, 0))
957 goto free_return;
958 if (sifted_states[0] != NULL || lim_states[0] != NULL)
959 break;
962 --match_last;
963 if (match_last < 0)
965 ret = REG_NOMATCH;
966 goto free_return;
968 } while (mctx->state_log[match_last] == NULL
969 || !mctx->state_log[match_last]->halt);
970 halt_node = check_halt_state_context (mctx,
971 mctx->state_log[match_last],
972 match_last);
974 ret = merge_state_array (dfa, sifted_states, lim_states,
975 match_last + 1);
976 re_free (lim_states);
977 lim_states = NULL;
978 if (BE (ret != REG_NOERROR, 0))
979 goto free_return;
981 else
983 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
984 ret = sift_states_backward (mctx, &sctx);
985 re_node_set_free (&sctx.limits);
986 if (BE (ret != REG_NOERROR, 0))
987 goto free_return;
989 re_free (mctx->state_log);
990 mctx->state_log = sifted_states;
991 sifted_states = NULL;
992 mctx->last_node = halt_node;
993 mctx->match_last = match_last;
994 ret = REG_NOERROR;
995 free_return:
996 re_free (sifted_states);
997 re_free (lim_states);
998 return ret;
1001 /* Acquire an initial state and return it.
1002 We must select appropriate initial state depending on the context,
1003 since initial states may have constraints like "\<", "^", etc.. */
1005 static inline re_dfastate_t *
1006 acquire_init_state_context (err, mctx, idx)
1007 reg_errcode_t *err;
1008 const re_match_context_t *mctx;
1009 int idx;
1011 re_dfa_t *const dfa = mctx->dfa;
1012 if (dfa->init_state->has_constraint)
1014 unsigned int context;
1015 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1016 if (IS_WORD_CONTEXT (context))
1017 return dfa->init_state_word;
1018 else if (IS_ORDINARY_CONTEXT (context))
1019 return dfa->init_state;
1020 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1021 return dfa->init_state_begbuf;
1022 else if (IS_NEWLINE_CONTEXT (context))
1023 return dfa->init_state_nl;
1024 else if (IS_BEGBUF_CONTEXT (context))
1026 /* It is relatively rare case, then calculate on demand. */
1027 return re_acquire_state_context (err, dfa,
1028 dfa->init_state->entrance_nodes,
1029 context);
1031 else
1032 /* Must not happen? */
1033 return dfa->init_state;
1035 else
1036 return dfa->init_state;
1039 /* Check whether the regular expression match input string INPUT or not,
1040 and return the index where the matching end, return -1 if not match,
1041 or return -2 in case of an error.
1042 FL_LONGEST_MATCH means we want the POSIX longest matching.
1043 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1044 next place where we may want to try matching.
1045 Note that the matcher assume that the maching starts from the current
1046 index of the buffer. */
1048 static int
1049 check_matching (mctx, fl_longest_match, p_match_first)
1050 re_match_context_t *mctx;
1051 int fl_longest_match;
1052 int *p_match_first;
1054 re_dfa_t *const dfa = mctx->dfa;
1055 reg_errcode_t err;
1056 int match = 0;
1057 int match_last = -1;
1058 int cur_str_idx = re_string_cur_idx (&mctx->input);
1059 re_dfastate_t *cur_state;
1060 int at_init_state = p_match_first != NULL;
1061 int next_start_idx = cur_str_idx;
1063 err = REG_NOERROR;
1064 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1065 /* An initial state must not be NULL (invalid). */
1066 if (BE (cur_state == NULL, 0))
1068 assert (err == REG_ESPACE);
1069 return -2;
1072 if (mctx->state_log != NULL)
1074 mctx->state_log[cur_str_idx] = cur_state;
1076 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1077 later. E.g. Processing back references. */
1078 if (BE (dfa->nbackref, 0))
1080 at_init_state = 0;
1081 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1082 if (BE (err != REG_NOERROR, 0))
1083 return err;
1085 if (cur_state->has_backref)
1087 err = transit_state_bkref (mctx, &cur_state->nodes);
1088 if (BE (err != REG_NOERROR, 0))
1089 return err;
1094 /* If the RE accepts NULL string. */
1095 if (BE (cur_state->halt, 0))
1097 if (!cur_state->has_constraint
1098 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1100 if (!fl_longest_match)
1101 return cur_str_idx;
1102 else
1104 match_last = cur_str_idx;
1105 match = 1;
1110 while (!re_string_eoi (&mctx->input))
1112 re_dfastate_t *old_state = cur_state;
1113 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1115 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1116 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1117 && mctx->input.valid_len < mctx->input.len))
1119 err = extend_buffers (mctx);
1120 if (BE (err != REG_NOERROR, 0))
1122 assert (err == REG_ESPACE);
1123 return -2;
1127 cur_state = transit_state (&err, mctx, cur_state);
1128 if (mctx->state_log != NULL)
1129 cur_state = merge_state_with_log (&err, mctx, cur_state);
1131 if (cur_state == NULL)
1133 /* Reached the invalid state or an error. Try to recover a valid
1134 state using the state log, if available and if we have not
1135 already found a valid (even if not the longest) match. */
1136 if (BE (err != REG_NOERROR, 0))
1137 return -2;
1139 if (mctx->state_log == NULL
1140 || (match && !fl_longest_match)
1141 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1142 break;
1145 if (BE (at_init_state, 0))
1147 if (old_state == cur_state)
1148 next_start_idx = next_char_idx;
1149 else
1150 at_init_state = 0;
1153 if (cur_state->halt)
1155 /* Reached a halt state.
1156 Check the halt state can satisfy the current context. */
1157 if (!cur_state->has_constraint
1158 || check_halt_state_context (mctx, cur_state,
1159 re_string_cur_idx (&mctx->input)))
1161 /* We found an appropriate halt state. */
1162 match_last = re_string_cur_idx (&mctx->input);
1163 match = 1;
1165 /* We found a match, do not modify match_first below. */
1166 p_match_first = NULL;
1167 if (!fl_longest_match)
1168 break;
1173 if (p_match_first)
1174 *p_match_first += next_start_idx;
1176 return match_last;
1179 /* Check NODE match the current context. */
1181 static int check_halt_node_context (dfa, node, context)
1182 const re_dfa_t *dfa;
1183 int node;
1184 unsigned int context;
1186 re_token_type_t type = dfa->nodes[node].type;
1187 unsigned int constraint = dfa->nodes[node].constraint;
1188 if (type != END_OF_RE)
1189 return 0;
1190 if (!constraint)
1191 return 1;
1192 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1193 return 0;
1194 return 1;
1197 /* Check the halt state STATE match the current context.
1198 Return 0 if not match, if the node, STATE has, is a halt node and
1199 match the context, return the node. */
1201 static int
1202 check_halt_state_context (mctx, state, idx)
1203 const re_match_context_t *mctx;
1204 const re_dfastate_t *state;
1205 int idx;
1207 int i;
1208 unsigned int context;
1209 #ifdef DEBUG
1210 assert (state->halt);
1211 #endif
1212 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1213 for (i = 0; i < state->nodes.nelem; ++i)
1214 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1215 return state->nodes.elems[i];
1216 return 0;
1219 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1220 corresponding to the DFA).
1221 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1222 of errors. */
1224 static int
1225 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1226 const re_match_context_t *mctx;
1227 regmatch_t *regs;
1228 int nregs, *pidx, node;
1229 re_node_set *eps_via_nodes;
1230 struct re_fail_stack_t *fs;
1232 re_dfa_t *const dfa = mctx->dfa;
1233 int i, err, dest_node;
1234 dest_node = -1;
1235 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1237 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1238 re_node_set *edests = &dfa->edests[node];
1239 int dest_node;
1240 err = re_node_set_insert (eps_via_nodes, node);
1241 if (BE (err < 0, 0))
1242 return -2;
1243 /* Pick up a valid destination, or return -1 if none is found. */
1244 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1246 int candidate = edests->elems[i];
1247 if (!re_node_set_contains (cur_nodes, candidate))
1248 continue;
1249 if (dest_node == -1)
1250 dest_node = candidate;
1252 else
1254 /* In order to avoid infinite loop like "(a*)*", return the second
1255 epsilon-transition if the first was already considered. */
1256 if (re_node_set_contains (eps_via_nodes, dest_node))
1257 return candidate;
1259 /* Otherwise, push the second epsilon-transition on the fail stack. */
1260 else if (fs != NULL
1261 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 eps_via_nodes))
1263 return -2;
1265 /* We know we are going to exit. */
1266 break;
1269 return dest_node;
1271 else
1273 int naccepted = 0;
1274 re_token_type_t type = dfa->nodes[node].type;
1276 #ifdef RE_ENABLE_I18N
1277 if (ACCEPT_MB_NODE (type))
1278 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279 else
1280 #endif /* RE_ENABLE_I18N */
1281 if (type == OP_BACK_REF)
1283 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1284 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 if (fs != NULL)
1287 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 return -1;
1289 else if (naccepted)
1291 char *buf = (char *) re_string_get_buffer (&mctx->input);
1292 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293 naccepted) != 0)
1294 return -1;
1298 if (naccepted == 0)
1300 err = re_node_set_insert (eps_via_nodes, node);
1301 if (BE (err < 0, 0))
1302 return -2;
1303 dest_node = dfa->edests[node].elems[0];
1304 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1305 dest_node))
1306 return dest_node;
1310 if (naccepted != 0
1311 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1313 dest_node = dfa->nexts[node];
1314 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1315 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1316 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1317 dest_node)))
1318 return -1;
1319 re_node_set_empty (eps_via_nodes);
1320 return dest_node;
1323 return -1;
1326 static reg_errcode_t
1327 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1328 struct re_fail_stack_t *fs;
1329 int str_idx, dest_node, nregs;
1330 regmatch_t *regs;
1331 re_node_set *eps_via_nodes;
1333 reg_errcode_t err;
1334 int num = fs->num++;
1335 if (fs->num == fs->alloc)
1337 struct re_fail_stack_ent_t *new_array;
1338 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1339 * fs->alloc * 2));
1340 if (new_array == NULL)
1341 return REG_ESPACE;
1342 fs->alloc *= 2;
1343 fs->stack = new_array;
1345 fs->stack[num].idx = str_idx;
1346 fs->stack[num].node = dest_node;
1347 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1348 if (fs->stack[num].regs == NULL)
1349 return REG_ESPACE;
1350 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1351 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1352 return err;
1355 static int
1356 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1357 struct re_fail_stack_t *fs;
1358 int *pidx, nregs;
1359 regmatch_t *regs;
1360 re_node_set *eps_via_nodes;
1362 int num = --fs->num;
1363 assert (num >= 0);
1364 *pidx = fs->stack[num].idx;
1365 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1366 re_node_set_free (eps_via_nodes);
1367 re_free (fs->stack[num].regs);
1368 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1369 return fs->stack[num].node;
1372 /* Set the positions where the subexpressions are starts/ends to registers
1373 PMATCH.
1374 Note: We assume that pmatch[0] is already set, and
1375 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1377 static reg_errcode_t
1378 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1379 const regex_t *preg;
1380 const re_match_context_t *mctx;
1381 size_t nmatch;
1382 regmatch_t *pmatch;
1383 int fl_backtrack;
1385 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1386 int idx, cur_node, real_nmatch;
1387 re_node_set eps_via_nodes;
1388 struct re_fail_stack_t *fs;
1389 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1390 regmatch_t *prev_idx_match;
1392 #ifdef DEBUG
1393 assert (nmatch > 1);
1394 assert (mctx->state_log != NULL);
1395 #endif
1396 if (fl_backtrack)
1398 fs = &fs_body;
1399 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1400 if (fs->stack == NULL)
1401 return REG_ESPACE;
1403 else
1404 fs = NULL;
1406 cur_node = dfa->init_node;
1407 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1408 re_node_set_init_empty (&eps_via_nodes);
1410 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1411 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1413 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1415 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1417 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1419 int reg_idx;
1420 if (fs)
1422 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1423 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1424 break;
1425 if (reg_idx == nmatch)
1427 re_node_set_free (&eps_via_nodes);
1428 return free_fail_stack_return (fs);
1430 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1431 &eps_via_nodes);
1433 else
1435 re_node_set_free (&eps_via_nodes);
1436 return REG_NOERROR;
1440 /* Proceed to next node. */
1441 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1442 &eps_via_nodes, fs);
1444 if (BE (cur_node < 0, 0))
1446 if (BE (cur_node == -2, 0))
1448 re_node_set_free (&eps_via_nodes);
1449 free_fail_stack_return (fs);
1450 return REG_ESPACE;
1452 if (fs)
1453 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 &eps_via_nodes);
1455 else
1457 re_node_set_free (&eps_via_nodes);
1458 return REG_NOMATCH;
1462 re_node_set_free (&eps_via_nodes);
1463 return free_fail_stack_return (fs);
1466 static reg_errcode_t
1467 free_fail_stack_return (fs)
1468 struct re_fail_stack_t *fs;
1470 if (fs)
1472 int fs_idx;
1473 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1475 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1476 re_free (fs->stack[fs_idx].regs);
1478 re_free (fs->stack);
1480 return REG_NOERROR;
1483 static void
1484 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1485 re_dfa_t *dfa;
1486 regmatch_t *pmatch, *prev_idx_match;
1487 int cur_node, cur_idx, nmatch;
1489 int type = dfa->nodes[cur_node].type;
1490 if (type == OP_OPEN_SUBEXP)
1492 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1494 /* We are at the first node of this sub expression. */
1495 if (reg_num < nmatch)
1497 pmatch[reg_num].rm_so = cur_idx;
1498 pmatch[reg_num].rm_eo = -1;
1501 else if (type == OP_CLOSE_SUBEXP)
1503 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504 if (reg_num < nmatch)
1506 /* We are at the last node of this sub expression. */
1507 if (pmatch[reg_num].rm_so < cur_idx)
1509 pmatch[reg_num].rm_eo = cur_idx;
1510 /* This is a non-empty match or we are not inside an optional
1511 subexpression. Accept this right away. */
1512 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1514 else
1516 if (dfa->nodes[cur_node].opt_subexp
1517 && prev_idx_match[reg_num].rm_so != -1)
1518 /* We transited through an empty match for an optional
1519 subexpression, like (a?)*, and this is not the subexp's
1520 first match. Copy back the old content of the registers
1521 so that matches of an inner subexpression are undone as
1522 well, like in ((a?))*. */
1523 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1524 else
1525 /* We completed a subexpression, but it may be part of
1526 an optional one, so do not update PREV_IDX_MATCH. */
1527 pmatch[reg_num].rm_eo = cur_idx;
1533 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1534 and sift the nodes in each states according to the following rules.
1535 Updated state_log will be wrote to STATE_LOG.
1537 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1538 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1539 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1540 the LAST_NODE, we throw away the node `a'.
1541 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1542 string `s' and transit to `b':
1543 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1544 away the node `a'.
1545 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546 thrown away, we throw away the node `a'.
1547 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1548 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1549 node `a'.
1550 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551 we throw away the node `a'. */
1553 #define STATE_NODE_CONTAINS(state,node) \
1554 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1556 static reg_errcode_t
1557 sift_states_backward (mctx, sctx)
1558 re_match_context_t *mctx;
1559 re_sift_context_t *sctx;
1561 reg_errcode_t err;
1562 int null_cnt = 0;
1563 int str_idx = sctx->last_str_idx;
1564 re_node_set cur_dest;
1566 #ifdef DEBUG
1567 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1568 #endif
1570 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1571 transit to the last_node and the last_node itself. */
1572 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1573 if (BE (err != REG_NOERROR, 0))
1574 return err;
1575 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1576 if (BE (err != REG_NOERROR, 0))
1577 goto free_return;
1579 /* Then check each states in the state_log. */
1580 while (str_idx > 0)
1582 /* Update counters. */
1583 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1584 if (null_cnt > mctx->max_mb_elem_len)
1586 memset (sctx->sifted_states, '\0',
1587 sizeof (re_dfastate_t *) * str_idx);
1588 re_node_set_free (&cur_dest);
1589 return REG_NOERROR;
1591 re_node_set_empty (&cur_dest);
1592 --str_idx;
1594 if (mctx->state_log[str_idx])
1596 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1597 if (BE (err != REG_NOERROR, 0))
1598 goto free_return;
1601 /* Add all the nodes which satisfy the following conditions:
1602 - It can epsilon transit to a node in CUR_DEST.
1603 - It is in CUR_SRC.
1604 And update state_log. */
1605 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1609 err = REG_NOERROR;
1610 free_return:
1611 re_node_set_free (&cur_dest);
1612 return err;
1615 static reg_errcode_t
1616 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1617 re_match_context_t *mctx;
1618 re_sift_context_t *sctx;
1619 int str_idx;
1620 re_node_set *cur_dest;
1622 re_dfa_t *const dfa = mctx->dfa;
1623 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1624 int i;
1626 /* Then build the next sifted state.
1627 We build the next sifted state on `cur_dest', and update
1628 `sifted_states[str_idx]' with `cur_dest'.
1629 Note:
1630 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1631 `cur_src' points the node_set of the old `state_log[str_idx]'
1632 (with the epsilon nodes pre-filtered out). */
1633 for (i = 0; i < cur_src->nelem; i++)
1635 int prev_node = cur_src->elems[i];
1636 int naccepted = 0;
1637 int ret;
1639 #if defined DEBUG || defined RE_ENABLE_I18N
1640 re_token_type_t type = dfa->nodes[prev_node].type;
1641 #endif
1642 #ifdef DEBUG
1643 assert (!IS_EPSILON_NODE (type));
1644 #endif
1645 #ifdef RE_ENABLE_I18N
1646 /* If the node may accept `multi byte'. */
1647 if (ACCEPT_MB_NODE (type))
1648 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1649 str_idx, sctx->last_str_idx);
1650 #endif /* RE_ENABLE_I18N */
1652 /* We don't check backreferences here.
1653 See update_cur_sifted_state(). */
1654 if (!naccepted
1655 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1656 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1657 dfa->nexts[prev_node]))
1658 naccepted = 1;
1660 if (naccepted == 0)
1661 continue;
1663 if (sctx->limits.nelem)
1665 int to_idx = str_idx + naccepted;
1666 if (check_dst_limits (mctx, &sctx->limits,
1667 dfa->nexts[prev_node], to_idx,
1668 prev_node, str_idx))
1669 continue;
1671 ret = re_node_set_insert (cur_dest, prev_node);
1672 if (BE (ret == -1, 0))
1673 return REG_ESPACE;
1676 return REG_NOERROR;
1679 /* Helper functions. */
1681 static reg_errcode_t
1682 clean_state_log_if_needed (mctx, next_state_log_idx)
1683 re_match_context_t *mctx;
1684 int next_state_log_idx;
1686 int top = mctx->state_log_top;
1688 if (next_state_log_idx >= mctx->input.bufs_len
1689 || (next_state_log_idx >= mctx->input.valid_len
1690 && mctx->input.valid_len < mctx->input.len))
1692 reg_errcode_t err;
1693 err = extend_buffers (mctx);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1698 if (top < next_state_log_idx)
1700 memset (mctx->state_log + top + 1, '\0',
1701 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1702 mctx->state_log_top = next_state_log_idx;
1704 return REG_NOERROR;
1707 static reg_errcode_t
1708 merge_state_array (dfa, dst, src, num)
1709 re_dfa_t *dfa;
1710 re_dfastate_t **dst;
1711 re_dfastate_t **src;
1712 int num;
1714 int st_idx;
1715 reg_errcode_t err;
1716 for (st_idx = 0; st_idx < num; ++st_idx)
1718 if (dst[st_idx] == NULL)
1719 dst[st_idx] = src[st_idx];
1720 else if (src[st_idx] != NULL)
1722 re_node_set merged_set;
1723 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1724 &src[st_idx]->nodes);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
1727 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1728 re_node_set_free (&merged_set);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1733 return REG_NOERROR;
1736 static reg_errcode_t
1737 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1738 re_match_context_t *mctx;
1739 re_sift_context_t *sctx;
1740 int str_idx;
1741 re_node_set *dest_nodes;
1743 re_dfa_t *const dfa = mctx->dfa;
1744 reg_errcode_t err;
1745 const re_node_set *candidates;
1746 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1747 : &mctx->state_log[str_idx]->nodes);
1749 if (dest_nodes->nelem == 0)
1750 sctx->sifted_states[str_idx] = NULL;
1751 else
1753 if (candidates)
1755 /* At first, add the nodes which can epsilon transit to a node in
1756 DEST_NODE. */
1757 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1758 if (BE (err != REG_NOERROR, 0))
1759 return err;
1761 /* Then, check the limitations in the current sift_context. */
1762 if (sctx->limits.nelem)
1764 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1765 mctx->bkref_ents, str_idx);
1766 if (BE (err != REG_NOERROR, 0))
1767 return err;
1771 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1772 if (BE (err != REG_NOERROR, 0))
1773 return err;
1776 if (candidates && mctx->state_log[str_idx]->has_backref)
1778 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1782 return REG_NOERROR;
1785 static reg_errcode_t
1786 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1787 re_dfa_t *dfa;
1788 re_node_set *dest_nodes;
1789 const re_node_set *candidates;
1791 reg_errcode_t err = REG_NOERROR;
1792 int i;
1794 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1798 if (!state->inveclosure.alloc)
1800 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1801 if (BE (err != REG_NOERROR, 0))
1802 return REG_ESPACE;
1803 for (i = 0; i < dest_nodes->nelem; i++)
1804 re_node_set_merge (&state->inveclosure,
1805 dfa->inveclosures + dest_nodes->elems[i]);
1807 return re_node_set_add_intersect (dest_nodes, candidates,
1808 &state->inveclosure);
1811 static reg_errcode_t
1812 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1813 re_dfa_t *dfa;
1814 int node;
1815 re_node_set *dest_nodes;
1816 const re_node_set *candidates;
1818 int ecl_idx;
1819 reg_errcode_t err;
1820 re_node_set *inv_eclosure = dfa->inveclosures + node;
1821 re_node_set except_nodes;
1822 re_node_set_init_empty (&except_nodes);
1823 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1825 int cur_node = inv_eclosure->elems[ecl_idx];
1826 if (cur_node == node)
1827 continue;
1828 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1830 int edst1 = dfa->edests[cur_node].elems[0];
1831 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1832 ? dfa->edests[cur_node].elems[1] : -1);
1833 if ((!re_node_set_contains (inv_eclosure, edst1)
1834 && re_node_set_contains (dest_nodes, edst1))
1835 || (edst2 > 0
1836 && !re_node_set_contains (inv_eclosure, edst2)
1837 && re_node_set_contains (dest_nodes, edst2)))
1839 err = re_node_set_add_intersect (&except_nodes, candidates,
1840 dfa->inveclosures + cur_node);
1841 if (BE (err != REG_NOERROR, 0))
1843 re_node_set_free (&except_nodes);
1844 return err;
1849 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1851 int cur_node = inv_eclosure->elems[ecl_idx];
1852 if (!re_node_set_contains (&except_nodes, cur_node))
1854 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1855 re_node_set_remove_at (dest_nodes, idx);
1858 re_node_set_free (&except_nodes);
1859 return REG_NOERROR;
1862 static int
1863 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1864 re_match_context_t *mctx;
1865 re_node_set *limits;
1866 int dst_node, dst_idx, src_node, src_idx;
1868 re_dfa_t *const dfa = mctx->dfa;
1869 int lim_idx, src_pos, dst_pos;
1871 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1872 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1873 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1875 int subexp_idx;
1876 struct re_backref_cache_entry *ent;
1877 ent = mctx->bkref_ents + limits->elems[lim_idx];
1878 subexp_idx = dfa->nodes[ent->node].opr.idx;
1880 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 subexp_idx, dst_node, dst_idx,
1882 dst_bkref_idx);
1883 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1884 subexp_idx, src_node, src_idx,
1885 src_bkref_idx);
1887 /* In case of:
1888 <src> <dst> ( <subexp> )
1889 ( <subexp> ) <src> <dst>
1890 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1891 if (src_pos == dst_pos)
1892 continue; /* This is unrelated limitation. */
1893 else
1894 return 1;
1896 return 0;
1899 static int
1900 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1901 re_match_context_t *mctx;
1902 int boundaries, subexp_idx, from_node, bkref_idx;
1904 re_dfa_t *const dfa = mctx->dfa;
1905 re_node_set *eclosures = dfa->eclosures + from_node;
1906 int node_idx;
1908 /* Else, we are on the boundary: examine the nodes on the epsilon
1909 closure. */
1910 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1912 int node = eclosures->elems[node_idx];
1913 switch (dfa->nodes[node].type)
1915 case OP_BACK_REF:
1916 if (bkref_idx != -1)
1918 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1921 int dst, cpos;
1923 if (ent->node != node)
1924 continue;
1926 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1927 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1928 continue;
1930 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1931 OP_CLOSE_SUBEXP cases below. But, if the
1932 destination node is the same node as the source
1933 node, don't recurse because it would cause an
1934 infinite loop: a regex that exhibits this behavior
1935 is ()\1*\1* */
1936 dst = dfa->edests[node].elems[0];
1937 if (dst == from_node)
1939 if (boundaries & 1)
1940 return -1;
1941 else /* if (boundaries & 2) */
1942 return 0;
1945 cpos =
1946 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1947 dst, bkref_idx);
1948 if (cpos == -1 /* && (boundaries & 1) */)
1949 return -1;
1950 if (cpos == 0 && (boundaries & 2))
1951 return 0;
1953 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1955 while (ent++->more);
1957 break;
1959 case OP_OPEN_SUBEXP:
1960 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 return -1;
1962 break;
1964 case OP_CLOSE_SUBEXP:
1965 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 return 0;
1967 break;
1969 default:
1970 break;
1974 return (boundaries & 2) ? 1 : 0;
1977 static int
1978 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1979 re_match_context_t *mctx;
1980 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1982 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983 int boundaries;
1985 /* If we are outside the range of the subexpression, return -1 or 1. */
1986 if (str_idx < lim->subexp_from)
1987 return -1;
1989 if (lim->subexp_to < str_idx)
1990 return 1;
1992 /* If we are within the subexpression, return 0. */
1993 boundaries = (str_idx == lim->subexp_from);
1994 boundaries |= (str_idx == lim->subexp_to) << 1;
1995 if (boundaries == 0)
1996 return 0;
1998 /* Else, examine epsilon closure. */
1999 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 from_node, bkref_idx);
2003 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2004 which are against limitations from DEST_NODES. */
2006 static reg_errcode_t
2007 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2008 re_dfa_t *dfa;
2009 re_node_set *dest_nodes;
2010 const re_node_set *candidates;
2011 re_node_set *limits;
2012 struct re_backref_cache_entry *bkref_ents;
2013 int str_idx;
2015 reg_errcode_t err;
2016 int node_idx, lim_idx;
2018 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2020 int subexp_idx;
2021 struct re_backref_cache_entry *ent;
2022 ent = bkref_ents + limits->elems[lim_idx];
2024 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2025 continue; /* This is unrelated limitation. */
2027 subexp_idx = dfa->nodes[ent->node].opr.idx;
2028 if (ent->subexp_to == str_idx)
2030 int ops_node = -1;
2031 int cls_node = -1;
2032 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2034 int node = dest_nodes->elems[node_idx];
2035 re_token_type_t type = dfa->nodes[node].type;
2036 if (type == OP_OPEN_SUBEXP
2037 && subexp_idx == dfa->nodes[node].opr.idx)
2038 ops_node = node;
2039 else if (type == OP_CLOSE_SUBEXP
2040 && subexp_idx == dfa->nodes[node].opr.idx)
2041 cls_node = node;
2044 /* Check the limitation of the open subexpression. */
2045 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2046 if (ops_node >= 0)
2048 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2049 candidates);
2050 if (BE (err != REG_NOERROR, 0))
2051 return err;
2054 /* Check the limitation of the close subexpression. */
2055 if (cls_node >= 0)
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2058 int node = dest_nodes->elems[node_idx];
2059 if (!re_node_set_contains (dfa->inveclosures + node,
2060 cls_node)
2061 && !re_node_set_contains (dfa->eclosures + node,
2062 cls_node))
2064 /* It is against this limitation.
2065 Remove it form the current sifted state. */
2066 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 candidates);
2068 if (BE (err != REG_NOERROR, 0))
2069 return err;
2070 --node_idx;
2074 else /* (ent->subexp_to != str_idx) */
2076 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2078 int node = dest_nodes->elems[node_idx];
2079 re_token_type_t type = dfa->nodes[node].type;
2080 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2082 if (subexp_idx != dfa->nodes[node].opr.idx)
2083 continue;
2084 /* It is against this limitation.
2085 Remove it form the current sifted state. */
2086 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2087 candidates);
2088 if (BE (err != REG_NOERROR, 0))
2089 return err;
2094 return REG_NOERROR;
2097 static reg_errcode_t
2098 sift_states_bkref (mctx, sctx, str_idx, candidates)
2099 re_match_context_t *mctx;
2100 re_sift_context_t *sctx;
2101 int str_idx;
2102 const re_node_set *candidates;
2104 re_dfa_t *const dfa = mctx->dfa;
2105 reg_errcode_t err;
2106 int node_idx, node;
2107 re_sift_context_t local_sctx;
2108 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2110 if (first_idx == -1)
2111 return REG_NOERROR;
2113 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2115 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2117 int enabled_idx;
2118 re_token_type_t type;
2119 struct re_backref_cache_entry *entry;
2120 node = candidates->elems[node_idx];
2121 type = dfa->nodes[node].type;
2122 /* Avoid infinite loop for the REs like "()\1+". */
2123 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2124 continue;
2125 if (type != OP_BACK_REF)
2126 continue;
2128 entry = mctx->bkref_ents + first_idx;
2129 enabled_idx = first_idx;
2132 int subexp_len, to_idx, dst_node;
2133 re_dfastate_t *cur_state;
2135 if (entry->node != node)
2136 continue;
2137 subexp_len = entry->subexp_to - entry->subexp_from;
2138 to_idx = str_idx + subexp_len;
2139 dst_node = (subexp_len ? dfa->nexts[node]
2140 : dfa->edests[node].elems[0]);
2142 if (to_idx > sctx->last_str_idx
2143 || sctx->sifted_states[to_idx] == NULL
2144 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2145 || check_dst_limits (mctx, &sctx->limits, node,
2146 str_idx, dst_node, to_idx))
2147 continue;
2149 if (local_sctx.sifted_states == NULL)
2151 local_sctx = *sctx;
2152 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2153 if (BE (err != REG_NOERROR, 0))
2154 goto free_return;
2156 local_sctx.last_node = node;
2157 local_sctx.last_str_idx = str_idx;
2158 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2159 if (BE (err < 0, 0))
2161 err = REG_ESPACE;
2162 goto free_return;
2164 cur_state = local_sctx.sifted_states[str_idx];
2165 err = sift_states_backward (mctx, &local_sctx);
2166 if (BE (err != REG_NOERROR, 0))
2167 goto free_return;
2168 if (sctx->limited_states != NULL)
2170 err = merge_state_array (dfa, sctx->limited_states,
2171 local_sctx.sifted_states,
2172 str_idx + 1);
2173 if (BE (err != REG_NOERROR, 0))
2174 goto free_return;
2176 local_sctx.sifted_states[str_idx] = cur_state;
2177 re_node_set_remove (&local_sctx.limits, enabled_idx);
2179 /* mctx->bkref_ents may have changed, reload the pointer. */
2180 entry = mctx->bkref_ents + enabled_idx;
2182 while (enabled_idx++, entry++->more);
2184 err = REG_NOERROR;
2185 free_return:
2186 if (local_sctx.sifted_states != NULL)
2188 re_node_set_free (&local_sctx.limits);
2191 return err;
2195 #ifdef RE_ENABLE_I18N
2196 static int
2197 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2198 const re_match_context_t *mctx;
2199 re_sift_context_t *sctx;
2200 int node_idx, str_idx, max_str_idx;
2202 re_dfa_t *const dfa = mctx->dfa;
2203 int naccepted;
2204 /* Check the node can accept `multi byte'. */
2205 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2206 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2207 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2208 dfa->nexts[node_idx]))
2209 /* The node can't accept the `multi byte', or the
2210 destination was already thrown away, then the node
2211 could't accept the current input `multi byte'. */
2212 naccepted = 0;
2213 /* Otherwise, it is sure that the node could accept
2214 `naccepted' bytes input. */
2215 return naccepted;
2217 #endif /* RE_ENABLE_I18N */
2220 /* Functions for state transition. */
2222 /* Return the next state to which the current state STATE will transit by
2223 accepting the current input byte, and update STATE_LOG if necessary.
2224 If STATE can accept a multibyte char/collating element/back reference
2225 update the destination of STATE_LOG. */
2227 static re_dfastate_t *
2228 transit_state (err, mctx, state)
2229 reg_errcode_t *err;
2230 re_match_context_t *mctx;
2231 re_dfastate_t *state;
2233 re_dfa_t *const dfa = mctx->dfa;
2234 re_dfastate_t **trtable;
2235 unsigned char ch;
2237 #ifdef RE_ENABLE_I18N
2238 /* If the current state can accept multibyte. */
2239 if (BE (state->accept_mb, 0))
2241 *err = transit_state_mb (mctx, state);
2242 if (BE (*err != REG_NOERROR, 0))
2243 return NULL;
2245 #endif /* RE_ENABLE_I18N */
2247 /* Then decide the next state with the single byte. */
2248 if (1)
2250 /* Use transition table */
2251 ch = re_string_fetch_byte (&mctx->input);
2252 trtable = state->trtable;
2253 if (trtable == NULL)
2255 trtable = build_trtable (dfa, state);
2256 if (trtable == NULL)
2258 *err = REG_ESPACE;
2259 return NULL;
2262 if (BE (state->word_trtable, 0))
2264 unsigned int context;
2265 context
2266 = re_string_context_at (&mctx->input,
2267 re_string_cur_idx (&mctx->input) - 1,
2268 mctx->eflags);
2269 if (IS_WORD_CONTEXT (context))
2270 return trtable[ch + SBC_MAX];
2271 else
2272 return trtable[ch];
2274 else
2275 return trtable[ch];
2277 #if 0
2278 else
2279 /* don't use transition table */
2280 return transit_state_sb (err, mctx, state);
2281 #endif
2284 /* Update the state_log if we need */
2285 re_dfastate_t *
2286 merge_state_with_log (err, mctx, next_state)
2287 reg_errcode_t *err;
2288 re_match_context_t *mctx;
2289 re_dfastate_t *next_state;
2291 re_dfa_t *const dfa = mctx->dfa;
2292 int cur_idx = re_string_cur_idx (&mctx->input);
2294 if (cur_idx > mctx->state_log_top)
2296 mctx->state_log[cur_idx] = next_state;
2297 mctx->state_log_top = cur_idx;
2299 else if (mctx->state_log[cur_idx] == 0)
2301 mctx->state_log[cur_idx] = next_state;
2303 else
2305 re_dfastate_t *pstate;
2306 unsigned int context;
2307 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2308 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2309 the destination of a multibyte char/collating element/
2310 back reference. Then the next state is the union set of
2311 these destinations and the results of the transition table. */
2312 pstate = mctx->state_log[cur_idx];
2313 log_nodes = pstate->entrance_nodes;
2314 if (next_state != NULL)
2316 table_nodes = next_state->entrance_nodes;
2317 *err = re_node_set_init_union (&next_nodes, table_nodes,
2318 log_nodes);
2319 if (BE (*err != REG_NOERROR, 0))
2320 return NULL;
2322 else
2323 next_nodes = *log_nodes;
2324 /* Note: We already add the nodes of the initial state,
2325 then we don't need to add them here. */
2327 context = re_string_context_at (&mctx->input,
2328 re_string_cur_idx (&mctx->input) - 1,
2329 mctx->eflags);
2330 next_state = mctx->state_log[cur_idx]
2331 = re_acquire_state_context (err, dfa, &next_nodes, context);
2332 /* We don't need to check errors here, since the return value of
2333 this function is next_state and ERR is already set. */
2335 if (table_nodes != NULL)
2336 re_node_set_free (&next_nodes);
2339 if (BE (dfa->nbackref, 0) && next_state != NULL)
2341 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2342 later. We must check them here, since the back references in the
2343 next state might use them. */
2344 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2345 cur_idx);
2346 if (BE (*err != REG_NOERROR, 0))
2347 return NULL;
2349 /* If the next state has back references. */
2350 if (next_state->has_backref)
2352 *err = transit_state_bkref (mctx, &next_state->nodes);
2353 if (BE (*err != REG_NOERROR, 0))
2354 return NULL;
2355 next_state = mctx->state_log[cur_idx];
2359 return next_state;
2362 /* Skip bytes in the input that correspond to part of a
2363 multi-byte match, then look in the log for a state
2364 from which to restart matching. */
2365 re_dfastate_t *
2366 find_recover_state (err, mctx)
2367 reg_errcode_t *err;
2368 re_match_context_t *mctx;
2370 re_dfastate_t *cur_state = NULL;
2373 int max = mctx->state_log_top;
2374 int cur_str_idx = re_string_cur_idx (&mctx->input);
2378 if (++cur_str_idx > max)
2379 return NULL;
2380 re_string_skip_bytes (&mctx->input, 1);
2382 while (mctx->state_log[cur_str_idx] == NULL);
2384 cur_state = merge_state_with_log (err, mctx, NULL);
2386 while (err == REG_NOERROR && cur_state == NULL);
2387 return cur_state;
2390 /* Helper functions for transit_state. */
2392 /* From the node set CUR_NODES, pick up the nodes whose types are
2393 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2394 expression. And register them to use them later for evaluating the
2395 correspoding back references. */
2397 static reg_errcode_t
2398 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2399 re_match_context_t *mctx;
2400 re_node_set *cur_nodes;
2401 int str_idx;
2403 re_dfa_t *const dfa = mctx->dfa;
2404 int node_idx;
2405 reg_errcode_t err;
2407 /* TODO: This isn't efficient.
2408 Because there might be more than one nodes whose types are
2409 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2410 nodes.
2411 E.g. RE: (a){2} */
2412 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2414 int node = cur_nodes->elems[node_idx];
2415 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2416 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2417 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2419 err = match_ctx_add_subtop (mctx, node, str_idx);
2420 if (BE (err != REG_NOERROR, 0))
2421 return err;
2424 return REG_NOERROR;
2427 #if 0
2428 /* Return the next state to which the current state STATE will transit by
2429 accepting the current input byte. */
2431 static re_dfastate_t *
2432 transit_state_sb (err, mctx, state)
2433 reg_errcode_t *err;
2434 re_match_context_t *mctx;
2435 re_dfastate_t *state;
2437 re_dfa_t *const dfa = mctx->dfa;
2438 re_node_set next_nodes;
2439 re_dfastate_t *next_state;
2440 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2441 unsigned int context;
2443 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2444 if (BE (*err != REG_NOERROR, 0))
2445 return NULL;
2446 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2448 int cur_node = state->nodes.elems[node_cnt];
2449 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2451 *err = re_node_set_merge (&next_nodes,
2452 dfa->eclosures + dfa->nexts[cur_node]);
2453 if (BE (*err != REG_NOERROR, 0))
2455 re_node_set_free (&next_nodes);
2456 return NULL;
2460 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2461 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2462 /* We don't need to check errors here, since the return value of
2463 this function is next_state and ERR is already set. */
2465 re_node_set_free (&next_nodes);
2466 re_string_skip_bytes (&mctx->input, 1);
2467 return next_state;
2469 #endif
2471 #ifdef RE_ENABLE_I18N
2472 static reg_errcode_t
2473 transit_state_mb (mctx, pstate)
2474 re_match_context_t *mctx;
2475 re_dfastate_t *pstate;
2477 re_dfa_t *const dfa = mctx->dfa;
2478 reg_errcode_t err;
2479 int i;
2481 for (i = 0; i < pstate->nodes.nelem; ++i)
2483 re_node_set dest_nodes, *new_nodes;
2484 int cur_node_idx = pstate->nodes.elems[i];
2485 int naccepted = 0, dest_idx;
2486 unsigned int context;
2487 re_dfastate_t *dest_state;
2489 if (dfa->nodes[cur_node_idx].constraint)
2491 context = re_string_context_at (&mctx->input,
2492 re_string_cur_idx (&mctx->input),
2493 mctx->eflags);
2494 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2495 context))
2496 continue;
2499 /* How many bytes the node can accept? */
2500 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2501 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2502 re_string_cur_idx (&mctx->input));
2503 if (naccepted == 0)
2504 continue;
2506 /* The node can accepts `naccepted' bytes. */
2507 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2508 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2509 : mctx->max_mb_elem_len);
2510 err = clean_state_log_if_needed (mctx, dest_idx);
2511 if (BE (err != REG_NOERROR, 0))
2512 return err;
2513 #ifdef DEBUG
2514 assert (dfa->nexts[cur_node_idx] != -1);
2515 #endif
2516 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2517 then we use pstate->nodes.elems[i] instead. */
2518 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2520 dest_state = mctx->state_log[dest_idx];
2521 if (dest_state == NULL)
2522 dest_nodes = *new_nodes;
2523 else
2525 err = re_node_set_init_union (&dest_nodes,
2526 dest_state->entrance_nodes, new_nodes);
2527 if (BE (err != REG_NOERROR, 0))
2528 return err;
2530 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2531 mctx->state_log[dest_idx]
2532 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2533 if (dest_state != NULL)
2534 re_node_set_free (&dest_nodes);
2535 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2536 return err;
2538 return REG_NOERROR;
2540 #endif /* RE_ENABLE_I18N */
2542 static reg_errcode_t
2543 transit_state_bkref (mctx, nodes)
2544 re_match_context_t *mctx;
2545 const re_node_set *nodes;
2547 re_dfa_t *const dfa = mctx->dfa;
2548 reg_errcode_t err;
2549 int i;
2550 int cur_str_idx = re_string_cur_idx (&mctx->input);
2552 for (i = 0; i < nodes->nelem; ++i)
2554 int dest_str_idx, prev_nelem, bkc_idx;
2555 int node_idx = nodes->elems[i];
2556 unsigned int context;
2557 const re_token_t *node = dfa->nodes + node_idx;
2558 re_node_set *new_dest_nodes;
2560 /* Check whether `node' is a backreference or not. */
2561 if (node->type != OP_BACK_REF)
2562 continue;
2564 if (node->constraint)
2566 context = re_string_context_at (&mctx->input, cur_str_idx,
2567 mctx->eflags);
2568 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2569 continue;
2572 /* `node' is a backreference.
2573 Check the substring which the substring matched. */
2574 bkc_idx = mctx->nbkref_ents;
2575 err = get_subexp (mctx, node_idx, cur_str_idx);
2576 if (BE (err != REG_NOERROR, 0))
2577 goto free_return;
2579 /* And add the epsilon closures (which is `new_dest_nodes') of
2580 the backreference to appropriate state_log. */
2581 #ifdef DEBUG
2582 assert (dfa->nexts[node_idx] != -1);
2583 #endif
2584 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2586 int subexp_len;
2587 re_dfastate_t *dest_state;
2588 struct re_backref_cache_entry *bkref_ent;
2589 bkref_ent = mctx->bkref_ents + bkc_idx;
2590 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2591 continue;
2592 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2593 new_dest_nodes = (subexp_len == 0
2594 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2595 : dfa->eclosures + dfa->nexts[node_idx]);
2596 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2597 - bkref_ent->subexp_from);
2598 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2599 mctx->eflags);
2600 dest_state = mctx->state_log[dest_str_idx];
2601 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2602 : mctx->state_log[cur_str_idx]->nodes.nelem);
2603 /* Add `new_dest_node' to state_log. */
2604 if (dest_state == NULL)
2606 mctx->state_log[dest_str_idx]
2607 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2608 context);
2609 if (BE (mctx->state_log[dest_str_idx] == NULL
2610 && err != REG_NOERROR, 0))
2611 goto free_return;
2613 else
2615 re_node_set dest_nodes;
2616 err = re_node_set_init_union (&dest_nodes,
2617 dest_state->entrance_nodes,
2618 new_dest_nodes);
2619 if (BE (err != REG_NOERROR, 0))
2621 re_node_set_free (&dest_nodes);
2622 goto free_return;
2624 mctx->state_log[dest_str_idx]
2625 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2626 re_node_set_free (&dest_nodes);
2627 if (BE (mctx->state_log[dest_str_idx] == NULL
2628 && err != REG_NOERROR, 0))
2629 goto free_return;
2631 /* We need to check recursively if the backreference can epsilon
2632 transit. */
2633 if (subexp_len == 0
2634 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2636 err = check_subexp_matching_top (mctx, new_dest_nodes,
2637 cur_str_idx);
2638 if (BE (err != REG_NOERROR, 0))
2639 goto free_return;
2640 err = transit_state_bkref (mctx, new_dest_nodes);
2641 if (BE (err != REG_NOERROR, 0))
2642 goto free_return;
2646 err = REG_NOERROR;
2647 free_return:
2648 return err;
2651 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2652 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2653 Note that we might collect inappropriate candidates here.
2654 However, the cost of checking them strictly here is too high, then we
2655 delay these checking for prune_impossible_nodes(). */
2657 static reg_errcode_t
2658 get_subexp (mctx, bkref_node, bkref_str_idx)
2659 re_match_context_t *mctx;
2660 int bkref_node, bkref_str_idx;
2662 re_dfa_t *const dfa = mctx->dfa;
2663 int subexp_num, sub_top_idx;
2664 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2665 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2666 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2667 if (cache_idx != -1)
2669 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2671 if (entry->node == bkref_node)
2672 return REG_NOERROR; /* We already checked it. */
2673 while (entry++->more);
2676 subexp_num = dfa->nodes[bkref_node].opr.idx;
2678 /* For each sub expression */
2679 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2681 reg_errcode_t err;
2682 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2683 re_sub_match_last_t *sub_last;
2684 int sub_last_idx, sl_str, bkref_str_off;
2686 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2687 continue; /* It isn't related. */
2689 sl_str = sub_top->str_idx;
2690 bkref_str_off = bkref_str_idx;
2691 /* At first, check the last node of sub expressions we already
2692 evaluated. */
2693 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2695 int sl_str_diff;
2696 sub_last = sub_top->lasts[sub_last_idx];
2697 sl_str_diff = sub_last->str_idx - sl_str;
2698 /* The matched string by the sub expression match with the substring
2699 at the back reference? */
2700 if (sl_str_diff > 0)
2702 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2704 /* Not enough chars for a successful match. */
2705 if (bkref_str_off + sl_str_diff > mctx->input.len)
2706 break;
2708 err = clean_state_log_if_needed (mctx,
2709 bkref_str_off
2710 + sl_str_diff);
2711 if (BE (err != REG_NOERROR, 0))
2712 return err;
2713 buf = (const char *) re_string_get_buffer (&mctx->input);
2715 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2716 break; /* We don't need to search this sub expression any more. */
2718 bkref_str_off += sl_str_diff;
2719 sl_str += sl_str_diff;
2720 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2721 bkref_str_idx);
2723 /* Reload buf, since the preceding call might have reallocated
2724 the buffer. */
2725 buf = (const char *) re_string_get_buffer (&mctx->input);
2727 if (err == REG_NOMATCH)
2728 continue;
2729 if (BE (err != REG_NOERROR, 0))
2730 return err;
2733 if (sub_last_idx < sub_top->nlasts)
2734 continue;
2735 if (sub_last_idx > 0)
2736 ++sl_str;
2737 /* Then, search for the other last nodes of the sub expression. */
2738 for (; sl_str <= bkref_str_idx; ++sl_str)
2740 int cls_node, sl_str_off;
2741 const re_node_set *nodes;
2742 sl_str_off = sl_str - sub_top->str_idx;
2743 /* The matched string by the sub expression match with the substring
2744 at the back reference? */
2745 if (sl_str_off > 0)
2747 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2749 /* If we are at the end of the input, we cannot match. */
2750 if (bkref_str_off >= mctx->input.len)
2751 break;
2753 err = extend_buffers (mctx);
2754 if (BE (err != REG_NOERROR, 0))
2755 return err;
2757 buf = (const char *) re_string_get_buffer (&mctx->input);
2759 if (buf [bkref_str_off++] != buf[sl_str - 1])
2760 break; /* We don't need to search this sub expression
2761 any more. */
2763 if (mctx->state_log[sl_str] == NULL)
2764 continue;
2765 /* Does this state have a ')' of the sub expression? */
2766 nodes = &mctx->state_log[sl_str]->nodes;
2767 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2768 if (cls_node == -1)
2769 continue; /* No. */
2770 if (sub_top->path == NULL)
2772 sub_top->path = calloc (sizeof (state_array_t),
2773 sl_str - sub_top->str_idx + 1);
2774 if (sub_top->path == NULL)
2775 return REG_ESPACE;
2777 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2778 in the current context? */
2779 err = check_arrival (mctx, sub_top->path, sub_top->node,
2780 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2781 if (err == REG_NOMATCH)
2782 continue;
2783 if (BE (err != REG_NOERROR, 0))
2784 return err;
2785 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2786 if (BE (sub_last == NULL, 0))
2787 return REG_ESPACE;
2788 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2789 bkref_str_idx);
2790 if (err == REG_NOMATCH)
2791 continue;
2794 return REG_NOERROR;
2797 /* Helper functions for get_subexp(). */
2799 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2800 If it can arrive, register the sub expression expressed with SUB_TOP
2801 and SUB_LAST. */
2803 static reg_errcode_t
2804 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2805 re_match_context_t *mctx;
2806 const re_sub_match_top_t *sub_top;
2807 re_sub_match_last_t *sub_last;
2808 int bkref_node, bkref_str;
2810 reg_errcode_t err;
2811 int to_idx;
2812 /* Can the subexpression arrive the back reference? */
2813 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2814 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2815 if (err != REG_NOERROR)
2816 return err;
2817 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2818 sub_last->str_idx);
2819 if (BE (err != REG_NOERROR, 0))
2820 return err;
2821 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2822 return clean_state_log_if_needed (mctx, to_idx);
2825 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2826 Search '(' if FL_OPEN, or search ')' otherwise.
2827 TODO: This function isn't efficient...
2828 Because there might be more than one nodes whose types are
2829 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2830 nodes.
2831 E.g. RE: (a){2} */
2833 static int
2834 find_subexp_node (dfa, nodes, subexp_idx, type)
2835 const re_dfa_t *dfa;
2836 const re_node_set *nodes;
2837 int subexp_idx, type;
2839 int cls_idx;
2840 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2842 int cls_node = nodes->elems[cls_idx];
2843 const re_token_t *node = dfa->nodes + cls_node;
2844 if (node->type == type
2845 && node->opr.idx == subexp_idx)
2846 return cls_node;
2848 return -1;
2851 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2852 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2853 heavily reused.
2854 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2856 static reg_errcode_t
2857 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2858 type)
2859 re_match_context_t *mctx;
2860 state_array_t *path;
2861 int top_node, top_str, last_node, last_str, type;
2863 re_dfa_t *const dfa = mctx->dfa;
2864 reg_errcode_t err;
2865 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2866 re_dfastate_t *cur_state = NULL;
2867 re_node_set *cur_nodes, next_nodes;
2868 re_dfastate_t **backup_state_log;
2869 unsigned int context;
2871 subexp_num = dfa->nodes[top_node].opr.idx;
2872 /* Extend the buffer if we need. */
2873 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2875 re_dfastate_t **new_array;
2876 int old_alloc = path->alloc;
2877 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2878 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2879 if (new_array == NULL)
2881 path->alloc = old_alloc;
2882 return REG_ESPACE;
2884 path->array = new_array;
2885 memset (new_array + old_alloc, '\0',
2886 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2889 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2891 /* Temporary modify MCTX. */
2892 backup_state_log = mctx->state_log;
2893 backup_cur_idx = mctx->input.cur_idx;
2894 mctx->state_log = path->array;
2895 mctx->input.cur_idx = str_idx;
2897 /* Setup initial node set. */
2898 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2899 if (str_idx == top_str)
2901 err = re_node_set_init_1 (&next_nodes, top_node);
2902 if (BE (err != REG_NOERROR, 0))
2903 return err;
2904 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2905 if (BE (err != REG_NOERROR, 0))
2907 re_node_set_free (&next_nodes);
2908 return err;
2911 else
2913 cur_state = mctx->state_log[str_idx];
2914 if (cur_state && cur_state->has_backref)
2916 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2917 if (BE ( err != REG_NOERROR, 0))
2918 return err;
2920 else
2921 re_node_set_init_empty (&next_nodes);
2923 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2925 if (next_nodes.nelem)
2927 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2928 subexp_num, type);
2929 if (BE ( err != REG_NOERROR, 0))
2931 re_node_set_free (&next_nodes);
2932 return err;
2935 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2936 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2938 re_node_set_free (&next_nodes);
2939 return err;
2941 mctx->state_log[str_idx] = cur_state;
2944 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2946 re_node_set_empty (&next_nodes);
2947 if (mctx->state_log[str_idx + 1])
2949 err = re_node_set_merge (&next_nodes,
2950 &mctx->state_log[str_idx + 1]->nodes);
2951 if (BE (err != REG_NOERROR, 0))
2953 re_node_set_free (&next_nodes);
2954 return err;
2957 if (cur_state)
2959 err = check_arrival_add_next_nodes (mctx, str_idx,
2960 &cur_state->non_eps_nodes, &next_nodes);
2961 if (BE (err != REG_NOERROR, 0))
2963 re_node_set_free (&next_nodes);
2964 return err;
2967 ++str_idx;
2968 if (next_nodes.nelem)
2970 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2971 if (BE (err != REG_NOERROR, 0))
2973 re_node_set_free (&next_nodes);
2974 return err;
2976 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2977 subexp_num, type);
2978 if (BE ( err != REG_NOERROR, 0))
2980 re_node_set_free (&next_nodes);
2981 return err;
2984 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2985 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2986 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2988 re_node_set_free (&next_nodes);
2989 return err;
2991 mctx->state_log[str_idx] = cur_state;
2992 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2994 re_node_set_free (&next_nodes);
2995 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2996 : &mctx->state_log[last_str]->nodes);
2997 path->next_idx = str_idx;
2999 /* Fix MCTX. */
3000 mctx->state_log = backup_state_log;
3001 mctx->input.cur_idx = backup_cur_idx;
3003 /* Then check the current node set has the node LAST_NODE. */
3004 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3005 return REG_NOERROR;
3007 return REG_NOMATCH;
3010 /* Helper functions for check_arrival. */
3012 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3013 to NEXT_NODES.
3014 TODO: This function is similar to the functions transit_state*(),
3015 however this function has many additional works.
3016 Can't we unify them? */
3018 static reg_errcode_t
3019 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3020 re_match_context_t *mctx;
3021 int str_idx;
3022 re_node_set *cur_nodes, *next_nodes;
3024 re_dfa_t *const dfa = mctx->dfa;
3025 int result;
3026 int cur_idx;
3027 reg_errcode_t err;
3028 re_node_set union_set;
3029 re_node_set_init_empty (&union_set);
3030 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3032 int naccepted = 0;
3033 int cur_node = cur_nodes->elems[cur_idx];
3034 #if defined DEBUG || defined RE_ENABLE_I18N
3035 re_token_type_t type = dfa->nodes[cur_node].type;
3036 #endif
3037 #ifdef DEBUG
3038 assert (!IS_EPSILON_NODE (type));
3039 #endif
3040 #ifdef RE_ENABLE_I18N
3041 /* If the node may accept `multi byte'. */
3042 if (ACCEPT_MB_NODE (type))
3044 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3045 str_idx);
3046 if (naccepted > 1)
3048 re_dfastate_t *dest_state;
3049 int next_node = dfa->nexts[cur_node];
3050 int next_idx = str_idx + naccepted;
3051 dest_state = mctx->state_log[next_idx];
3052 re_node_set_empty (&union_set);
3053 if (dest_state)
3055 err = re_node_set_merge (&union_set, &dest_state->nodes);
3056 if (BE (err != REG_NOERROR, 0))
3058 re_node_set_free (&union_set);
3059 return err;
3062 result = re_node_set_insert (&union_set, next_node);
3063 if (BE (result < 0, 0))
3065 re_node_set_free (&union_set);
3066 return REG_ESPACE;
3068 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3069 &union_set);
3070 if (BE (mctx->state_log[next_idx] == NULL
3071 && err != REG_NOERROR, 0))
3073 re_node_set_free (&union_set);
3074 return err;
3078 #endif /* RE_ENABLE_I18N */
3079 if (naccepted
3080 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3082 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3083 if (BE (result < 0, 0))
3085 re_node_set_free (&union_set);
3086 return REG_ESPACE;
3090 re_node_set_free (&union_set);
3091 return REG_NOERROR;
3094 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3095 CUR_NODES, however exclude the nodes which are:
3096 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3097 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3100 static reg_errcode_t
3101 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3102 re_dfa_t *dfa;
3103 re_node_set *cur_nodes;
3104 int ex_subexp, type;
3106 reg_errcode_t err;
3107 int idx, outside_node;
3108 re_node_set new_nodes;
3109 #ifdef DEBUG
3110 assert (cur_nodes->nelem);
3111 #endif
3112 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3113 if (BE (err != REG_NOERROR, 0))
3114 return err;
3115 /* Create a new node set NEW_NODES with the nodes which are epsilon
3116 closures of the node in CUR_NODES. */
3118 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3120 int cur_node = cur_nodes->elems[idx];
3121 re_node_set *eclosure = dfa->eclosures + cur_node;
3122 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3123 if (outside_node == -1)
3125 /* There are no problematic nodes, just merge them. */
3126 err = re_node_set_merge (&new_nodes, eclosure);
3127 if (BE (err != REG_NOERROR, 0))
3129 re_node_set_free (&new_nodes);
3130 return err;
3133 else
3135 /* There are problematic nodes, re-calculate incrementally. */
3136 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3137 ex_subexp, type);
3138 if (BE (err != REG_NOERROR, 0))
3140 re_node_set_free (&new_nodes);
3141 return err;
3145 re_node_set_free (cur_nodes);
3146 *cur_nodes = new_nodes;
3147 return REG_NOERROR;
3150 /* Helper function for check_arrival_expand_ecl.
3151 Check incrementally the epsilon closure of TARGET, and if it isn't
3152 problematic append it to DST_NODES. */
3154 static reg_errcode_t
3155 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3156 re_dfa_t *dfa;
3157 int target, ex_subexp, type;
3158 re_node_set *dst_nodes;
3160 int cur_node;
3161 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3163 int err;
3165 if (dfa->nodes[cur_node].type == type
3166 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3168 if (type == OP_CLOSE_SUBEXP)
3170 err = re_node_set_insert (dst_nodes, cur_node);
3171 if (BE (err == -1, 0))
3172 return REG_ESPACE;
3174 break;
3176 err = re_node_set_insert (dst_nodes, cur_node);
3177 if (BE (err == -1, 0))
3178 return REG_ESPACE;
3179 if (dfa->edests[cur_node].nelem == 0)
3180 break;
3181 if (dfa->edests[cur_node].nelem == 2)
3183 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3184 dfa->edests[cur_node].elems[1],
3185 ex_subexp, type);
3186 if (BE (err != REG_NOERROR, 0))
3187 return err;
3189 cur_node = dfa->edests[cur_node].elems[0];
3191 return REG_NOERROR;
3195 /* For all the back references in the current state, calculate the
3196 destination of the back references by the appropriate entry
3197 in MCTX->BKREF_ENTS. */
3199 static reg_errcode_t
3200 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3201 type)
3202 re_match_context_t *mctx;
3203 int cur_str, subexp_num, type;
3204 re_node_set *cur_nodes;
3206 re_dfa_t *const dfa = mctx->dfa;
3207 reg_errcode_t err;
3208 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3209 struct re_backref_cache_entry *ent;
3211 if (cache_idx_start == -1)
3212 return REG_NOERROR;
3214 restart:
3215 ent = mctx->bkref_ents + cache_idx_start;
3218 int to_idx, next_node;
3220 /* Is this entry ENT is appropriate? */
3221 if (!re_node_set_contains (cur_nodes, ent->node))
3222 continue; /* No. */
3224 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3225 /* Calculate the destination of the back reference, and append it
3226 to MCTX->STATE_LOG. */
3227 if (to_idx == cur_str)
3229 /* The backreference did epsilon transit, we must re-check all the
3230 node in the current state. */
3231 re_node_set new_dests;
3232 reg_errcode_t err2, err3;
3233 next_node = dfa->edests[ent->node].elems[0];
3234 if (re_node_set_contains (cur_nodes, next_node))
3235 continue;
3236 err = re_node_set_init_1 (&new_dests, next_node);
3237 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3238 err3 = re_node_set_merge (cur_nodes, &new_dests);
3239 re_node_set_free (&new_dests);
3240 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3241 || err3 != REG_NOERROR, 0))
3243 err = (err != REG_NOERROR ? err
3244 : (err2 != REG_NOERROR ? err2 : err3));
3245 return err;
3247 /* TODO: It is still inefficient... */
3248 goto restart;
3250 else
3252 re_node_set union_set;
3253 next_node = dfa->nexts[ent->node];
3254 if (mctx->state_log[to_idx])
3256 int ret;
3257 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3258 next_node))
3259 continue;
3260 err = re_node_set_init_copy (&union_set,
3261 &mctx->state_log[to_idx]->nodes);
3262 ret = re_node_set_insert (&union_set, next_node);
3263 if (BE (err != REG_NOERROR || ret < 0, 0))
3265 re_node_set_free (&union_set);
3266 err = err != REG_NOERROR ? err : REG_ESPACE;
3267 return err;
3270 else
3272 err = re_node_set_init_1 (&union_set, next_node);
3273 if (BE (err != REG_NOERROR, 0))
3274 return err;
3276 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3277 re_node_set_free (&union_set);
3278 if (BE (mctx->state_log[to_idx] == NULL
3279 && err != REG_NOERROR, 0))
3280 return err;
3283 while (ent++->more);
3284 return REG_NOERROR;
3287 /* Build transition table for the state.
3288 Return the new table if succeeded, otherwise return NULL. */
3290 static re_dfastate_t **
3291 build_trtable (dfa, state)
3292 re_dfa_t *dfa;
3293 re_dfastate_t *state;
3295 reg_errcode_t err;
3296 int i, j, ch;
3297 unsigned int elem, mask;
3298 int dests_node_malloced = 0, dest_states_malloced = 0;
3299 int ndests; /* Number of the destination states from `state'. */
3300 re_dfastate_t **trtable;
3301 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3302 re_node_set follows, *dests_node;
3303 bitset *dests_ch;
3304 bitset acceptable;
3306 /* We build DFA states which corresponds to the destination nodes
3307 from `state'. `dests_node[i]' represents the nodes which i-th
3308 destination state contains, and `dests_ch[i]' represents the
3309 characters which i-th destination state accepts. */
3310 #ifdef _LIBC
3311 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3312 dests_node = (re_node_set *)
3313 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3314 else
3315 #endif
3317 dests_node = (re_node_set *)
3318 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3319 if (BE (dests_node == NULL, 0))
3320 return NULL;
3321 dests_node_malloced = 1;
3323 dests_ch = (bitset *) (dests_node + SBC_MAX);
3325 /* Initialize transiton table. */
3326 state->word_trtable = 0;
3328 /* At first, group all nodes belonging to `state' into several
3329 destinations. */
3330 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3331 if (BE (ndests <= 0, 0))
3333 if (dests_node_malloced)
3334 free (dests_node);
3335 /* Return NULL in case of an error, trtable otherwise. */
3336 if (ndests == 0)
3338 state->trtable = (re_dfastate_t **)
3339 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3340 return state->trtable;
3342 return NULL;
3345 err = re_node_set_alloc (&follows, ndests + 1);
3346 if (BE (err != REG_NOERROR, 0))
3347 goto out_free;
3349 #ifdef _LIBC
3350 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3351 + ndests * 3 * sizeof (re_dfastate_t *)))
3352 dest_states = (re_dfastate_t **)
3353 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3354 else
3355 #endif
3357 dest_states = (re_dfastate_t **)
3358 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3359 if (BE (dest_states == NULL, 0))
3361 out_free:
3362 if (dest_states_malloced)
3363 free (dest_states);
3364 re_node_set_free (&follows);
3365 for (i = 0; i < ndests; ++i)
3366 re_node_set_free (dests_node + i);
3367 if (dests_node_malloced)
3368 free (dests_node);
3369 return NULL;
3371 dest_states_malloced = 1;
3373 dest_states_word = dest_states + ndests;
3374 dest_states_nl = dest_states_word + ndests;
3375 bitset_empty (acceptable);
3377 /* Then build the states for all destinations. */
3378 for (i = 0; i < ndests; ++i)
3380 int next_node;
3381 re_node_set_empty (&follows);
3382 /* Merge the follows of this destination states. */
3383 for (j = 0; j < dests_node[i].nelem; ++j)
3385 next_node = dfa->nexts[dests_node[i].elems[j]];
3386 if (next_node != -1)
3388 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3389 if (BE (err != REG_NOERROR, 0))
3390 goto out_free;
3393 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3394 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3395 goto out_free;
3396 /* If the new state has context constraint,
3397 build appropriate states for these contexts. */
3398 if (dest_states[i]->has_constraint)
3400 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3401 CONTEXT_WORD);
3402 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3403 goto out_free;
3405 if (dest_states[i] != dest_states_word[i]
3406 && dfa->mb_cur_max > 1)
3407 state->word_trtable = 1;
3409 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3410 CONTEXT_NEWLINE);
3411 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3412 goto out_free;
3414 else
3416 dest_states_word[i] = dest_states[i];
3417 dest_states_nl[i] = dest_states[i];
3419 bitset_merge (acceptable, dests_ch[i]);
3422 if (!BE (state->word_trtable, 0))
3424 /* We don't care about whether the following character is a word
3425 character, or we are in a single-byte character set so we can
3426 discern by looking at the character code: allocate a
3427 256-entry transition table. */
3428 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3429 if (BE (trtable == NULL, 0))
3430 goto out_free;
3432 /* For all characters ch...: */
3433 for (i = 0; i < BITSET_UINTS; ++i)
3434 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3435 elem;
3436 mask <<= 1, elem >>= 1, ++ch)
3437 if (BE (elem & 1, 0))
3439 /* There must be exactly one destination which accepts
3440 character ch. See group_nodes_into_DFAstates. */
3441 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3444 /* j-th destination accepts the word character ch. */
3445 if (dfa->word_char[i] & mask)
3446 trtable[ch] = dest_states_word[j];
3447 else
3448 trtable[ch] = dest_states[j];
3451 else
3453 /* We care about whether the following character is a word
3454 character, and we are in a multi-byte character set: discern
3455 by looking at the character code: build two 256-entry
3456 transition tables, one starting at trtable[0] and one
3457 starting at trtable[SBC_MAX]. */
3458 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3459 2 * SBC_MAX);
3460 if (BE (trtable == NULL, 0))
3461 goto out_free;
3463 /* For all characters ch...: */
3464 for (i = 0; i < BITSET_UINTS; ++i)
3465 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3466 elem;
3467 mask <<= 1, elem >>= 1, ++ch)
3468 if (BE (elem & 1, 0))
3470 /* There must be exactly one destination which accepts
3471 character ch. See group_nodes_into_DFAstates. */
3472 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475 /* j-th destination accepts the word character ch. */
3476 trtable[ch] = dest_states[j];
3477 trtable[ch + SBC_MAX] = dest_states_word[j];
3481 /* new line */
3482 if (bitset_contain (acceptable, NEWLINE_CHAR))
3484 /* The current state accepts newline character. */
3485 for (j = 0; j < ndests; ++j)
3486 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3488 /* k-th destination accepts newline character. */
3489 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3490 if (state->word_trtable)
3491 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3492 /* There must be only one destination which accepts
3493 newline. See group_nodes_into_DFAstates. */
3494 break;
3498 if (dest_states_malloced)
3499 free (dest_states);
3501 re_node_set_free (&follows);
3502 for (i = 0; i < ndests; ++i)
3503 re_node_set_free (dests_node + i);
3505 if (dests_node_malloced)
3506 free (dests_node);
3508 state->trtable = trtable;
3509 return trtable;
3512 /* Group all nodes belonging to STATE into several destinations.
3513 Then for all destinations, set the nodes belonging to the destination
3514 to DESTS_NODE[i] and set the characters accepted by the destination
3515 to DEST_CH[i]. This function return the number of destinations. */
3517 static int
3518 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3519 re_dfa_t *dfa;
3520 const re_dfastate_t *state;
3521 re_node_set *dests_node;
3522 bitset *dests_ch;
3524 reg_errcode_t err;
3525 int result;
3526 int i, j, k;
3527 int ndests; /* Number of the destinations from `state'. */
3528 bitset accepts; /* Characters a node can accept. */
3529 const re_node_set *cur_nodes = &state->nodes;
3530 bitset_empty (accepts);
3531 ndests = 0;
3533 /* For all the nodes belonging to `state', */
3534 for (i = 0; i < cur_nodes->nelem; ++i)
3536 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537 re_token_type_t type = node->type;
3538 unsigned int constraint = node->constraint;
3540 /* Enumerate all single byte character this node can accept. */
3541 if (type == CHARACTER)
3542 bitset_set (accepts, node->opr.c);
3543 else if (type == SIMPLE_BRACKET)
3545 bitset_merge (accepts, node->opr.sbcset);
3547 else if (type == OP_PERIOD)
3549 #ifdef RE_ENABLE_I18N
3550 if (dfa->mb_cur_max > 1)
3551 bitset_merge (accepts, dfa->sb_char);
3552 else
3553 #endif
3554 bitset_set_all (accepts);
3555 if (!(dfa->syntax & RE_DOT_NEWLINE))
3556 bitset_clear (accepts, '\n');
3557 if (dfa->syntax & RE_DOT_NOT_NULL)
3558 bitset_clear (accepts, '\0');
3560 #ifdef RE_ENABLE_I18N
3561 else if (type == OP_UTF8_PERIOD)
3563 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3564 if (!(dfa->syntax & RE_DOT_NEWLINE))
3565 bitset_clear (accepts, '\n');
3566 if (dfa->syntax & RE_DOT_NOT_NULL)
3567 bitset_clear (accepts, '\0');
3569 #endif
3570 else
3571 continue;
3573 /* Check the `accepts' and sift the characters which are not
3574 match it the context. */
3575 if (constraint)
3577 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3579 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580 bitset_empty (accepts);
3581 if (accepts_newline)
3582 bitset_set (accepts, NEWLINE_CHAR);
3583 else
3584 continue;
3586 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3588 bitset_empty (accepts);
3589 continue;
3592 if (constraint & NEXT_WORD_CONSTRAINT)
3594 unsigned int any_set = 0;
3595 if (type == CHARACTER && !node->word_char)
3597 bitset_empty (accepts);
3598 continue;
3600 #ifdef RE_ENABLE_I18N
3601 if (dfa->mb_cur_max > 1)
3602 for (j = 0; j < BITSET_UINTS; ++j)
3603 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604 else
3605 #endif
3606 for (j = 0; j < BITSET_UINTS; ++j)
3607 any_set |= (accepts[j] &= dfa->word_char[j]);
3608 if (!any_set)
3609 continue;
3611 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3613 unsigned int any_set = 0;
3614 if (type == CHARACTER && node->word_char)
3616 bitset_empty (accepts);
3617 continue;
3619 #ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 for (j = 0; j < BITSET_UINTS; ++j)
3622 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623 else
3624 #endif
3625 for (j = 0; j < BITSET_UINTS; ++j)
3626 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627 if (!any_set)
3628 continue;
3632 /* Then divide `accepts' into DFA states, or create a new
3633 state. Above, we make sure that accepts is not empty. */
3634 for (j = 0; j < ndests; ++j)
3636 bitset intersec; /* Intersection sets, see below. */
3637 bitset remains;
3638 /* Flags, see below. */
3639 int has_intersec, not_subset, not_consumed;
3641 /* Optimization, skip if this state doesn't accept the character. */
3642 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643 continue;
3645 /* Enumerate the intersection set of this state and `accepts'. */
3646 has_intersec = 0;
3647 for (k = 0; k < BITSET_UINTS; ++k)
3648 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649 /* And skip if the intersection set is empty. */
3650 if (!has_intersec)
3651 continue;
3653 /* Then check if this state is a subset of `accepts'. */
3654 not_subset = not_consumed = 0;
3655 for (k = 0; k < BITSET_UINTS; ++k)
3657 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3661 /* If this state isn't a subset of `accepts', create a
3662 new group state, which has the `remains'. */
3663 if (not_subset)
3665 bitset_copy (dests_ch[ndests], remains);
3666 bitset_copy (dests_ch[j], intersec);
3667 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668 if (BE (err != REG_NOERROR, 0))
3669 goto error_return;
3670 ++ndests;
3673 /* Put the position in the current group. */
3674 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675 if (BE (result < 0, 0))
3676 goto error_return;
3678 /* If all characters are consumed, go to next node. */
3679 if (!not_consumed)
3680 break;
3682 /* Some characters remain, create a new group. */
3683 if (j == ndests)
3685 bitset_copy (dests_ch[ndests], accepts);
3686 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687 if (BE (err != REG_NOERROR, 0))
3688 goto error_return;
3689 ++ndests;
3690 bitset_empty (accepts);
3693 return ndests;
3694 error_return:
3695 for (j = 0; j < ndests; ++j)
3696 re_node_set_free (dests_node + j);
3697 return -1;
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702 Return the number of the bytes the node accepts.
3703 STR_IDX is the current index of the input string.
3705 This function handles the nodes which can accept one character, or
3706 one collating element like '.', '[a-z]', opposite to the other nodes
3707 can only accept one byte. */
3709 static int
3710 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3711 re_dfa_t *dfa;
3712 int node_idx, str_idx;
3713 const re_string_t *input;
3715 const re_token_t *node = dfa->nodes + node_idx;
3716 int char_len, elem_len;
3717 int i;
3719 if (BE (node->type == OP_UTF8_PERIOD, 0))
3721 unsigned char c = re_string_byte_at (input, str_idx), d;
3722 if (BE (c < 0xc2, 1))
3723 return 0;
3725 if (str_idx + 2 > input->len)
3726 return 0;
3728 d = re_string_byte_at (input, str_idx + 1);
3729 if (c < 0xe0)
3730 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3731 else if (c < 0xf0)
3733 char_len = 3;
3734 if (c == 0xe0 && d < 0xa0)
3735 return 0;
3737 else if (c < 0xf8)
3739 char_len = 4;
3740 if (c == 0xf0 && d < 0x90)
3741 return 0;
3743 else if (c < 0xfc)
3745 char_len = 5;
3746 if (c == 0xf8 && d < 0x88)
3747 return 0;
3749 else if (c < 0xfe)
3751 char_len = 6;
3752 if (c == 0xfc && d < 0x84)
3753 return 0;
3755 else
3756 return 0;
3758 if (str_idx + char_len > input->len)
3759 return 0;
3761 for (i = 1; i < char_len; ++i)
3763 d = re_string_byte_at (input, str_idx + i);
3764 if (d < 0x80 || d > 0xbf)
3765 return 0;
3767 return char_len;
3770 char_len = re_string_char_size_at (input, str_idx);
3771 if (node->type == OP_PERIOD)
3773 if (char_len <= 1)
3774 return 0;
3775 /* FIXME: I don't think this if is needed, as both '\n'
3776 and '\0' are char_len == 1. */
3777 /* '.' accepts any one character except the following two cases. */
3778 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3779 re_string_byte_at (input, str_idx) == '\n') ||
3780 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3781 re_string_byte_at (input, str_idx) == '\0'))
3782 return 0;
3783 return char_len;
3786 elem_len = re_string_elem_size_at (input, str_idx);
3787 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3788 return 0;
3790 if (node->type == COMPLEX_BRACKET)
3792 const re_charset_t *cset = node->opr.mbcset;
3793 # ifdef _LIBC
3794 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3795 + str_idx);
3796 int j;
3797 uint32_t nrules;
3798 # endif /* _LIBC */
3799 int match_len = 0;
3800 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3801 ? re_string_wchar_at (input, str_idx) : 0);
3803 /* match with multibyte character? */
3804 for (i = 0; i < cset->nmbchars; ++i)
3805 if (wc == cset->mbchars[i])
3807 match_len = char_len;
3808 goto check_node_accept_bytes_match;
3810 /* match with character_class? */
3811 for (i = 0; i < cset->nchar_classes; ++i)
3813 wctype_t wt = cset->char_classes[i];
3814 if (__iswctype (wc, wt))
3816 match_len = char_len;
3817 goto check_node_accept_bytes_match;
3821 # ifdef _LIBC
3822 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3823 if (nrules != 0)
3825 unsigned int in_collseq = 0;
3826 const int32_t *table, *indirect;
3827 const unsigned char *weights, *extra;
3828 const char *collseqwc;
3829 int32_t idx;
3830 /* This #include defines a local function! */
3831 # include <locale/weight.h>
3833 /* match with collating_symbol? */
3834 if (cset->ncoll_syms)
3835 extra = (const unsigned char *)
3836 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3837 for (i = 0; i < cset->ncoll_syms; ++i)
3839 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3840 /* Compare the length of input collating element and
3841 the length of current collating element. */
3842 if (*coll_sym != elem_len)
3843 continue;
3844 /* Compare each bytes. */
3845 for (j = 0; j < *coll_sym; j++)
3846 if (pin[j] != coll_sym[1 + j])
3847 break;
3848 if (j == *coll_sym)
3850 /* Match if every bytes is equal. */
3851 match_len = j;
3852 goto check_node_accept_bytes_match;
3856 if (cset->nranges)
3858 if (elem_len <= char_len)
3860 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3861 in_collseq = __collseq_table_lookup (collseqwc, wc);
3863 else
3864 in_collseq = find_collation_sequence_value (pin, elem_len);
3866 /* match with range expression? */
3867 for (i = 0; i < cset->nranges; ++i)
3868 if (cset->range_starts[i] <= in_collseq
3869 && in_collseq <= cset->range_ends[i])
3871 match_len = elem_len;
3872 goto check_node_accept_bytes_match;
3875 /* match with equivalence_class? */
3876 if (cset->nequiv_classes)
3878 const unsigned char *cp = pin;
3879 table = (const int32_t *)
3880 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3881 weights = (const unsigned char *)
3882 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3883 extra = (const unsigned char *)
3884 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3885 indirect = (const int32_t *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3887 idx = findidx (&cp);
3888 if (idx > 0)
3889 for (i = 0; i < cset->nequiv_classes; ++i)
3891 int32_t equiv_class_idx = cset->equiv_classes[i];
3892 size_t weight_len = weights[idx];
3893 if (weight_len == weights[equiv_class_idx])
3895 int cnt = 0;
3896 while (cnt <= weight_len
3897 && (weights[equiv_class_idx + 1 + cnt]
3898 == weights[idx + 1 + cnt]))
3899 ++cnt;
3900 if (cnt > weight_len)
3902 match_len = elem_len;
3903 goto check_node_accept_bytes_match;
3909 else
3910 # endif /* _LIBC */
3912 /* match with range expression? */
3913 #if __GNUC__ >= 2
3914 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3915 #else
3916 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3917 cmp_buf[2] = wc;
3918 #endif
3919 for (i = 0; i < cset->nranges; ++i)
3921 cmp_buf[0] = cset->range_starts[i];
3922 cmp_buf[4] = cset->range_ends[i];
3923 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3924 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3926 match_len = char_len;
3927 goto check_node_accept_bytes_match;
3931 check_node_accept_bytes_match:
3932 if (!cset->non_match)
3933 return match_len;
3934 else
3936 if (match_len > 0)
3937 return 0;
3938 else
3939 return (elem_len > char_len) ? elem_len : char_len;
3942 return 0;
3945 # ifdef _LIBC
3946 static unsigned int
3947 find_collation_sequence_value (mbs, mbs_len)
3948 const unsigned char *mbs;
3949 size_t mbs_len;
3951 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3952 if (nrules == 0)
3954 if (mbs_len == 1)
3956 /* No valid character. Match it as a single byte character. */
3957 const unsigned char *collseq = (const unsigned char *)
3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3959 return collseq[mbs[0]];
3961 return UINT_MAX;
3963 else
3965 int32_t idx;
3966 const unsigned char *extra = (const unsigned char *)
3967 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3968 int32_t extrasize = (const unsigned char *)
3969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3971 for (idx = 0; idx < extrasize;)
3973 int mbs_cnt, found = 0;
3974 int32_t elem_mbs_len;
3975 /* Skip the name of collating element name. */
3976 idx = idx + extra[idx] + 1;
3977 elem_mbs_len = extra[idx++];
3978 if (mbs_len == elem_mbs_len)
3980 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3981 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3982 break;
3983 if (mbs_cnt == elem_mbs_len)
3984 /* Found the entry. */
3985 found = 1;
3987 /* Skip the byte sequence of the collating element. */
3988 idx += elem_mbs_len;
3989 /* Adjust for the alignment. */
3990 idx = (idx + 3) & ~3;
3991 /* Skip the collation sequence value. */
3992 idx += sizeof (uint32_t);
3993 /* Skip the wide char sequence of the collating element. */
3994 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3995 /* If we found the entry, return the sequence value. */
3996 if (found)
3997 return *(uint32_t *) (extra + idx);
3998 /* Skip the collation sequence value. */
3999 idx += sizeof (uint32_t);
4001 return UINT_MAX;
4004 # endif /* _LIBC */
4005 #endif /* RE_ENABLE_I18N */
4007 /* Check whether the node accepts the byte which is IDX-th
4008 byte of the INPUT. */
4010 static int
4011 check_node_accept (mctx, node, idx)
4012 const re_match_context_t *mctx;
4013 const re_token_t *node;
4014 int idx;
4016 unsigned char ch;
4017 ch = re_string_byte_at (&mctx->input, idx);
4018 switch (node->type)
4020 case CHARACTER:
4021 if (node->opr.c != ch)
4022 return 0;
4023 break;
4025 case SIMPLE_BRACKET:
4026 if (!bitset_contain (node->opr.sbcset, ch))
4027 return 0;
4028 break;
4030 #ifdef RE_ENABLE_I18N
4031 case OP_UTF8_PERIOD:
4032 if (ch >= 0x80)
4033 return 0;
4034 /* FALLTHROUGH */
4035 #endif
4036 case OP_PERIOD:
4037 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4038 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4039 return 0;
4040 break;
4042 default:
4043 return 0;
4046 if (node->constraint)
4048 /* The node has constraints. Check whether the current context
4049 satisfies the constraints. */
4050 unsigned int context = re_string_context_at (&mctx->input, idx,
4051 mctx->eflags);
4052 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4053 return 0;
4056 return 1;
4059 /* Extend the buffers, if the buffers have run out. */
4061 static reg_errcode_t
4062 extend_buffers (mctx)
4063 re_match_context_t *mctx;
4065 reg_errcode_t ret;
4066 re_string_t *pstr = &mctx->input;
4068 /* Double the lengthes of the buffers. */
4069 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4070 if (BE (ret != REG_NOERROR, 0))
4071 return ret;
4073 if (mctx->state_log != NULL)
4075 /* And double the length of state_log. */
4076 /* XXX We have no indication of the size of this buffer. If this
4077 allocation fail we have no indication that the state_log array
4078 does not have the right size. */
4079 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4080 pstr->bufs_len + 1);
4081 if (BE (new_array == NULL, 0))
4082 return REG_ESPACE;
4083 mctx->state_log = new_array;
4086 /* Then reconstruct the buffers. */
4087 if (pstr->icase)
4089 #ifdef RE_ENABLE_I18N
4090 if (pstr->mb_cur_max > 1)
4092 ret = build_wcs_upper_buffer (pstr);
4093 if (BE (ret != REG_NOERROR, 0))
4094 return ret;
4096 else
4097 #endif /* RE_ENABLE_I18N */
4098 build_upper_buffer (pstr);
4100 else
4102 #ifdef RE_ENABLE_I18N
4103 if (pstr->mb_cur_max > 1)
4104 build_wcs_buffer (pstr);
4105 else
4106 #endif /* RE_ENABLE_I18N */
4108 if (pstr->trans != NULL)
4109 re_string_translate_buffer (pstr);
4112 return REG_NOERROR;
4116 /* Functions for matching context. */
4118 /* Initialize MCTX. */
4120 static reg_errcode_t
4121 match_ctx_init (mctx, eflags, n)
4122 re_match_context_t *mctx;
4123 int eflags, n;
4125 mctx->eflags = eflags;
4126 mctx->match_last = -1;
4127 if (n > 0)
4129 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4130 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4131 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4132 return REG_ESPACE;
4134 /* Already zero-ed by the caller.
4135 else
4136 mctx->bkref_ents = NULL;
4137 mctx->nbkref_ents = 0;
4138 mctx->nsub_tops = 0; */
4139 mctx->abkref_ents = n;
4140 mctx->max_mb_elem_len = 1;
4141 mctx->asub_tops = n;
4142 return REG_NOERROR;
4145 /* Clean the entries which depend on the current input in MCTX.
4146 This function must be invoked when the matcher changes the start index
4147 of the input, or changes the input string. */
4149 static void
4150 match_ctx_clean (mctx)
4151 re_match_context_t *mctx;
4153 int st_idx;
4154 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4156 int sl_idx;
4157 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4158 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4160 re_sub_match_last_t *last = top->lasts[sl_idx];
4161 re_free (last->path.array);
4162 re_free (last);
4164 re_free (top->lasts);
4165 if (top->path)
4167 re_free (top->path->array);
4168 re_free (top->path);
4170 free (top);
4173 mctx->nsub_tops = 0;
4174 mctx->nbkref_ents = 0;
4177 /* Free all the memory associated with MCTX. */
4179 static void
4180 match_ctx_free (mctx)
4181 re_match_context_t *mctx;
4183 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4184 match_ctx_clean (mctx);
4185 re_free (mctx->sub_tops);
4186 re_free (mctx->bkref_ents);
4189 /* Add a new backreference entry to MCTX.
4190 Note that we assume that caller never call this function with duplicate
4191 entry, and call with STR_IDX which isn't smaller than any existing entry.
4194 static reg_errcode_t
4195 match_ctx_add_entry (mctx, node, str_idx, from, to)
4196 re_match_context_t *mctx;
4197 int node, str_idx, from, to;
4199 if (mctx->nbkref_ents >= mctx->abkref_ents)
4201 struct re_backref_cache_entry* new_entry;
4202 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4203 mctx->abkref_ents * 2);
4204 if (BE (new_entry == NULL, 0))
4206 re_free (mctx->bkref_ents);
4207 return REG_ESPACE;
4209 mctx->bkref_ents = new_entry;
4210 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4211 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4212 mctx->abkref_ents *= 2;
4214 if (mctx->nbkref_ents > 0
4215 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4216 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4218 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4219 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4220 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4221 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4223 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4224 If bit N is clear, means that this entry won't epsilon-transition to
4225 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4226 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4227 such node.
4229 A backreference does not epsilon-transition unless it is empty, so set
4230 to all zeros if FROM != TO. */
4231 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4232 = (from == to ? ~0 : 0);
4234 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4235 if (mctx->max_mb_elem_len < to - from)
4236 mctx->max_mb_elem_len = to - from;
4237 return REG_NOERROR;
4240 /* Search for the first entry which has the same str_idx, or -1 if none is
4241 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4243 static int
4244 search_cur_bkref_entry (mctx, str_idx)
4245 re_match_context_t *mctx;
4246 int str_idx;
4248 int left, right, mid, last;
4249 last = right = mctx->nbkref_ents;
4250 for (left = 0; left < right;)
4252 mid = (left + right) / 2;
4253 if (mctx->bkref_ents[mid].str_idx < str_idx)
4254 left = mid + 1;
4255 else
4256 right = mid;
4258 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4259 return left;
4260 else
4261 return -1;
4264 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4265 at STR_IDX. */
4267 static reg_errcode_t
4268 match_ctx_add_subtop (mctx, node, str_idx)
4269 re_match_context_t *mctx;
4270 int node, str_idx;
4272 #ifdef DEBUG
4273 assert (mctx->sub_tops != NULL);
4274 assert (mctx->asub_tops > 0);
4275 #endif
4276 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4278 int new_asub_tops = mctx->asub_tops * 2;
4279 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4280 re_sub_match_top_t *,
4281 new_asub_tops);
4282 if (BE (new_array == NULL, 0))
4283 return REG_ESPACE;
4284 mctx->sub_tops = new_array;
4285 mctx->asub_tops = new_asub_tops;
4287 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4288 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4289 return REG_ESPACE;
4290 mctx->sub_tops[mctx->nsub_tops]->node = node;
4291 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4292 return REG_NOERROR;
4295 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4296 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4298 static re_sub_match_last_t *
4299 match_ctx_add_sublast (subtop, node, str_idx)
4300 re_sub_match_top_t *subtop;
4301 int node, str_idx;
4303 re_sub_match_last_t *new_entry;
4304 if (BE (subtop->nlasts == subtop->alasts, 0))
4306 int new_alasts = 2 * subtop->alasts + 1;
4307 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4308 re_sub_match_last_t *,
4309 new_alasts);
4310 if (BE (new_array == NULL, 0))
4311 return NULL;
4312 subtop->lasts = new_array;
4313 subtop->alasts = new_alasts;
4315 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4316 if (BE (new_entry != NULL, 1))
4318 subtop->lasts[subtop->nlasts] = new_entry;
4319 new_entry->node = node;
4320 new_entry->str_idx = str_idx;
4321 ++subtop->nlasts;
4323 return new_entry;
4326 static void
4327 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4328 re_sift_context_t *sctx;
4329 re_dfastate_t **sifted_sts, **limited_sts;
4330 int last_node, last_str_idx;
4332 sctx->sifted_states = sifted_sts;
4333 sctx->limited_states = limited_sts;
4334 sctx->last_node = last_node;
4335 sctx->last_str_idx = last_str_idx;
4336 re_node_set_init_empty (&sctx->limits);