Updated to fedora-glibc-20070814T0725
[glibc.git] / posix / regexec.c
blobbdfa3550a7d3873ad7f0d001afcf04a09a93c766
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
77 #ifdef RE_ENABLE_I18N
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99 static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142 #endif
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188 # ifdef _LIBC
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192 # endif /* _LIBC */
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198 static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
207 string STRING.
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg, string, nmatch, pmatch, eflags)
222 const regex_t *__restrict preg;
223 const char *__restrict string;
224 size_t nmatch;
225 regmatch_t pmatch[];
226 int eflags;
228 reg_errcode_t err;
229 int start, length;
230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
233 return REG_BADPAT;
235 if (eflags & REG_STARTEND)
237 start = pmatch[0].rm_so;
238 length = pmatch[0].rm_eo;
240 else
242 start = 0;
243 length = strlen (string);
246 __libc_lock_lock (dfa->lock);
247 if (preg->no_sub)
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, 0, NULL, eflags);
250 else
251 err = re_search_internal (preg, string, length, start, length - start,
252 length, nmatch, pmatch, eflags);
253 __libc_lock_unlock (dfa->lock);
254 return err != REG_NOERROR;
257 #ifdef _LIBC
258 # include <shlib-compat.h>
259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec) __compat_regexec;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t *__restrict preg,
267 const char *__restrict string, size_t nmatch,
268 regmatch_t pmatch[], int eflags)
270 return regexec (preg, string, nmatch, pmatch,
271 eflags & (REG_NOTBOL | REG_NOTEOL));
273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
274 # endif
275 #endif
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
291 way as re_match().)
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
295 concerned.
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
300 strings.)
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp, string, length, start, regs)
308 struct re_pattern_buffer *bufp;
309 const char *string;
310 int length, start;
311 struct re_registers *regs;
313 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
315 #ifdef _LIBC
316 weak_alias (__re_match, re_match)
317 #endif
320 re_search (bufp, string, length, start, range, regs)
321 struct re_pattern_buffer *bufp;
322 const char *string;
323 int length, start, range;
324 struct re_registers *regs;
326 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
328 #ifdef _LIBC
329 weak_alias (__re_search, re_search)
330 #endif
333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
334 struct re_pattern_buffer *bufp;
335 const char *string1, *string2;
336 int length1, length2, start, stop;
337 struct re_registers *regs;
339 return re_search_2_stub (bufp, string1, length1, string2, length2,
340 start, 0, regs, stop, 1);
342 #ifdef _LIBC
343 weak_alias (__re_match_2, re_match_2)
344 #endif
347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
348 struct re_pattern_buffer *bufp;
349 const char *string1, *string2;
350 int length1, length2, start, range, stop;
351 struct re_registers *regs;
353 return re_search_2_stub (bufp, string1, length1, string2, length2,
354 start, range, regs, stop, 0);
356 #ifdef _LIBC
357 weak_alias (__re_search_2, re_search_2)
358 #endif
360 static int
361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
362 stop, ret_len)
363 struct re_pattern_buffer *bufp;
364 const char *string1, *string2;
365 int length1, length2, start, range, stop, ret_len;
366 struct re_registers *regs;
368 const char *str;
369 int rval;
370 int len = length1 + length2;
371 int free_str = 0;
373 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
374 return -2;
376 /* Concatenate the strings. */
377 if (length2 > 0)
378 if (length1 > 0)
380 char *s = re_malloc (char, len);
382 if (BE (s == NULL, 0))
383 return -2;
384 #ifdef _LIBC
385 memcpy (__mempcpy (s, string1, length1), string2, length2);
386 #else
387 memcpy (s, string1, length1);
388 memcpy (s + length1, string2, length2);
389 #endif
390 str = s;
391 free_str = 1;
393 else
394 str = string2;
395 else
396 str = string1;
398 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
399 ret_len);
400 if (free_str)
401 re_free ((char *) str);
402 return rval;
405 /* The parameters have the same meaning as those of re_search.
406 Additional parameters:
407 If RET_LEN is nonzero the length of the match is returned (re_match style);
408 otherwise the position of the match is returned. */
410 static int
411 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
412 struct re_pattern_buffer *bufp;
413 const char *string;
414 int length, start, range, stop, ret_len;
415 struct re_registers *regs;
417 reg_errcode_t result;
418 regmatch_t *pmatch;
419 int nregs, rval;
420 int eflags = 0;
421 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
423 /* Check for out-of-range. */
424 if (BE (start < 0 || start > length, 0))
425 return -1;
426 if (BE (start + range > length, 0))
427 range = length - start;
428 else if (BE (start + range < 0, 0))
429 range = -start;
431 __libc_lock_lock (dfa->lock);
433 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
434 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
436 /* Compile fastmap if we haven't yet. */
437 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
438 re_compile_fastmap (bufp);
440 if (BE (bufp->no_sub, 0))
441 regs = NULL;
443 /* We need at least 1 register. */
444 if (regs == NULL)
445 nregs = 1;
446 else if (BE (bufp->regs_allocated == REGS_FIXED &&
447 regs->num_regs < bufp->re_nsub + 1, 0))
449 nregs = regs->num_regs;
450 if (BE (nregs < 1, 0))
452 /* Nothing can be copied to regs. */
453 regs = NULL;
454 nregs = 1;
457 else
458 nregs = bufp->re_nsub + 1;
459 pmatch = re_malloc (regmatch_t, nregs);
460 if (BE (pmatch == NULL, 0))
462 rval = -2;
463 goto out;
466 result = re_search_internal (bufp, string, length, start, range, stop,
467 nregs, pmatch, eflags);
469 rval = 0;
471 /* I hope we needn't fill ther regs with -1's when no match was found. */
472 if (result != REG_NOERROR)
473 rval = -1;
474 else if (regs != NULL)
476 /* If caller wants register contents data back, copy them. */
477 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
478 bufp->regs_allocated);
479 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
480 rval = -2;
483 if (BE (rval == 0, 1))
485 if (ret_len)
487 assert (pmatch[0].rm_so == start);
488 rval = pmatch[0].rm_eo - start;
490 else
491 rval = pmatch[0].rm_so;
493 re_free (pmatch);
494 out:
495 __libc_lock_unlock (dfa->lock);
496 return rval;
499 static unsigned
500 re_copy_regs (regs, pmatch, nregs, regs_allocated)
501 struct re_registers *regs;
502 regmatch_t *pmatch;
503 int nregs, regs_allocated;
505 int rval = REGS_REALLOCATE;
506 int i;
507 int need_regs = nregs + 1;
508 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
509 uses. */
511 /* Have the register data arrays been allocated? */
512 if (regs_allocated == REGS_UNALLOCATED)
513 { /* No. So allocate them with malloc. */
514 regs->start = re_malloc (regoff_t, need_regs);
515 regs->end = re_malloc (regoff_t, need_regs);
516 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
517 return REGS_UNALLOCATED;
518 regs->num_regs = need_regs;
520 else if (regs_allocated == REGS_REALLOCATE)
521 { /* Yes. If we need more elements than were already
522 allocated, reallocate them. If we need fewer, just
523 leave it alone. */
524 if (BE (need_regs > regs->num_regs, 0))
526 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
527 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
528 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
529 return REGS_UNALLOCATED;
530 regs->start = new_start;
531 regs->end = new_end;
532 regs->num_regs = need_regs;
535 else
537 assert (regs_allocated == REGS_FIXED);
538 /* This function may not be called with REGS_FIXED and nregs too big. */
539 assert (regs->num_regs >= nregs);
540 rval = REGS_FIXED;
543 /* Copy the regs. */
544 for (i = 0; i < nregs; ++i)
546 regs->start[i] = pmatch[i].rm_so;
547 regs->end[i] = pmatch[i].rm_eo;
549 for ( ; i < regs->num_regs; ++i)
550 regs->start[i] = regs->end[i] = -1;
552 return rval;
555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
556 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
557 this memory for recording register information. STARTS and ENDS
558 must be allocated using the malloc library routine, and must each
559 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561 If NUM_REGS == 0, then subsequent matches should allocate their own
562 register data.
564 Unless this function is called, the first search or match using
565 PATTERN_BUFFER will allocate its own register data, without
566 freeing the old data. */
568 void
569 re_set_registers (bufp, regs, num_regs, starts, ends)
570 struct re_pattern_buffer *bufp;
571 struct re_registers *regs;
572 unsigned num_regs;
573 regoff_t *starts, *ends;
575 if (num_regs)
577 bufp->regs_allocated = REGS_REALLOCATE;
578 regs->num_regs = num_regs;
579 regs->start = starts;
580 regs->end = ends;
582 else
584 bufp->regs_allocated = REGS_UNALLOCATED;
585 regs->num_regs = 0;
586 regs->start = regs->end = (regoff_t *) 0;
589 #ifdef _LIBC
590 weak_alias (__re_set_registers, re_set_registers)
591 #endif
593 /* Entry points compatible with 4.2 BSD regex library. We don't define
594 them unless specifically requested. */
596 #if defined _REGEX_RE_COMP || defined _LIBC
598 # ifdef _LIBC
599 weak_function
600 # endif
601 re_exec (s)
602 const char *s;
604 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
606 #endif /* _REGEX_RE_COMP */
608 /* Internal entry point. */
610 /* Searches for a compiled pattern PREG in the string STRING, whose
611 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
612 mingings with regexec. START, and RANGE have the same meanings
613 with re_search.
614 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
615 otherwise return the error code.
616 Note: We assume front end functions already check ranges.
617 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
619 static reg_errcode_t
620 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
621 eflags)
622 const regex_t *preg;
623 const char *string;
624 int length, start, range, stop, eflags;
625 size_t nmatch;
626 regmatch_t pmatch[];
628 reg_errcode_t err;
629 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
630 int left_lim, right_lim, incr;
631 int fl_longest_match, match_first, match_kind, match_last = -1;
632 int extra_nmatch;
633 int sb, ch;
634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
635 re_match_context_t mctx = { .dfa = dfa };
636 #else
637 re_match_context_t mctx;
638 #endif
639 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
640 && range && !preg->can_be_null) ? preg->fastmap : NULL;
641 RE_TRANSLATE_TYPE t = preg->translate;
643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
644 memset (&mctx, '\0', sizeof (re_match_context_t));
645 mctx.dfa = dfa;
646 #endif
648 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
649 nmatch -= extra_nmatch;
651 /* Check if the DFA haven't been compiled. */
652 if (BE (preg->used == 0 || dfa->init_state == NULL
653 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
654 || dfa->init_state_begbuf == NULL, 0))
655 return REG_NOMATCH;
657 #ifdef DEBUG
658 /* We assume front-end functions already check them. */
659 assert (start + range >= 0 && start + range <= length);
660 #endif
662 /* If initial states with non-begbuf contexts have no elements,
663 the regex must be anchored. If preg->newline_anchor is set,
664 we'll never use init_state_nl, so do not check it. */
665 if (dfa->init_state->nodes.nelem == 0
666 && dfa->init_state_word->nodes.nelem == 0
667 && (dfa->init_state_nl->nodes.nelem == 0
668 || !preg->newline_anchor))
670 if (start != 0 && start + range != 0)
671 return REG_NOMATCH;
672 start = range = 0;
675 /* We must check the longest matching, if nmatch > 0. */
676 fl_longest_match = (nmatch != 0 || dfa->nbackref);
678 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
679 preg->translate, preg->syntax & RE_ICASE, dfa);
680 if (BE (err != REG_NOERROR, 0))
681 goto free_return;
682 mctx.input.stop = stop;
683 mctx.input.raw_stop = stop;
684 mctx.input.newline_anchor = preg->newline_anchor;
686 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
687 if (BE (err != REG_NOERROR, 0))
688 goto free_return;
690 /* We will log all the DFA states through which the dfa pass,
691 if nmatch > 1, or this dfa has "multibyte node", which is a
692 back-reference or a node which can accept multibyte character or
693 multi character collating element. */
694 if (nmatch > 1 || dfa->has_mb_node)
696 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
697 if (BE (mctx.state_log == NULL, 0))
699 err = REG_ESPACE;
700 goto free_return;
703 else
704 mctx.state_log = NULL;
706 match_first = start;
707 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
708 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
710 /* Check incrementally whether of not the input string match. */
711 incr = (range < 0) ? -1 : 1;
712 left_lim = (range < 0) ? start + range : start;
713 right_lim = (range < 0) ? start : start + range;
714 sb = dfa->mb_cur_max == 1;
715 match_kind =
716 (fastmap
717 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
718 | (range >= 0 ? 2 : 0)
719 | (t != NULL ? 1 : 0))
720 : 8);
722 for (;; match_first += incr)
724 err = REG_NOMATCH;
725 if (match_first < left_lim || right_lim < match_first)
726 goto free_return;
728 /* Advance as rapidly as possible through the string, until we
729 find a plausible place to start matching. This may be done
730 with varying efficiency, so there are various possibilities:
731 only the most common of them are specialized, in order to
732 save on code size. We use a switch statement for speed. */
733 switch (match_kind)
735 case 8:
736 /* No fastmap. */
737 break;
739 case 7:
740 /* Fastmap with single-byte translation, match forward. */
741 while (BE (match_first < right_lim, 1)
742 && !fastmap[t[(unsigned char) string[match_first]]])
743 ++match_first;
744 goto forward_match_found_start_or_reached_end;
746 case 6:
747 /* Fastmap without translation, match forward. */
748 while (BE (match_first < right_lim, 1)
749 && !fastmap[(unsigned char) string[match_first]])
750 ++match_first;
752 forward_match_found_start_or_reached_end:
753 if (BE (match_first == right_lim, 0))
755 ch = match_first >= length
756 ? 0 : (unsigned char) string[match_first];
757 if (!fastmap[t ? t[ch] : ch])
758 goto free_return;
760 break;
762 case 4:
763 case 5:
764 /* Fastmap without multi-byte translation, match backwards. */
765 while (match_first >= left_lim)
767 ch = match_first >= length
768 ? 0 : (unsigned char) string[match_first];
769 if (fastmap[t ? t[ch] : ch])
770 break;
771 --match_first;
773 if (match_first < left_lim)
774 goto free_return;
775 break;
777 default:
778 /* In this case, we can't determine easily the current byte,
779 since it might be a component byte of a multibyte
780 character. Then we use the constructed buffer instead. */
781 for (;;)
783 /* If MATCH_FIRST is out of the valid range, reconstruct the
784 buffers. */
785 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
786 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
788 err = re_string_reconstruct (&mctx.input, match_first,
789 eflags);
790 if (BE (err != REG_NOERROR, 0))
791 goto free_return;
793 offset = match_first - mctx.input.raw_mbs_idx;
795 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
796 Note that MATCH_FIRST must not be smaller than 0. */
797 ch = (match_first >= length
798 ? 0 : re_string_byte_at (&mctx.input, offset));
799 if (fastmap[ch])
800 break;
801 match_first += incr;
802 if (match_first < left_lim || match_first > right_lim)
804 err = REG_NOMATCH;
805 goto free_return;
808 break;
811 /* Reconstruct the buffers so that the matcher can assume that
812 the matching starts from the beginning of the buffer. */
813 err = re_string_reconstruct (&mctx.input, match_first, eflags);
814 if (BE (err != REG_NOERROR, 0))
815 goto free_return;
817 #ifdef RE_ENABLE_I18N
818 /* Don't consider this char as a possible match start if it part,
819 yet isn't the head, of a multibyte character. */
820 if (!sb && !re_string_first_byte (&mctx.input, 0))
821 continue;
822 #endif
824 /* It seems to be appropriate one, then use the matcher. */
825 /* We assume that the matching starts from 0. */
826 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
827 match_last = check_matching (&mctx, fl_longest_match,
828 range >= 0 ? &match_first : NULL);
829 if (match_last != -1)
831 if (BE (match_last == -2, 0))
833 err = REG_ESPACE;
834 goto free_return;
836 else
838 mctx.match_last = match_last;
839 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
841 re_dfastate_t *pstate = mctx.state_log[match_last];
842 mctx.last_node = check_halt_state_context (&mctx, pstate,
843 match_last);
845 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
846 || dfa->nbackref)
848 err = prune_impossible_nodes (&mctx);
849 if (err == REG_NOERROR)
850 break;
851 if (BE (err != REG_NOMATCH, 0))
852 goto free_return;
853 match_last = -1;
855 else
856 break; /* We found a match. */
860 match_ctx_clean (&mctx);
863 #ifdef DEBUG
864 assert (match_last != -1);
865 assert (err == REG_NOERROR);
866 #endif
868 /* Set pmatch[] if we need. */
869 if (nmatch > 0)
871 int reg_idx;
873 /* Initialize registers. */
874 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
875 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
877 /* Set the points where matching start/end. */
878 pmatch[0].rm_so = 0;
879 pmatch[0].rm_eo = mctx.match_last;
881 if (!preg->no_sub && nmatch > 1)
883 err = set_regs (preg, &mctx, nmatch, pmatch,
884 dfa->has_plural_match && dfa->nbackref > 0);
885 if (BE (err != REG_NOERROR, 0))
886 goto free_return;
889 /* At last, add the offset to the each registers, since we slided
890 the buffers so that we could assume that the matching starts
891 from 0. */
892 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
893 if (pmatch[reg_idx].rm_so != -1)
895 #ifdef RE_ENABLE_I18N
896 if (BE (mctx.input.offsets_needed != 0, 0))
898 pmatch[reg_idx].rm_so =
899 (pmatch[reg_idx].rm_so == mctx.input.valid_len
900 ? mctx.input.valid_raw_len
901 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
902 pmatch[reg_idx].rm_eo =
903 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
904 ? mctx.input.valid_raw_len
905 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
907 #else
908 assert (mctx.input.offsets_needed == 0);
909 #endif
910 pmatch[reg_idx].rm_so += match_first;
911 pmatch[reg_idx].rm_eo += match_first;
913 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
915 pmatch[nmatch + reg_idx].rm_so = -1;
916 pmatch[nmatch + reg_idx].rm_eo = -1;
919 if (dfa->subexp_map)
920 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
921 if (dfa->subexp_map[reg_idx] != reg_idx)
923 pmatch[reg_idx + 1].rm_so
924 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
925 pmatch[reg_idx + 1].rm_eo
926 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
930 free_return:
931 re_free (mctx.state_log);
932 if (dfa->nbackref)
933 match_ctx_free (&mctx);
934 re_string_destruct (&mctx.input);
935 return err;
938 static reg_errcode_t
939 prune_impossible_nodes (mctx)
940 re_match_context_t *mctx;
942 const re_dfa_t *const dfa = mctx->dfa;
943 int halt_node, match_last;
944 reg_errcode_t ret;
945 re_dfastate_t **sifted_states;
946 re_dfastate_t **lim_states = NULL;
947 re_sift_context_t sctx;
948 #ifdef DEBUG
949 assert (mctx->state_log != NULL);
950 #endif
951 match_last = mctx->match_last;
952 halt_node = mctx->last_node;
953 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
954 if (BE (sifted_states == NULL, 0))
956 ret = REG_ESPACE;
957 goto free_return;
959 if (dfa->nbackref)
961 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
962 if (BE (lim_states == NULL, 0))
964 ret = REG_ESPACE;
965 goto free_return;
967 while (1)
969 memset (lim_states, '\0',
970 sizeof (re_dfastate_t *) * (match_last + 1));
971 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
972 match_last);
973 ret = sift_states_backward (mctx, &sctx);
974 re_node_set_free (&sctx.limits);
975 if (BE (ret != REG_NOERROR, 0))
976 goto free_return;
977 if (sifted_states[0] != NULL || lim_states[0] != NULL)
978 break;
981 --match_last;
982 if (match_last < 0)
984 ret = REG_NOMATCH;
985 goto free_return;
987 } while (mctx->state_log[match_last] == NULL
988 || !mctx->state_log[match_last]->halt);
989 halt_node = check_halt_state_context (mctx,
990 mctx->state_log[match_last],
991 match_last);
993 ret = merge_state_array (dfa, sifted_states, lim_states,
994 match_last + 1);
995 re_free (lim_states);
996 lim_states = NULL;
997 if (BE (ret != REG_NOERROR, 0))
998 goto free_return;
1000 else
1002 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1003 ret = sift_states_backward (mctx, &sctx);
1004 re_node_set_free (&sctx.limits);
1005 if (BE (ret != REG_NOERROR, 0))
1006 goto free_return;
1008 re_free (mctx->state_log);
1009 mctx->state_log = sifted_states;
1010 sifted_states = NULL;
1011 mctx->last_node = halt_node;
1012 mctx->match_last = match_last;
1013 ret = REG_NOERROR;
1014 free_return:
1015 re_free (sifted_states);
1016 re_free (lim_states);
1017 return ret;
1020 /* Acquire an initial state and return it.
1021 We must select appropriate initial state depending on the context,
1022 since initial states may have constraints like "\<", "^", etc.. */
1024 static inline re_dfastate_t *
1025 __attribute ((always_inline)) internal_function
1026 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1027 int idx)
1029 const re_dfa_t *const dfa = mctx->dfa;
1030 if (dfa->init_state->has_constraint)
1032 unsigned int context;
1033 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1034 if (IS_WORD_CONTEXT (context))
1035 return dfa->init_state_word;
1036 else if (IS_ORDINARY_CONTEXT (context))
1037 return dfa->init_state;
1038 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1039 return dfa->init_state_begbuf;
1040 else if (IS_NEWLINE_CONTEXT (context))
1041 return dfa->init_state_nl;
1042 else if (IS_BEGBUF_CONTEXT (context))
1044 /* It is relatively rare case, then calculate on demand. */
1045 return re_acquire_state_context (err, dfa,
1046 dfa->init_state->entrance_nodes,
1047 context);
1049 else
1050 /* Must not happen? */
1051 return dfa->init_state;
1053 else
1054 return dfa->init_state;
1057 /* Check whether the regular expression match input string INPUT or not,
1058 and return the index where the matching end, return -1 if not match,
1059 or return -2 in case of an error.
1060 FL_LONGEST_MATCH means we want the POSIX longest matching.
1061 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1062 next place where we may want to try matching.
1063 Note that the matcher assume that the maching starts from the current
1064 index of the buffer. */
1066 static int
1067 internal_function
1068 check_matching (re_match_context_t *mctx, int fl_longest_match,
1069 int *p_match_first)
1071 const re_dfa_t *const dfa = mctx->dfa;
1072 reg_errcode_t err;
1073 int match = 0;
1074 int match_last = -1;
1075 int cur_str_idx = re_string_cur_idx (&mctx->input);
1076 re_dfastate_t *cur_state;
1077 int at_init_state = p_match_first != NULL;
1078 int next_start_idx = cur_str_idx;
1080 err = REG_NOERROR;
1081 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1082 /* An initial state must not be NULL (invalid). */
1083 if (BE (cur_state == NULL, 0))
1085 assert (err == REG_ESPACE);
1086 return -2;
1089 if (mctx->state_log != NULL)
1091 mctx->state_log[cur_str_idx] = cur_state;
1093 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1094 later. E.g. Processing back references. */
1095 if (BE (dfa->nbackref, 0))
1097 at_init_state = 0;
1098 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1099 if (BE (err != REG_NOERROR, 0))
1100 return err;
1102 if (cur_state->has_backref)
1104 err = transit_state_bkref (mctx, &cur_state->nodes);
1105 if (BE (err != REG_NOERROR, 0))
1106 return err;
1111 /* If the RE accepts NULL string. */
1112 if (BE (cur_state->halt, 0))
1114 if (!cur_state->has_constraint
1115 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1117 if (!fl_longest_match)
1118 return cur_str_idx;
1119 else
1121 match_last = cur_str_idx;
1122 match = 1;
1127 while (!re_string_eoi (&mctx->input))
1129 re_dfastate_t *old_state = cur_state;
1130 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1132 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1133 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1134 && mctx->input.valid_len < mctx->input.len))
1136 err = extend_buffers (mctx);
1137 if (BE (err != REG_NOERROR, 0))
1139 assert (err == REG_ESPACE);
1140 return -2;
1144 cur_state = transit_state (&err, mctx, cur_state);
1145 if (mctx->state_log != NULL)
1146 cur_state = merge_state_with_log (&err, mctx, cur_state);
1148 if (cur_state == NULL)
1150 /* Reached the invalid state or an error. Try to recover a valid
1151 state using the state log, if available and if we have not
1152 already found a valid (even if not the longest) match. */
1153 if (BE (err != REG_NOERROR, 0))
1154 return -2;
1156 if (mctx->state_log == NULL
1157 || (match && !fl_longest_match)
1158 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1159 break;
1162 if (BE (at_init_state, 0))
1164 if (old_state == cur_state)
1165 next_start_idx = next_char_idx;
1166 else
1167 at_init_state = 0;
1170 if (cur_state->halt)
1172 /* Reached a halt state.
1173 Check the halt state can satisfy the current context. */
1174 if (!cur_state->has_constraint
1175 || check_halt_state_context (mctx, cur_state,
1176 re_string_cur_idx (&mctx->input)))
1178 /* We found an appropriate halt state. */
1179 match_last = re_string_cur_idx (&mctx->input);
1180 match = 1;
1182 /* We found a match, do not modify match_first below. */
1183 p_match_first = NULL;
1184 if (!fl_longest_match)
1185 break;
1190 if (p_match_first)
1191 *p_match_first += next_start_idx;
1193 return match_last;
1196 /* Check NODE match the current context. */
1198 static int
1199 internal_function
1200 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1202 re_token_type_t type = dfa->nodes[node].type;
1203 unsigned int constraint = dfa->nodes[node].constraint;
1204 if (type != END_OF_RE)
1205 return 0;
1206 if (!constraint)
1207 return 1;
1208 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1209 return 0;
1210 return 1;
1213 /* Check the halt state STATE match the current context.
1214 Return 0 if not match, if the node, STATE has, is a halt node and
1215 match the context, return the node. */
1217 static int
1218 internal_function
1219 check_halt_state_context (const re_match_context_t *mctx,
1220 const re_dfastate_t *state, int idx)
1222 int i;
1223 unsigned int context;
1224 #ifdef DEBUG
1225 assert (state->halt);
1226 #endif
1227 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1228 for (i = 0; i < state->nodes.nelem; ++i)
1229 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1230 return state->nodes.elems[i];
1231 return 0;
1234 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1235 corresponding to the DFA).
1236 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1237 of errors. */
1239 static int
1240 internal_function
1241 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1242 int *pidx, int node, re_node_set *eps_via_nodes,
1243 struct re_fail_stack_t *fs)
1245 const re_dfa_t *const dfa = mctx->dfa;
1246 int i, err;
1247 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1249 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1250 re_node_set *edests = &dfa->edests[node];
1251 int dest_node;
1252 err = re_node_set_insert (eps_via_nodes, node);
1253 if (BE (err < 0, 0))
1254 return -2;
1255 /* Pick up a valid destination, or return -1 if none is found. */
1256 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1258 int candidate = edests->elems[i];
1259 if (!re_node_set_contains (cur_nodes, candidate))
1260 continue;
1261 if (dest_node == -1)
1262 dest_node = candidate;
1264 else
1266 /* In order to avoid infinite loop like "(a*)*", return the second
1267 epsilon-transition if the first was already considered. */
1268 if (re_node_set_contains (eps_via_nodes, dest_node))
1269 return candidate;
1271 /* Otherwise, push the second epsilon-transition on the fail stack. */
1272 else if (fs != NULL
1273 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274 eps_via_nodes))
1275 return -2;
1277 /* We know we are going to exit. */
1278 break;
1281 return dest_node;
1283 else
1285 int naccepted = 0;
1286 re_token_type_t type = dfa->nodes[node].type;
1288 #ifdef RE_ENABLE_I18N
1289 if (dfa->nodes[node].accept_mb)
1290 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1291 else
1292 #endif /* RE_ENABLE_I18N */
1293 if (type == OP_BACK_REF)
1295 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1296 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1297 if (fs != NULL)
1299 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300 return -1;
1301 else if (naccepted)
1303 char *buf = (char *) re_string_get_buffer (&mctx->input);
1304 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1305 naccepted) != 0)
1306 return -1;
1310 if (naccepted == 0)
1312 int dest_node;
1313 err = re_node_set_insert (eps_via_nodes, node);
1314 if (BE (err < 0, 0))
1315 return -2;
1316 dest_node = dfa->edests[node].elems[0];
1317 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 dest_node))
1319 return dest_node;
1323 if (naccepted != 0
1324 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1326 int dest_node = dfa->nexts[node];
1327 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1328 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1329 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330 dest_node)))
1331 return -1;
1332 re_node_set_empty (eps_via_nodes);
1333 return dest_node;
1336 return -1;
1339 static reg_errcode_t
1340 internal_function
1341 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1342 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1344 reg_errcode_t err;
1345 int num = fs->num++;
1346 if (fs->num == fs->alloc)
1348 struct re_fail_stack_ent_t *new_array;
1349 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1350 * fs->alloc * 2));
1351 if (new_array == NULL)
1352 return REG_ESPACE;
1353 fs->alloc *= 2;
1354 fs->stack = new_array;
1356 fs->stack[num].idx = str_idx;
1357 fs->stack[num].node = dest_node;
1358 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1359 if (fs->stack[num].regs == NULL)
1360 return REG_ESPACE;
1361 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1362 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1363 return err;
1366 static int
1367 internal_function
1368 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1369 regmatch_t *regs, re_node_set *eps_via_nodes)
1371 int num = --fs->num;
1372 assert (num >= 0);
1373 *pidx = fs->stack[num].idx;
1374 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1375 re_node_set_free (eps_via_nodes);
1376 re_free (fs->stack[num].regs);
1377 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1378 return fs->stack[num].node;
1381 /* Set the positions where the subexpressions are starts/ends to registers
1382 PMATCH.
1383 Note: We assume that pmatch[0] is already set, and
1384 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1386 static reg_errcode_t
1387 internal_function
1388 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1389 regmatch_t *pmatch, int fl_backtrack)
1391 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1392 int idx, cur_node;
1393 re_node_set eps_via_nodes;
1394 struct re_fail_stack_t *fs;
1395 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1396 regmatch_t *prev_idx_match;
1397 int prev_idx_match_malloced = 0;
1399 #ifdef DEBUG
1400 assert (nmatch > 1);
1401 assert (mctx->state_log != NULL);
1402 #endif
1403 if (fl_backtrack)
1405 fs = &fs_body;
1406 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1407 if (fs->stack == NULL)
1408 return REG_ESPACE;
1410 else
1411 fs = NULL;
1413 cur_node = dfa->init_node;
1414 re_node_set_init_empty (&eps_via_nodes);
1416 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1417 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1418 else
1420 prev_idx_match = re_malloc (regmatch_t, nmatch);
1421 if (prev_idx_match == NULL)
1423 free_fail_stack_return (fs);
1424 return REG_ESPACE;
1426 prev_idx_match_malloced = 1;
1428 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1430 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1432 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1434 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1436 int reg_idx;
1437 if (fs)
1439 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1440 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1441 break;
1442 if (reg_idx == nmatch)
1444 re_node_set_free (&eps_via_nodes);
1445 if (prev_idx_match_malloced)
1446 re_free (prev_idx_match);
1447 return free_fail_stack_return (fs);
1449 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1450 &eps_via_nodes);
1452 else
1454 re_node_set_free (&eps_via_nodes);
1455 if (prev_idx_match_malloced)
1456 re_free (prev_idx_match);
1457 return REG_NOERROR;
1461 /* Proceed to next node. */
1462 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1463 &eps_via_nodes, fs);
1465 if (BE (cur_node < 0, 0))
1467 if (BE (cur_node == -2, 0))
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 free_fail_stack_return (fs);
1473 return REG_ESPACE;
1475 if (fs)
1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477 &eps_via_nodes);
1478 else
1480 re_node_set_free (&eps_via_nodes);
1481 if (prev_idx_match_malloced)
1482 re_free (prev_idx_match);
1483 return REG_NOMATCH;
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 return free_fail_stack_return (fs);
1493 static reg_errcode_t
1494 internal_function
1495 free_fail_stack_return (struct re_fail_stack_t *fs)
1497 if (fs)
1499 int fs_idx;
1500 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1502 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1503 re_free (fs->stack[fs_idx].regs);
1505 re_free (fs->stack);
1507 return REG_NOERROR;
1510 static void
1511 internal_function
1512 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1513 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1515 int type = dfa->nodes[cur_node].type;
1516 if (type == OP_OPEN_SUBEXP)
1518 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1520 /* We are at the first node of this sub expression. */
1521 if (reg_num < nmatch)
1523 pmatch[reg_num].rm_so = cur_idx;
1524 pmatch[reg_num].rm_eo = -1;
1527 else if (type == OP_CLOSE_SUBEXP)
1529 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1530 if (reg_num < nmatch)
1532 /* We are at the last node of this sub expression. */
1533 if (pmatch[reg_num].rm_so < cur_idx)
1535 pmatch[reg_num].rm_eo = cur_idx;
1536 /* This is a non-empty match or we are not inside an optional
1537 subexpression. Accept this right away. */
1538 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1540 else
1542 if (dfa->nodes[cur_node].opt_subexp
1543 && prev_idx_match[reg_num].rm_so != -1)
1544 /* We transited through an empty match for an optional
1545 subexpression, like (a?)*, and this is not the subexp's
1546 first match. Copy back the old content of the registers
1547 so that matches of an inner subexpression are undone as
1548 well, like in ((a?))*. */
1549 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1550 else
1551 /* We completed a subexpression, but it may be part of
1552 an optional one, so do not update PREV_IDX_MATCH. */
1553 pmatch[reg_num].rm_eo = cur_idx;
1559 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1560 and sift the nodes in each states according to the following rules.
1561 Updated state_log will be wrote to STATE_LOG.
1563 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1564 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1565 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1566 the LAST_NODE, we throw away the node `a'.
1567 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1568 string `s' and transit to `b':
1569 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1570 away the node `a'.
1571 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1572 thrown away, we throw away the node `a'.
1573 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1575 node `a'.
1576 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1577 we throw away the node `a'. */
1579 #define STATE_NODE_CONTAINS(state,node) \
1580 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1582 static reg_errcode_t
1583 internal_function
1584 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1586 reg_errcode_t err;
1587 int null_cnt = 0;
1588 int str_idx = sctx->last_str_idx;
1589 re_node_set cur_dest;
1591 #ifdef DEBUG
1592 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1593 #endif
1595 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1596 transit to the last_node and the last_node itself. */
1597 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1598 if (BE (err != REG_NOERROR, 0))
1599 return err;
1600 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1601 if (BE (err != REG_NOERROR, 0))
1602 goto free_return;
1604 /* Then check each states in the state_log. */
1605 while (str_idx > 0)
1607 /* Update counters. */
1608 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1609 if (null_cnt > mctx->max_mb_elem_len)
1611 memset (sctx->sifted_states, '\0',
1612 sizeof (re_dfastate_t *) * str_idx);
1613 re_node_set_free (&cur_dest);
1614 return REG_NOERROR;
1616 re_node_set_empty (&cur_dest);
1617 --str_idx;
1619 if (mctx->state_log[str_idx])
1621 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1622 if (BE (err != REG_NOERROR, 0))
1623 goto free_return;
1626 /* Add all the nodes which satisfy the following conditions:
1627 - It can epsilon transit to a node in CUR_DEST.
1628 - It is in CUR_SRC.
1629 And update state_log. */
1630 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1631 if (BE (err != REG_NOERROR, 0))
1632 goto free_return;
1634 err = REG_NOERROR;
1635 free_return:
1636 re_node_set_free (&cur_dest);
1637 return err;
1640 static reg_errcode_t
1641 internal_function
1642 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1643 int str_idx, re_node_set *cur_dest)
1645 const re_dfa_t *const dfa = mctx->dfa;
1646 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1647 int i;
1649 /* Then build the next sifted state.
1650 We build the next sifted state on `cur_dest', and update
1651 `sifted_states[str_idx]' with `cur_dest'.
1652 Note:
1653 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654 `cur_src' points the node_set of the old `state_log[str_idx]'
1655 (with the epsilon nodes pre-filtered out). */
1656 for (i = 0; i < cur_src->nelem; i++)
1658 int prev_node = cur_src->elems[i];
1659 int naccepted = 0;
1660 int ret;
1662 #ifdef DEBUG
1663 re_token_type_t type = dfa->nodes[prev_node].type;
1664 assert (!IS_EPSILON_NODE (type));
1665 #endif
1666 #ifdef RE_ENABLE_I18N
1667 /* If the node may accept `multi byte'. */
1668 if (dfa->nodes[prev_node].accept_mb)
1669 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1670 str_idx, sctx->last_str_idx);
1671 #endif /* RE_ENABLE_I18N */
1673 /* We don't check backreferences here.
1674 See update_cur_sifted_state(). */
1675 if (!naccepted
1676 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1677 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1678 dfa->nexts[prev_node]))
1679 naccepted = 1;
1681 if (naccepted == 0)
1682 continue;
1684 if (sctx->limits.nelem)
1686 int to_idx = str_idx + naccepted;
1687 if (check_dst_limits (mctx, &sctx->limits,
1688 dfa->nexts[prev_node], to_idx,
1689 prev_node, str_idx))
1690 continue;
1692 ret = re_node_set_insert (cur_dest, prev_node);
1693 if (BE (ret == -1, 0))
1694 return REG_ESPACE;
1697 return REG_NOERROR;
1700 /* Helper functions. */
1702 static reg_errcode_t
1703 internal_function
1704 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1706 int top = mctx->state_log_top;
1708 if (next_state_log_idx >= mctx->input.bufs_len
1709 || (next_state_log_idx >= mctx->input.valid_len
1710 && mctx->input.valid_len < mctx->input.len))
1712 reg_errcode_t err;
1713 err = extend_buffers (mctx);
1714 if (BE (err != REG_NOERROR, 0))
1715 return err;
1718 if (top < next_state_log_idx)
1720 memset (mctx->state_log + top + 1, '\0',
1721 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1722 mctx->state_log_top = next_state_log_idx;
1724 return REG_NOERROR;
1727 static reg_errcode_t
1728 internal_function
1729 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1730 re_dfastate_t **src, int num)
1732 int st_idx;
1733 reg_errcode_t err;
1734 for (st_idx = 0; st_idx < num; ++st_idx)
1736 if (dst[st_idx] == NULL)
1737 dst[st_idx] = src[st_idx];
1738 else if (src[st_idx] != NULL)
1740 re_node_set merged_set;
1741 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1742 &src[st_idx]->nodes);
1743 if (BE (err != REG_NOERROR, 0))
1744 return err;
1745 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1746 re_node_set_free (&merged_set);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1751 return REG_NOERROR;
1754 static reg_errcode_t
1755 internal_function
1756 update_cur_sifted_state (const re_match_context_t *mctx,
1757 re_sift_context_t *sctx, int str_idx,
1758 re_node_set *dest_nodes)
1760 const re_dfa_t *const dfa = mctx->dfa;
1761 reg_errcode_t err = REG_NOERROR;
1762 const re_node_set *candidates;
1763 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1764 : &mctx->state_log[str_idx]->nodes);
1766 if (dest_nodes->nelem == 0)
1767 sctx->sifted_states[str_idx] = NULL;
1768 else
1770 if (candidates)
1772 /* At first, add the nodes which can epsilon transit to a node in
1773 DEST_NODE. */
1774 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1775 if (BE (err != REG_NOERROR, 0))
1776 return err;
1778 /* Then, check the limitations in the current sift_context. */
1779 if (sctx->limits.nelem)
1781 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1782 mctx->bkref_ents, str_idx);
1783 if (BE (err != REG_NOERROR, 0))
1784 return err;
1788 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1789 if (BE (err != REG_NOERROR, 0))
1790 return err;
1793 if (candidates && mctx->state_log[str_idx]->has_backref)
1795 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1796 if (BE (err != REG_NOERROR, 0))
1797 return err;
1799 return REG_NOERROR;
1802 static reg_errcode_t
1803 internal_function
1804 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1805 const re_node_set *candidates)
1807 reg_errcode_t err = REG_NOERROR;
1808 int i;
1810 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1811 if (BE (err != REG_NOERROR, 0))
1812 return err;
1814 if (!state->inveclosure.alloc)
1816 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1817 if (BE (err != REG_NOERROR, 0))
1818 return REG_ESPACE;
1819 for (i = 0; i < dest_nodes->nelem; i++)
1820 re_node_set_merge (&state->inveclosure,
1821 dfa->inveclosures + dest_nodes->elems[i]);
1823 return re_node_set_add_intersect (dest_nodes, candidates,
1824 &state->inveclosure);
1827 static reg_errcode_t
1828 internal_function
1829 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1830 const re_node_set *candidates)
1832 int ecl_idx;
1833 reg_errcode_t err;
1834 re_node_set *inv_eclosure = dfa->inveclosures + node;
1835 re_node_set except_nodes;
1836 re_node_set_init_empty (&except_nodes);
1837 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1839 int cur_node = inv_eclosure->elems[ecl_idx];
1840 if (cur_node == node)
1841 continue;
1842 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1844 int edst1 = dfa->edests[cur_node].elems[0];
1845 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1846 ? dfa->edests[cur_node].elems[1] : -1);
1847 if ((!re_node_set_contains (inv_eclosure, edst1)
1848 && re_node_set_contains (dest_nodes, edst1))
1849 || (edst2 > 0
1850 && !re_node_set_contains (inv_eclosure, edst2)
1851 && re_node_set_contains (dest_nodes, edst2)))
1853 err = re_node_set_add_intersect (&except_nodes, candidates,
1854 dfa->inveclosures + cur_node);
1855 if (BE (err != REG_NOERROR, 0))
1857 re_node_set_free (&except_nodes);
1858 return err;
1863 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865 int cur_node = inv_eclosure->elems[ecl_idx];
1866 if (!re_node_set_contains (&except_nodes, cur_node))
1868 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1869 re_node_set_remove_at (dest_nodes, idx);
1872 re_node_set_free (&except_nodes);
1873 return REG_NOERROR;
1876 static int
1877 internal_function
1878 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1879 int dst_node, int dst_idx, int src_node, int src_idx)
1881 const re_dfa_t *const dfa = mctx->dfa;
1882 int lim_idx, src_pos, dst_pos;
1884 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1885 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1886 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1888 int subexp_idx;
1889 struct re_backref_cache_entry *ent;
1890 ent = mctx->bkref_ents + limits->elems[lim_idx];
1891 subexp_idx = dfa->nodes[ent->node].opr.idx;
1893 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1894 subexp_idx, dst_node, dst_idx,
1895 dst_bkref_idx);
1896 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1897 subexp_idx, src_node, src_idx,
1898 src_bkref_idx);
1900 /* In case of:
1901 <src> <dst> ( <subexp> )
1902 ( <subexp> ) <src> <dst>
1903 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1904 if (src_pos == dst_pos)
1905 continue; /* This is unrelated limitation. */
1906 else
1907 return 1;
1909 return 0;
1912 static int
1913 internal_function
1914 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1915 int subexp_idx, int from_node, int bkref_idx)
1917 const re_dfa_t *const dfa = mctx->dfa;
1918 const re_node_set *eclosures = dfa->eclosures + from_node;
1919 int node_idx;
1921 /* Else, we are on the boundary: examine the nodes on the epsilon
1922 closure. */
1923 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1925 int node = eclosures->elems[node_idx];
1926 switch (dfa->nodes[node].type)
1928 case OP_BACK_REF:
1929 if (bkref_idx != -1)
1931 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1934 int dst, cpos;
1936 if (ent->node != node)
1937 continue;
1939 if (subexp_idx < BITSET_WORD_BITS
1940 && !(ent->eps_reachable_subexps_map
1941 & ((bitset_word_t) 1 << subexp_idx)))
1942 continue;
1944 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945 OP_CLOSE_SUBEXP cases below. But, if the
1946 destination node is the same node as the source
1947 node, don't recurse because it would cause an
1948 infinite loop: a regex that exhibits this behavior
1949 is ()\1*\1* */
1950 dst = dfa->edests[node].elems[0];
1951 if (dst == from_node)
1953 if (boundaries & 1)
1954 return -1;
1955 else /* if (boundaries & 2) */
1956 return 0;
1959 cpos =
1960 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1961 dst, bkref_idx);
1962 if (cpos == -1 /* && (boundaries & 1) */)
1963 return -1;
1964 if (cpos == 0 && (boundaries & 2))
1965 return 0;
1967 if (subexp_idx < BITSET_WORD_BITS)
1968 ent->eps_reachable_subexps_map
1969 &= ~((bitset_word_t) 1 << subexp_idx);
1971 while (ent++->more);
1973 break;
1975 case OP_OPEN_SUBEXP:
1976 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1977 return -1;
1978 break;
1980 case OP_CLOSE_SUBEXP:
1981 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1982 return 0;
1983 break;
1985 default:
1986 break;
1990 return (boundaries & 2) ? 1 : 0;
1993 static int
1994 internal_function
1995 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1996 int subexp_idx, int from_node, int str_idx,
1997 int bkref_idx)
1999 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2000 int boundaries;
2002 /* If we are outside the range of the subexpression, return -1 or 1. */
2003 if (str_idx < lim->subexp_from)
2004 return -1;
2006 if (lim->subexp_to < str_idx)
2007 return 1;
2009 /* If we are within the subexpression, return 0. */
2010 boundaries = (str_idx == lim->subexp_from);
2011 boundaries |= (str_idx == lim->subexp_to) << 1;
2012 if (boundaries == 0)
2013 return 0;
2015 /* Else, examine epsilon closure. */
2016 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2017 from_node, bkref_idx);
2020 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2021 which are against limitations from DEST_NODES. */
2023 static reg_errcode_t
2024 internal_function
2025 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2026 const re_node_set *candidates, re_node_set *limits,
2027 struct re_backref_cache_entry *bkref_ents, int str_idx)
2029 reg_errcode_t err;
2030 int node_idx, lim_idx;
2032 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2034 int subexp_idx;
2035 struct re_backref_cache_entry *ent;
2036 ent = bkref_ents + limits->elems[lim_idx];
2038 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2039 continue; /* This is unrelated limitation. */
2041 subexp_idx = dfa->nodes[ent->node].opr.idx;
2042 if (ent->subexp_to == str_idx)
2044 int ops_node = -1;
2045 int cls_node = -1;
2046 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2048 int node = dest_nodes->elems[node_idx];
2049 re_token_type_t type = dfa->nodes[node].type;
2050 if (type == OP_OPEN_SUBEXP
2051 && subexp_idx == dfa->nodes[node].opr.idx)
2052 ops_node = node;
2053 else if (type == OP_CLOSE_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2055 cls_node = node;
2058 /* Check the limitation of the open subexpression. */
2059 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2060 if (ops_node >= 0)
2062 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2063 candidates);
2064 if (BE (err != REG_NOERROR, 0))
2065 return err;
2068 /* Check the limitation of the close subexpression. */
2069 if (cls_node >= 0)
2070 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2072 int node = dest_nodes->elems[node_idx];
2073 if (!re_node_set_contains (dfa->inveclosures + node,
2074 cls_node)
2075 && !re_node_set_contains (dfa->eclosures + node,
2076 cls_node))
2078 /* It is against this limitation.
2079 Remove it form the current sifted state. */
2080 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2081 candidates);
2082 if (BE (err != REG_NOERROR, 0))
2083 return err;
2084 --node_idx;
2088 else /* (ent->subexp_to != str_idx) */
2090 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2092 int node = dest_nodes->elems[node_idx];
2093 re_token_type_t type = dfa->nodes[node].type;
2094 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2096 if (subexp_idx != dfa->nodes[node].opr.idx)
2097 continue;
2098 /* It is against this limitation.
2099 Remove it form the current sifted state. */
2100 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2101 candidates);
2102 if (BE (err != REG_NOERROR, 0))
2103 return err;
2108 return REG_NOERROR;
2111 static reg_errcode_t
2112 internal_function
2113 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2114 int str_idx, const re_node_set *candidates)
2116 const re_dfa_t *const dfa = mctx->dfa;
2117 reg_errcode_t err;
2118 int node_idx, node;
2119 re_sift_context_t local_sctx;
2120 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2122 if (first_idx == -1)
2123 return REG_NOERROR;
2125 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2127 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2129 int enabled_idx;
2130 re_token_type_t type;
2131 struct re_backref_cache_entry *entry;
2132 node = candidates->elems[node_idx];
2133 type = dfa->nodes[node].type;
2134 /* Avoid infinite loop for the REs like "()\1+". */
2135 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2136 continue;
2137 if (type != OP_BACK_REF)
2138 continue;
2140 entry = mctx->bkref_ents + first_idx;
2141 enabled_idx = first_idx;
2144 int subexp_len;
2145 int to_idx;
2146 int dst_node;
2147 int ret;
2148 re_dfastate_t *cur_state;
2150 if (entry->node != node)
2151 continue;
2152 subexp_len = entry->subexp_to - entry->subexp_from;
2153 to_idx = str_idx + subexp_len;
2154 dst_node = (subexp_len ? dfa->nexts[node]
2155 : dfa->edests[node].elems[0]);
2157 if (to_idx > sctx->last_str_idx
2158 || sctx->sifted_states[to_idx] == NULL
2159 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2160 || check_dst_limits (mctx, &sctx->limits, node,
2161 str_idx, dst_node, to_idx))
2162 continue;
2164 if (local_sctx.sifted_states == NULL)
2166 local_sctx = *sctx;
2167 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2168 if (BE (err != REG_NOERROR, 0))
2169 goto free_return;
2171 local_sctx.last_node = node;
2172 local_sctx.last_str_idx = str_idx;
2173 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2174 if (BE (ret < 0, 0))
2176 err = REG_ESPACE;
2177 goto free_return;
2179 cur_state = local_sctx.sifted_states[str_idx];
2180 err = sift_states_backward (mctx, &local_sctx);
2181 if (BE (err != REG_NOERROR, 0))
2182 goto free_return;
2183 if (sctx->limited_states != NULL)
2185 err = merge_state_array (dfa, sctx->limited_states,
2186 local_sctx.sifted_states,
2187 str_idx + 1);
2188 if (BE (err != REG_NOERROR, 0))
2189 goto free_return;
2191 local_sctx.sifted_states[str_idx] = cur_state;
2192 re_node_set_remove (&local_sctx.limits, enabled_idx);
2194 /* mctx->bkref_ents may have changed, reload the pointer. */
2195 entry = mctx->bkref_ents + enabled_idx;
2197 while (enabled_idx++, entry++->more);
2199 err = REG_NOERROR;
2200 free_return:
2201 if (local_sctx.sifted_states != NULL)
2203 re_node_set_free (&local_sctx.limits);
2206 return err;
2210 #ifdef RE_ENABLE_I18N
2211 static int
2212 internal_function
2213 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2214 int node_idx, int str_idx, int max_str_idx)
2216 const re_dfa_t *const dfa = mctx->dfa;
2217 int naccepted;
2218 /* Check the node can accept `multi byte'. */
2219 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2220 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2221 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2222 dfa->nexts[node_idx]))
2223 /* The node can't accept the `multi byte', or the
2224 destination was already thrown away, then the node
2225 could't accept the current input `multi byte'. */
2226 naccepted = 0;
2227 /* Otherwise, it is sure that the node could accept
2228 `naccepted' bytes input. */
2229 return naccepted;
2231 #endif /* RE_ENABLE_I18N */
2234 /* Functions for state transition. */
2236 /* Return the next state to which the current state STATE will transit by
2237 accepting the current input byte, and update STATE_LOG if necessary.
2238 If STATE can accept a multibyte char/collating element/back reference
2239 update the destination of STATE_LOG. */
2241 static re_dfastate_t *
2242 internal_function
2243 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2244 re_dfastate_t *state)
2246 re_dfastate_t **trtable;
2247 unsigned char ch;
2249 #ifdef RE_ENABLE_I18N
2250 /* If the current state can accept multibyte. */
2251 if (BE (state->accept_mb, 0))
2253 *err = transit_state_mb (mctx, state);
2254 if (BE (*err != REG_NOERROR, 0))
2255 return NULL;
2257 #endif /* RE_ENABLE_I18N */
2259 /* Then decide the next state with the single byte. */
2260 #if 0
2261 if (0)
2262 /* don't use transition table */
2263 return transit_state_sb (err, mctx, state);
2264 #endif
2266 /* Use transition table */
2267 ch = re_string_fetch_byte (&mctx->input);
2268 for (;;)
2270 trtable = state->trtable;
2271 if (BE (trtable != NULL, 1))
2272 return trtable[ch];
2274 trtable = state->word_trtable;
2275 if (BE (trtable != NULL, 1))
2277 unsigned int context;
2278 context
2279 = re_string_context_at (&mctx->input,
2280 re_string_cur_idx (&mctx->input) - 1,
2281 mctx->eflags);
2282 if (IS_WORD_CONTEXT (context))
2283 return trtable[ch + SBC_MAX];
2284 else
2285 return trtable[ch];
2288 if (!build_trtable (mctx->dfa, state))
2290 *err = REG_ESPACE;
2291 return NULL;
2294 /* Retry, we now have a transition table. */
2298 /* Update the state_log if we need */
2299 re_dfastate_t *
2300 internal_function
2301 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2302 re_dfastate_t *next_state)
2304 const re_dfa_t *const dfa = mctx->dfa;
2305 int cur_idx = re_string_cur_idx (&mctx->input);
2307 if (cur_idx > mctx->state_log_top)
2309 mctx->state_log[cur_idx] = next_state;
2310 mctx->state_log_top = cur_idx;
2312 else if (mctx->state_log[cur_idx] == 0)
2314 mctx->state_log[cur_idx] = next_state;
2316 else
2318 re_dfastate_t *pstate;
2319 unsigned int context;
2320 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2321 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2322 the destination of a multibyte char/collating element/
2323 back reference. Then the next state is the union set of
2324 these destinations and the results of the transition table. */
2325 pstate = mctx->state_log[cur_idx];
2326 log_nodes = pstate->entrance_nodes;
2327 if (next_state != NULL)
2329 table_nodes = next_state->entrance_nodes;
2330 *err = re_node_set_init_union (&next_nodes, table_nodes,
2331 log_nodes);
2332 if (BE (*err != REG_NOERROR, 0))
2333 return NULL;
2335 else
2336 next_nodes = *log_nodes;
2337 /* Note: We already add the nodes of the initial state,
2338 then we don't need to add them here. */
2340 context = re_string_context_at (&mctx->input,
2341 re_string_cur_idx (&mctx->input) - 1,
2342 mctx->eflags);
2343 next_state = mctx->state_log[cur_idx]
2344 = re_acquire_state_context (err, dfa, &next_nodes, context);
2345 /* We don't need to check errors here, since the return value of
2346 this function is next_state and ERR is already set. */
2348 if (table_nodes != NULL)
2349 re_node_set_free (&next_nodes);
2352 if (BE (dfa->nbackref, 0) && next_state != NULL)
2354 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2355 later. We must check them here, since the back references in the
2356 next state might use them. */
2357 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2358 cur_idx);
2359 if (BE (*err != REG_NOERROR, 0))
2360 return NULL;
2362 /* If the next state has back references. */
2363 if (next_state->has_backref)
2365 *err = transit_state_bkref (mctx, &next_state->nodes);
2366 if (BE (*err != REG_NOERROR, 0))
2367 return NULL;
2368 next_state = mctx->state_log[cur_idx];
2372 return next_state;
2375 /* Skip bytes in the input that correspond to part of a
2376 multi-byte match, then look in the log for a state
2377 from which to restart matching. */
2378 re_dfastate_t *
2379 internal_function
2380 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2382 re_dfastate_t *cur_state;
2385 int max = mctx->state_log_top;
2386 int cur_str_idx = re_string_cur_idx (&mctx->input);
2390 if (++cur_str_idx > max)
2391 return NULL;
2392 re_string_skip_bytes (&mctx->input, 1);
2394 while (mctx->state_log[cur_str_idx] == NULL);
2396 cur_state = merge_state_with_log (err, mctx, NULL);
2398 while (*err == REG_NOERROR && cur_state == NULL);
2399 return cur_state;
2402 /* Helper functions for transit_state. */
2404 /* From the node set CUR_NODES, pick up the nodes whose types are
2405 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2406 expression. And register them to use them later for evaluating the
2407 correspoding back references. */
2409 static reg_errcode_t
2410 internal_function
2411 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2412 int str_idx)
2414 const re_dfa_t *const dfa = mctx->dfa;
2415 int node_idx;
2416 reg_errcode_t err;
2418 /* TODO: This isn't efficient.
2419 Because there might be more than one nodes whose types are
2420 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2421 nodes.
2422 E.g. RE: (a){2} */
2423 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2425 int node = cur_nodes->elems[node_idx];
2426 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2427 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2428 && (dfa->used_bkref_map
2429 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2431 err = match_ctx_add_subtop (mctx, node, str_idx);
2432 if (BE (err != REG_NOERROR, 0))
2433 return err;
2436 return REG_NOERROR;
2439 #if 0
2440 /* Return the next state to which the current state STATE will transit by
2441 accepting the current input byte. */
2443 static re_dfastate_t *
2444 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2445 re_dfastate_t *state)
2447 const re_dfa_t *const dfa = mctx->dfa;
2448 re_node_set next_nodes;
2449 re_dfastate_t *next_state;
2450 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2451 unsigned int context;
2453 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2454 if (BE (*err != REG_NOERROR, 0))
2455 return NULL;
2456 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2458 int cur_node = state->nodes.elems[node_cnt];
2459 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2461 *err = re_node_set_merge (&next_nodes,
2462 dfa->eclosures + dfa->nexts[cur_node]);
2463 if (BE (*err != REG_NOERROR, 0))
2465 re_node_set_free (&next_nodes);
2466 return NULL;
2470 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2471 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2472 /* We don't need to check errors here, since the return value of
2473 this function is next_state and ERR is already set. */
2475 re_node_set_free (&next_nodes);
2476 re_string_skip_bytes (&mctx->input, 1);
2477 return next_state;
2479 #endif
2481 #ifdef RE_ENABLE_I18N
2482 static reg_errcode_t
2483 internal_function
2484 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2486 const re_dfa_t *const dfa = mctx->dfa;
2487 reg_errcode_t err;
2488 int i;
2490 for (i = 0; i < pstate->nodes.nelem; ++i)
2492 re_node_set dest_nodes, *new_nodes;
2493 int cur_node_idx = pstate->nodes.elems[i];
2494 int naccepted, dest_idx;
2495 unsigned int context;
2496 re_dfastate_t *dest_state;
2498 if (!dfa->nodes[cur_node_idx].accept_mb)
2499 continue;
2501 if (dfa->nodes[cur_node_idx].constraint)
2503 context = re_string_context_at (&mctx->input,
2504 re_string_cur_idx (&mctx->input),
2505 mctx->eflags);
2506 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2507 context))
2508 continue;
2511 /* How many bytes the node can accept? */
2512 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2513 re_string_cur_idx (&mctx->input));
2514 if (naccepted == 0)
2515 continue;
2517 /* The node can accepts `naccepted' bytes. */
2518 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2519 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2520 : mctx->max_mb_elem_len);
2521 err = clean_state_log_if_needed (mctx, dest_idx);
2522 if (BE (err != REG_NOERROR, 0))
2523 return err;
2524 #ifdef DEBUG
2525 assert (dfa->nexts[cur_node_idx] != -1);
2526 #endif
2527 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2529 dest_state = mctx->state_log[dest_idx];
2530 if (dest_state == NULL)
2531 dest_nodes = *new_nodes;
2532 else
2534 err = re_node_set_init_union (&dest_nodes,
2535 dest_state->entrance_nodes, new_nodes);
2536 if (BE (err != REG_NOERROR, 0))
2537 return err;
2539 context = re_string_context_at (&mctx->input, dest_idx - 1,
2540 mctx->eflags);
2541 mctx->state_log[dest_idx]
2542 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2543 if (dest_state != NULL)
2544 re_node_set_free (&dest_nodes);
2545 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2546 return err;
2548 return REG_NOERROR;
2550 #endif /* RE_ENABLE_I18N */
2552 static reg_errcode_t
2553 internal_function
2554 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2556 const re_dfa_t *const dfa = mctx->dfa;
2557 reg_errcode_t err;
2558 int i;
2559 int cur_str_idx = re_string_cur_idx (&mctx->input);
2561 for (i = 0; i < nodes->nelem; ++i)
2563 int dest_str_idx, prev_nelem, bkc_idx;
2564 int node_idx = nodes->elems[i];
2565 unsigned int context;
2566 const re_token_t *node = dfa->nodes + node_idx;
2567 re_node_set *new_dest_nodes;
2569 /* Check whether `node' is a backreference or not. */
2570 if (node->type != OP_BACK_REF)
2571 continue;
2573 if (node->constraint)
2575 context = re_string_context_at (&mctx->input, cur_str_idx,
2576 mctx->eflags);
2577 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2578 continue;
2581 /* `node' is a backreference.
2582 Check the substring which the substring matched. */
2583 bkc_idx = mctx->nbkref_ents;
2584 err = get_subexp (mctx, node_idx, cur_str_idx);
2585 if (BE (err != REG_NOERROR, 0))
2586 goto free_return;
2588 /* And add the epsilon closures (which is `new_dest_nodes') of
2589 the backreference to appropriate state_log. */
2590 #ifdef DEBUG
2591 assert (dfa->nexts[node_idx] != -1);
2592 #endif
2593 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2595 int subexp_len;
2596 re_dfastate_t *dest_state;
2597 struct re_backref_cache_entry *bkref_ent;
2598 bkref_ent = mctx->bkref_ents + bkc_idx;
2599 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2600 continue;
2601 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2602 new_dest_nodes = (subexp_len == 0
2603 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2604 : dfa->eclosures + dfa->nexts[node_idx]);
2605 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2606 - bkref_ent->subexp_from);
2607 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2608 mctx->eflags);
2609 dest_state = mctx->state_log[dest_str_idx];
2610 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2611 : mctx->state_log[cur_str_idx]->nodes.nelem);
2612 /* Add `new_dest_node' to state_log. */
2613 if (dest_state == NULL)
2615 mctx->state_log[dest_str_idx]
2616 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2617 context);
2618 if (BE (mctx->state_log[dest_str_idx] == NULL
2619 && err != REG_NOERROR, 0))
2620 goto free_return;
2622 else
2624 re_node_set dest_nodes;
2625 err = re_node_set_init_union (&dest_nodes,
2626 dest_state->entrance_nodes,
2627 new_dest_nodes);
2628 if (BE (err != REG_NOERROR, 0))
2630 re_node_set_free (&dest_nodes);
2631 goto free_return;
2633 mctx->state_log[dest_str_idx]
2634 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2635 re_node_set_free (&dest_nodes);
2636 if (BE (mctx->state_log[dest_str_idx] == NULL
2637 && err != REG_NOERROR, 0))
2638 goto free_return;
2640 /* We need to check recursively if the backreference can epsilon
2641 transit. */
2642 if (subexp_len == 0
2643 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2645 err = check_subexp_matching_top (mctx, new_dest_nodes,
2646 cur_str_idx);
2647 if (BE (err != REG_NOERROR, 0))
2648 goto free_return;
2649 err = transit_state_bkref (mctx, new_dest_nodes);
2650 if (BE (err != REG_NOERROR, 0))
2651 goto free_return;
2655 err = REG_NOERROR;
2656 free_return:
2657 return err;
2660 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2661 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2662 Note that we might collect inappropriate candidates here.
2663 However, the cost of checking them strictly here is too high, then we
2664 delay these checking for prune_impossible_nodes(). */
2666 static reg_errcode_t
2667 internal_function
2668 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2670 const re_dfa_t *const dfa = mctx->dfa;
2671 int subexp_num, sub_top_idx;
2672 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2673 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2674 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2675 if (cache_idx != -1)
2677 const struct re_backref_cache_entry *entry
2678 = mctx->bkref_ents + cache_idx;
2680 if (entry->node == bkref_node)
2681 return REG_NOERROR; /* We already checked it. */
2682 while (entry++->more);
2685 subexp_num = dfa->nodes[bkref_node].opr.idx;
2687 /* For each sub expression */
2688 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2690 reg_errcode_t err;
2691 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2692 re_sub_match_last_t *sub_last;
2693 int sub_last_idx, sl_str, bkref_str_off;
2695 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2696 continue; /* It isn't related. */
2698 sl_str = sub_top->str_idx;
2699 bkref_str_off = bkref_str_idx;
2700 /* At first, check the last node of sub expressions we already
2701 evaluated. */
2702 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2704 int sl_str_diff;
2705 sub_last = sub_top->lasts[sub_last_idx];
2706 sl_str_diff = sub_last->str_idx - sl_str;
2707 /* The matched string by the sub expression match with the substring
2708 at the back reference? */
2709 if (sl_str_diff > 0)
2711 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2713 /* Not enough chars for a successful match. */
2714 if (bkref_str_off + sl_str_diff > mctx->input.len)
2715 break;
2717 err = clean_state_log_if_needed (mctx,
2718 bkref_str_off
2719 + sl_str_diff);
2720 if (BE (err != REG_NOERROR, 0))
2721 return err;
2722 buf = (const char *) re_string_get_buffer (&mctx->input);
2724 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2725 /* We don't need to search this sub expression any more. */
2726 break;
2728 bkref_str_off += sl_str_diff;
2729 sl_str += sl_str_diff;
2730 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2731 bkref_str_idx);
2733 /* Reload buf, since the preceding call might have reallocated
2734 the buffer. */
2735 buf = (const char *) re_string_get_buffer (&mctx->input);
2737 if (err == REG_NOMATCH)
2738 continue;
2739 if (BE (err != REG_NOERROR, 0))
2740 return err;
2743 if (sub_last_idx < sub_top->nlasts)
2744 continue;
2745 if (sub_last_idx > 0)
2746 ++sl_str;
2747 /* Then, search for the other last nodes of the sub expression. */
2748 for (; sl_str <= bkref_str_idx; ++sl_str)
2750 int cls_node, sl_str_off;
2751 const re_node_set *nodes;
2752 sl_str_off = sl_str - sub_top->str_idx;
2753 /* The matched string by the sub expression match with the substring
2754 at the back reference? */
2755 if (sl_str_off > 0)
2757 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2759 /* If we are at the end of the input, we cannot match. */
2760 if (bkref_str_off >= mctx->input.len)
2761 break;
2763 err = extend_buffers (mctx);
2764 if (BE (err != REG_NOERROR, 0))
2765 return err;
2767 buf = (const char *) re_string_get_buffer (&mctx->input);
2769 if (buf [bkref_str_off++] != buf[sl_str - 1])
2770 break; /* We don't need to search this sub expression
2771 any more. */
2773 if (mctx->state_log[sl_str] == NULL)
2774 continue;
2775 /* Does this state have a ')' of the sub expression? */
2776 nodes = &mctx->state_log[sl_str]->nodes;
2777 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2778 OP_CLOSE_SUBEXP);
2779 if (cls_node == -1)
2780 continue; /* No. */
2781 if (sub_top->path == NULL)
2783 sub_top->path = calloc (sizeof (state_array_t),
2784 sl_str - sub_top->str_idx + 1);
2785 if (sub_top->path == NULL)
2786 return REG_ESPACE;
2788 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2789 in the current context? */
2790 err = check_arrival (mctx, sub_top->path, sub_top->node,
2791 sub_top->str_idx, cls_node, sl_str,
2792 OP_CLOSE_SUBEXP);
2793 if (err == REG_NOMATCH)
2794 continue;
2795 if (BE (err != REG_NOERROR, 0))
2796 return err;
2797 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2798 if (BE (sub_last == NULL, 0))
2799 return REG_ESPACE;
2800 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2801 bkref_str_idx);
2802 if (err == REG_NOMATCH)
2803 continue;
2806 return REG_NOERROR;
2809 /* Helper functions for get_subexp(). */
2811 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2812 If it can arrive, register the sub expression expressed with SUB_TOP
2813 and SUB_LAST. */
2815 static reg_errcode_t
2816 internal_function
2817 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2818 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2820 reg_errcode_t err;
2821 int to_idx;
2822 /* Can the subexpression arrive the back reference? */
2823 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2824 sub_last->str_idx, bkref_node, bkref_str,
2825 OP_OPEN_SUBEXP);
2826 if (err != REG_NOERROR)
2827 return err;
2828 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2829 sub_last->str_idx);
2830 if (BE (err != REG_NOERROR, 0))
2831 return err;
2832 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2833 return clean_state_log_if_needed (mctx, to_idx);
2836 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2837 Search '(' if FL_OPEN, or search ')' otherwise.
2838 TODO: This function isn't efficient...
2839 Because there might be more than one nodes whose types are
2840 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2841 nodes.
2842 E.g. RE: (a){2} */
2844 static int
2845 internal_function
2846 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2847 int subexp_idx, int type)
2849 int cls_idx;
2850 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2852 int cls_node = nodes->elems[cls_idx];
2853 const re_token_t *node = dfa->nodes + cls_node;
2854 if (node->type == type
2855 && node->opr.idx == subexp_idx)
2856 return cls_node;
2858 return -1;
2861 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2862 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2863 heavily reused.
2864 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2866 static reg_errcode_t
2867 internal_function
2868 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2869 int top_str, int last_node, int last_str, int type)
2871 const re_dfa_t *const dfa = mctx->dfa;
2872 reg_errcode_t err = REG_NOERROR;
2873 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2874 re_dfastate_t *cur_state = NULL;
2875 re_node_set *cur_nodes, next_nodes;
2876 re_dfastate_t **backup_state_log;
2877 unsigned int context;
2879 subexp_num = dfa->nodes[top_node].opr.idx;
2880 /* Extend the buffer if we need. */
2881 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2883 re_dfastate_t **new_array;
2884 int old_alloc = path->alloc;
2885 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2886 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2887 if (BE (new_array == NULL, 0))
2889 path->alloc = old_alloc;
2890 return REG_ESPACE;
2892 path->array = new_array;
2893 memset (new_array + old_alloc, '\0',
2894 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2897 str_idx = path->next_idx ?: top_str;
2899 /* Temporary modify MCTX. */
2900 backup_state_log = mctx->state_log;
2901 backup_cur_idx = mctx->input.cur_idx;
2902 mctx->state_log = path->array;
2903 mctx->input.cur_idx = str_idx;
2905 /* Setup initial node set. */
2906 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2907 if (str_idx == top_str)
2909 err = re_node_set_init_1 (&next_nodes, top_node);
2910 if (BE (err != REG_NOERROR, 0))
2911 return err;
2912 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913 if (BE (err != REG_NOERROR, 0))
2915 re_node_set_free (&next_nodes);
2916 return err;
2919 else
2921 cur_state = mctx->state_log[str_idx];
2922 if (cur_state && cur_state->has_backref)
2924 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2925 if (BE (err != REG_NOERROR, 0))
2926 return err;
2928 else
2929 re_node_set_init_empty (&next_nodes);
2931 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2933 if (next_nodes.nelem)
2935 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2936 subexp_num, type);
2937 if (BE (err != REG_NOERROR, 0))
2939 re_node_set_free (&next_nodes);
2940 return err;
2943 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2944 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2946 re_node_set_free (&next_nodes);
2947 return err;
2949 mctx->state_log[str_idx] = cur_state;
2952 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2954 re_node_set_empty (&next_nodes);
2955 if (mctx->state_log[str_idx + 1])
2957 err = re_node_set_merge (&next_nodes,
2958 &mctx->state_log[str_idx + 1]->nodes);
2959 if (BE (err != REG_NOERROR, 0))
2961 re_node_set_free (&next_nodes);
2962 return err;
2965 if (cur_state)
2967 err = check_arrival_add_next_nodes (mctx, str_idx,
2968 &cur_state->non_eps_nodes,
2969 &next_nodes);
2970 if (BE (err != REG_NOERROR, 0))
2972 re_node_set_free (&next_nodes);
2973 return err;
2976 ++str_idx;
2977 if (next_nodes.nelem)
2979 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2980 if (BE (err != REG_NOERROR, 0))
2982 re_node_set_free (&next_nodes);
2983 return err;
2985 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2986 subexp_num, type);
2987 if (BE (err != REG_NOERROR, 0))
2989 re_node_set_free (&next_nodes);
2990 return err;
2993 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2994 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2995 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2997 re_node_set_free (&next_nodes);
2998 return err;
3000 mctx->state_log[str_idx] = cur_state;
3001 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3003 re_node_set_free (&next_nodes);
3004 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3005 : &mctx->state_log[last_str]->nodes);
3006 path->next_idx = str_idx;
3008 /* Fix MCTX. */
3009 mctx->state_log = backup_state_log;
3010 mctx->input.cur_idx = backup_cur_idx;
3012 /* Then check the current node set has the node LAST_NODE. */
3013 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3014 return REG_NOERROR;
3016 return REG_NOMATCH;
3019 /* Helper functions for check_arrival. */
3021 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3022 to NEXT_NODES.
3023 TODO: This function is similar to the functions transit_state*(),
3024 however this function has many additional works.
3025 Can't we unify them? */
3027 static reg_errcode_t
3028 internal_function
3029 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3030 re_node_set *cur_nodes, re_node_set *next_nodes)
3032 const re_dfa_t *const dfa = mctx->dfa;
3033 int result;
3034 int cur_idx;
3035 reg_errcode_t err = REG_NOERROR;
3036 re_node_set union_set;
3037 re_node_set_init_empty (&union_set);
3038 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3040 int naccepted = 0;
3041 int cur_node = cur_nodes->elems[cur_idx];
3042 #ifdef DEBUG
3043 re_token_type_t type = dfa->nodes[cur_node].type;
3044 assert (!IS_EPSILON_NODE (type));
3045 #endif
3046 #ifdef RE_ENABLE_I18N
3047 /* If the node may accept `multi byte'. */
3048 if (dfa->nodes[cur_node].accept_mb)
3050 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3051 str_idx);
3052 if (naccepted > 1)
3054 re_dfastate_t *dest_state;
3055 int next_node = dfa->nexts[cur_node];
3056 int next_idx = str_idx + naccepted;
3057 dest_state = mctx->state_log[next_idx];
3058 re_node_set_empty (&union_set);
3059 if (dest_state)
3061 err = re_node_set_merge (&union_set, &dest_state->nodes);
3062 if (BE (err != REG_NOERROR, 0))
3064 re_node_set_free (&union_set);
3065 return err;
3068 result = re_node_set_insert (&union_set, next_node);
3069 if (BE (result < 0, 0))
3071 re_node_set_free (&union_set);
3072 return REG_ESPACE;
3074 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3075 &union_set);
3076 if (BE (mctx->state_log[next_idx] == NULL
3077 && err != REG_NOERROR, 0))
3079 re_node_set_free (&union_set);
3080 return err;
3084 #endif /* RE_ENABLE_I18N */
3085 if (naccepted
3086 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3088 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3089 if (BE (result < 0, 0))
3091 re_node_set_free (&union_set);
3092 return REG_ESPACE;
3096 re_node_set_free (&union_set);
3097 return REG_NOERROR;
3100 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3101 CUR_NODES, however exclude the nodes which are:
3102 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3103 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3106 static reg_errcode_t
3107 internal_function
3108 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3109 int ex_subexp, int type)
3111 reg_errcode_t err;
3112 int idx, outside_node;
3113 re_node_set new_nodes;
3114 #ifdef DEBUG
3115 assert (cur_nodes->nelem);
3116 #endif
3117 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3118 if (BE (err != REG_NOERROR, 0))
3119 return err;
3120 /* Create a new node set NEW_NODES with the nodes which are epsilon
3121 closures of the node in CUR_NODES. */
3123 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3125 int cur_node = cur_nodes->elems[idx];
3126 const re_node_set *eclosure = dfa->eclosures + cur_node;
3127 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3128 if (outside_node == -1)
3130 /* There are no problematic nodes, just merge them. */
3131 err = re_node_set_merge (&new_nodes, eclosure);
3132 if (BE (err != REG_NOERROR, 0))
3134 re_node_set_free (&new_nodes);
3135 return err;
3138 else
3140 /* There are problematic nodes, re-calculate incrementally. */
3141 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3142 ex_subexp, type);
3143 if (BE (err != REG_NOERROR, 0))
3145 re_node_set_free (&new_nodes);
3146 return err;
3150 re_node_set_free (cur_nodes);
3151 *cur_nodes = new_nodes;
3152 return REG_NOERROR;
3155 /* Helper function for check_arrival_expand_ecl.
3156 Check incrementally the epsilon closure of TARGET, and if it isn't
3157 problematic append it to DST_NODES. */
3159 static reg_errcode_t
3160 internal_function
3161 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3162 int target, int ex_subexp, int type)
3164 int cur_node;
3165 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3167 int err;
3169 if (dfa->nodes[cur_node].type == type
3170 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3172 if (type == OP_CLOSE_SUBEXP)
3174 err = re_node_set_insert (dst_nodes, cur_node);
3175 if (BE (err == -1, 0))
3176 return REG_ESPACE;
3178 break;
3180 err = re_node_set_insert (dst_nodes, cur_node);
3181 if (BE (err == -1, 0))
3182 return REG_ESPACE;
3183 if (dfa->edests[cur_node].nelem == 0)
3184 break;
3185 if (dfa->edests[cur_node].nelem == 2)
3187 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3188 dfa->edests[cur_node].elems[1],
3189 ex_subexp, type);
3190 if (BE (err != REG_NOERROR, 0))
3191 return err;
3193 cur_node = dfa->edests[cur_node].elems[0];
3195 return REG_NOERROR;
3199 /* For all the back references in the current state, calculate the
3200 destination of the back references by the appropriate entry
3201 in MCTX->BKREF_ENTS. */
3203 static reg_errcode_t
3204 internal_function
3205 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3206 int cur_str, int subexp_num, int type)
3208 const re_dfa_t *const dfa = mctx->dfa;
3209 reg_errcode_t err;
3210 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3211 struct re_backref_cache_entry *ent;
3213 if (cache_idx_start == -1)
3214 return REG_NOERROR;
3216 restart:
3217 ent = mctx->bkref_ents + cache_idx_start;
3220 int to_idx, next_node;
3222 /* Is this entry ENT is appropriate? */
3223 if (!re_node_set_contains (cur_nodes, ent->node))
3224 continue; /* No. */
3226 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3227 /* Calculate the destination of the back reference, and append it
3228 to MCTX->STATE_LOG. */
3229 if (to_idx == cur_str)
3231 /* The backreference did epsilon transit, we must re-check all the
3232 node in the current state. */
3233 re_node_set new_dests;
3234 reg_errcode_t err2, err3;
3235 next_node = dfa->edests[ent->node].elems[0];
3236 if (re_node_set_contains (cur_nodes, next_node))
3237 continue;
3238 err = re_node_set_init_1 (&new_dests, next_node);
3239 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3240 err3 = re_node_set_merge (cur_nodes, &new_dests);
3241 re_node_set_free (&new_dests);
3242 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3243 || err3 != REG_NOERROR, 0))
3245 err = (err != REG_NOERROR ? err
3246 : (err2 != REG_NOERROR ? err2 : err3));
3247 return err;
3249 /* TODO: It is still inefficient... */
3250 goto restart;
3252 else
3254 re_node_set union_set;
3255 next_node = dfa->nexts[ent->node];
3256 if (mctx->state_log[to_idx])
3258 int ret;
3259 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3260 next_node))
3261 continue;
3262 err = re_node_set_init_copy (&union_set,
3263 &mctx->state_log[to_idx]->nodes);
3264 ret = re_node_set_insert (&union_set, next_node);
3265 if (BE (err != REG_NOERROR || ret < 0, 0))
3267 re_node_set_free (&union_set);
3268 err = err != REG_NOERROR ? err : REG_ESPACE;
3269 return err;
3272 else
3274 err = re_node_set_init_1 (&union_set, next_node);
3275 if (BE (err != REG_NOERROR, 0))
3276 return err;
3278 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3279 re_node_set_free (&union_set);
3280 if (BE (mctx->state_log[to_idx] == NULL
3281 && err != REG_NOERROR, 0))
3282 return err;
3285 while (ent++->more);
3286 return REG_NOERROR;
3289 /* Build transition table for the state.
3290 Return 1 if succeeded, otherwise return NULL. */
3292 static int
3293 internal_function
3294 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3296 reg_errcode_t err;
3297 int i, j, ch, need_word_trtable = 0;
3298 bitset_word_t elem, mask;
3299 bool dests_node_malloced = false;
3300 bool dest_states_malloced = false;
3301 int ndests; /* Number of the destination states from `state'. */
3302 re_dfastate_t **trtable;
3303 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3304 re_node_set follows, *dests_node;
3305 bitset_t *dests_ch;
3306 bitset_t acceptable;
3308 struct dests_alloc
3310 re_node_set dests_node[SBC_MAX];
3311 bitset_t dests_ch[SBC_MAX];
3312 } *dests_alloc;
3314 /* We build DFA states which corresponds to the destination nodes
3315 from `state'. `dests_node[i]' represents the nodes which i-th
3316 destination state contains, and `dests_ch[i]' represents the
3317 characters which i-th destination state accepts. */
3318 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3319 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3320 else
3322 dests_alloc = re_malloc (struct dests_alloc, 1);
3323 if (BE (dests_alloc == NULL, 0))
3324 return 0;
3325 dests_node_malloced = true;
3327 dests_node = dests_alloc->dests_node;
3328 dests_ch = dests_alloc->dests_ch;
3330 /* Initialize transiton table. */
3331 state->word_trtable = state->trtable = NULL;
3333 /* At first, group all nodes belonging to `state' into several
3334 destinations. */
3335 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3336 if (BE (ndests <= 0, 0))
3338 if (dests_node_malloced)
3339 free (dests_alloc);
3340 /* Return 0 in case of an error, 1 otherwise. */
3341 if (ndests == 0)
3343 state->trtable = (re_dfastate_t **)
3344 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3345 return 1;
3347 return 0;
3350 err = re_node_set_alloc (&follows, ndests + 1);
3351 if (BE (err != REG_NOERROR, 0))
3352 goto out_free;
3354 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3355 + ndests * 3 * sizeof (re_dfastate_t *)))
3356 dest_states = (re_dfastate_t **)
3357 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3358 else
3360 dest_states = (re_dfastate_t **)
3361 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3362 if (BE (dest_states == NULL, 0))
3364 out_free:
3365 if (dest_states_malloced)
3366 free (dest_states);
3367 re_node_set_free (&follows);
3368 for (i = 0; i < ndests; ++i)
3369 re_node_set_free (dests_node + i);
3370 if (dests_node_malloced)
3371 free (dests_alloc);
3372 return 0;
3374 dest_states_malloced = true;
3376 dest_states_word = dest_states + ndests;
3377 dest_states_nl = dest_states_word + ndests;
3378 bitset_empty (acceptable);
3380 /* Then build the states for all destinations. */
3381 for (i = 0; i < ndests; ++i)
3383 int next_node;
3384 re_node_set_empty (&follows);
3385 /* Merge the follows of this destination states. */
3386 for (j = 0; j < dests_node[i].nelem; ++j)
3388 next_node = dfa->nexts[dests_node[i].elems[j]];
3389 if (next_node != -1)
3391 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392 if (BE (err != REG_NOERROR, 0))
3393 goto out_free;
3396 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3398 goto out_free;
3399 /* If the new state has context constraint,
3400 build appropriate states for these contexts. */
3401 if (dest_states[i]->has_constraint)
3403 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3404 CONTEXT_WORD);
3405 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3406 goto out_free;
3408 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3409 need_word_trtable = 1;
3411 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3412 CONTEXT_NEWLINE);
3413 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3414 goto out_free;
3416 else
3418 dest_states_word[i] = dest_states[i];
3419 dest_states_nl[i] = dest_states[i];
3421 bitset_merge (acceptable, dests_ch[i]);
3424 if (!BE (need_word_trtable, 0))
3426 /* We don't care about whether the following character is a word
3427 character, or we are in a single-byte character set so we can
3428 discern by looking at the character code: allocate a
3429 256-entry transition table. */
3430 trtable = state->trtable =
3431 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3432 if (BE (trtable == NULL, 0))
3433 goto out_free;
3435 /* For all characters ch...: */
3436 for (i = 0; i < BITSET_WORDS; ++i)
3437 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3438 elem;
3439 mask <<= 1, elem >>= 1, ++ch)
3440 if (BE (elem & 1, 0))
3442 /* There must be exactly one destination which accepts
3443 character ch. See group_nodes_into_DFAstates. */
3444 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3447 /* j-th destination accepts the word character ch. */
3448 if (dfa->word_char[i] & mask)
3449 trtable[ch] = dest_states_word[j];
3450 else
3451 trtable[ch] = dest_states[j];
3454 else
3456 /* We care about whether the following character is a word
3457 character, and we are in a multi-byte character set: discern
3458 by looking at the character code: build two 256-entry
3459 transition tables, one starting at trtable[0] and one
3460 starting at trtable[SBC_MAX]. */
3461 trtable = state->word_trtable =
3462 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3463 if (BE (trtable == NULL, 0))
3464 goto out_free;
3466 /* For all characters ch...: */
3467 for (i = 0; i < BITSET_WORDS; ++i)
3468 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3469 elem;
3470 mask <<= 1, elem >>= 1, ++ch)
3471 if (BE (elem & 1, 0))
3473 /* There must be exactly one destination which accepts
3474 character ch. See group_nodes_into_DFAstates. */
3475 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3478 /* j-th destination accepts the word character ch. */
3479 trtable[ch] = dest_states[j];
3480 trtable[ch + SBC_MAX] = dest_states_word[j];
3484 /* new line */
3485 if (bitset_contain (acceptable, NEWLINE_CHAR))
3487 /* The current state accepts newline character. */
3488 for (j = 0; j < ndests; ++j)
3489 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3491 /* k-th destination accepts newline character. */
3492 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3493 if (need_word_trtable)
3494 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3495 /* There must be only one destination which accepts
3496 newline. See group_nodes_into_DFAstates. */
3497 break;
3501 if (dest_states_malloced)
3502 free (dest_states);
3504 re_node_set_free (&follows);
3505 for (i = 0; i < ndests; ++i)
3506 re_node_set_free (dests_node + i);
3508 if (dests_node_malloced)
3509 free (dests_alloc);
3511 return 1;
3514 /* Group all nodes belonging to STATE into several destinations.
3515 Then for all destinations, set the nodes belonging to the destination
3516 to DESTS_NODE[i] and set the characters accepted by the destination
3517 to DEST_CH[i]. This function return the number of destinations. */
3519 static int
3520 internal_function
3521 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3522 re_node_set *dests_node, bitset_t *dests_ch)
3524 reg_errcode_t err;
3525 int result;
3526 int i, j, k;
3527 int ndests; /* Number of the destinations from `state'. */
3528 bitset_t accepts; /* Characters a node can accept. */
3529 const re_node_set *cur_nodes = &state->nodes;
3530 bitset_empty (accepts);
3531 ndests = 0;
3533 /* For all the nodes belonging to `state', */
3534 for (i = 0; i < cur_nodes->nelem; ++i)
3536 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537 re_token_type_t type = node->type;
3538 unsigned int constraint = node->constraint;
3540 /* Enumerate all single byte character this node can accept. */
3541 if (type == CHARACTER)
3542 bitset_set (accepts, node->opr.c);
3543 else if (type == SIMPLE_BRACKET)
3545 bitset_merge (accepts, node->opr.sbcset);
3547 else if (type == OP_PERIOD)
3549 #ifdef RE_ENABLE_I18N
3550 if (dfa->mb_cur_max > 1)
3551 bitset_merge (accepts, dfa->sb_char);
3552 else
3553 #endif
3554 bitset_set_all (accepts);
3555 if (!(dfa->syntax & RE_DOT_NEWLINE))
3556 bitset_clear (accepts, '\n');
3557 if (dfa->syntax & RE_DOT_NOT_NULL)
3558 bitset_clear (accepts, '\0');
3560 #ifdef RE_ENABLE_I18N
3561 else if (type == OP_UTF8_PERIOD)
3563 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3564 if (!(dfa->syntax & RE_DOT_NEWLINE))
3565 bitset_clear (accepts, '\n');
3566 if (dfa->syntax & RE_DOT_NOT_NULL)
3567 bitset_clear (accepts, '\0');
3569 #endif
3570 else
3571 continue;
3573 /* Check the `accepts' and sift the characters which are not
3574 match it the context. */
3575 if (constraint)
3577 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3579 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580 bitset_empty (accepts);
3581 if (accepts_newline)
3582 bitset_set (accepts, NEWLINE_CHAR);
3583 else
3584 continue;
3586 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3588 bitset_empty (accepts);
3589 continue;
3592 if (constraint & NEXT_WORD_CONSTRAINT)
3594 bitset_word_t any_set = 0;
3595 if (type == CHARACTER && !node->word_char)
3597 bitset_empty (accepts);
3598 continue;
3600 #ifdef RE_ENABLE_I18N
3601 if (dfa->mb_cur_max > 1)
3602 for (j = 0; j < BITSET_WORDS; ++j)
3603 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604 else
3605 #endif
3606 for (j = 0; j < BITSET_WORDS; ++j)
3607 any_set |= (accepts[j] &= dfa->word_char[j]);
3608 if (!any_set)
3609 continue;
3611 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3613 bitset_word_t any_set = 0;
3614 if (type == CHARACTER && node->word_char)
3616 bitset_empty (accepts);
3617 continue;
3619 #ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 for (j = 0; j < BITSET_WORDS; ++j)
3622 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623 else
3624 #endif
3625 for (j = 0; j < BITSET_WORDS; ++j)
3626 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627 if (!any_set)
3628 continue;
3632 /* Then divide `accepts' into DFA states, or create a new
3633 state. Above, we make sure that accepts is not empty. */
3634 for (j = 0; j < ndests; ++j)
3636 bitset_t intersec; /* Intersection sets, see below. */
3637 bitset_t remains;
3638 /* Flags, see below. */
3639 bitset_word_t has_intersec, not_subset, not_consumed;
3641 /* Optimization, skip if this state doesn't accept the character. */
3642 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643 continue;
3645 /* Enumerate the intersection set of this state and `accepts'. */
3646 has_intersec = 0;
3647 for (k = 0; k < BITSET_WORDS; ++k)
3648 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649 /* And skip if the intersection set is empty. */
3650 if (!has_intersec)
3651 continue;
3653 /* Then check if this state is a subset of `accepts'. */
3654 not_subset = not_consumed = 0;
3655 for (k = 0; k < BITSET_WORDS; ++k)
3657 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3661 /* If this state isn't a subset of `accepts', create a
3662 new group state, which has the `remains'. */
3663 if (not_subset)
3665 bitset_copy (dests_ch[ndests], remains);
3666 bitset_copy (dests_ch[j], intersec);
3667 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668 if (BE (err != REG_NOERROR, 0))
3669 goto error_return;
3670 ++ndests;
3673 /* Put the position in the current group. */
3674 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675 if (BE (result < 0, 0))
3676 goto error_return;
3678 /* If all characters are consumed, go to next node. */
3679 if (!not_consumed)
3680 break;
3682 /* Some characters remain, create a new group. */
3683 if (j == ndests)
3685 bitset_copy (dests_ch[ndests], accepts);
3686 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687 if (BE (err != REG_NOERROR, 0))
3688 goto error_return;
3689 ++ndests;
3690 bitset_empty (accepts);
3693 return ndests;
3694 error_return:
3695 for (j = 0; j < ndests; ++j)
3696 re_node_set_free (dests_node + j);
3697 return -1;
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702 Return the number of the bytes the node accepts.
3703 STR_IDX is the current index of the input string.
3705 This function handles the nodes which can accept one character, or
3706 one collating element like '.', '[a-z]', opposite to the other nodes
3707 can only accept one byte. */
3709 static int
3710 internal_function
3711 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3712 const re_string_t *input, int str_idx)
3714 const re_token_t *node = dfa->nodes + node_idx;
3715 int char_len, elem_len;
3716 int i;
3718 if (BE (node->type == OP_UTF8_PERIOD, 0))
3720 unsigned char c = re_string_byte_at (input, str_idx), d;
3721 if (BE (c < 0xc2, 1))
3722 return 0;
3724 if (str_idx + 2 > input->len)
3725 return 0;
3727 d = re_string_byte_at (input, str_idx + 1);
3728 if (c < 0xe0)
3729 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3730 else if (c < 0xf0)
3732 char_len = 3;
3733 if (c == 0xe0 && d < 0xa0)
3734 return 0;
3736 else if (c < 0xf8)
3738 char_len = 4;
3739 if (c == 0xf0 && d < 0x90)
3740 return 0;
3742 else if (c < 0xfc)
3744 char_len = 5;
3745 if (c == 0xf8 && d < 0x88)
3746 return 0;
3748 else if (c < 0xfe)
3750 char_len = 6;
3751 if (c == 0xfc && d < 0x84)
3752 return 0;
3754 else
3755 return 0;
3757 if (str_idx + char_len > input->len)
3758 return 0;
3760 for (i = 1; i < char_len; ++i)
3762 d = re_string_byte_at (input, str_idx + i);
3763 if (d < 0x80 || d > 0xbf)
3764 return 0;
3766 return char_len;
3769 char_len = re_string_char_size_at (input, str_idx);
3770 if (node->type == OP_PERIOD)
3772 if (char_len <= 1)
3773 return 0;
3774 /* FIXME: I don't think this if is needed, as both '\n'
3775 and '\0' are char_len == 1. */
3776 /* '.' accepts any one character except the following two cases. */
3777 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3778 re_string_byte_at (input, str_idx) == '\n') ||
3779 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3780 re_string_byte_at (input, str_idx) == '\0'))
3781 return 0;
3782 return char_len;
3785 elem_len = re_string_elem_size_at (input, str_idx);
3786 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3787 return 0;
3789 if (node->type == COMPLEX_BRACKET)
3791 const re_charset_t *cset = node->opr.mbcset;
3792 # ifdef _LIBC
3793 const unsigned char *pin
3794 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3795 int j;
3796 uint32_t nrules;
3797 # endif /* _LIBC */
3798 int match_len = 0;
3799 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3800 ? re_string_wchar_at (input, str_idx) : 0);
3802 /* match with multibyte character? */
3803 for (i = 0; i < cset->nmbchars; ++i)
3804 if (wc == cset->mbchars[i])
3806 match_len = char_len;
3807 goto check_node_accept_bytes_match;
3809 /* match with character_class? */
3810 for (i = 0; i < cset->nchar_classes; ++i)
3812 wctype_t wt = cset->char_classes[i];
3813 if (__iswctype (wc, wt))
3815 match_len = char_len;
3816 goto check_node_accept_bytes_match;
3820 # ifdef _LIBC
3821 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3822 if (nrules != 0)
3824 unsigned int in_collseq = 0;
3825 const int32_t *table, *indirect;
3826 const unsigned char *weights, *extra;
3827 const char *collseqwc;
3828 int32_t idx;
3829 /* This #include defines a local function! */
3830 # include <locale/weight.h>
3832 /* match with collating_symbol? */
3833 if (cset->ncoll_syms)
3834 extra = (const unsigned char *)
3835 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3836 for (i = 0; i < cset->ncoll_syms; ++i)
3838 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3839 /* Compare the length of input collating element and
3840 the length of current collating element. */
3841 if (*coll_sym != elem_len)
3842 continue;
3843 /* Compare each bytes. */
3844 for (j = 0; j < *coll_sym; j++)
3845 if (pin[j] != coll_sym[1 + j])
3846 break;
3847 if (j == *coll_sym)
3849 /* Match if every bytes is equal. */
3850 match_len = j;
3851 goto check_node_accept_bytes_match;
3855 if (cset->nranges)
3857 if (elem_len <= char_len)
3859 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3860 in_collseq = __collseq_table_lookup (collseqwc, wc);
3862 else
3863 in_collseq = find_collation_sequence_value (pin, elem_len);
3865 /* match with range expression? */
3866 for (i = 0; i < cset->nranges; ++i)
3867 if (cset->range_starts[i] <= in_collseq
3868 && in_collseq <= cset->range_ends[i])
3870 match_len = elem_len;
3871 goto check_node_accept_bytes_match;
3874 /* match with equivalence_class? */
3875 if (cset->nequiv_classes)
3877 const unsigned char *cp = pin;
3878 table = (const int32_t *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3880 weights = (const unsigned char *)
3881 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3882 extra = (const unsigned char *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3884 indirect = (const int32_t *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3886 idx = findidx (&cp);
3887 if (idx > 0)
3888 for (i = 0; i < cset->nequiv_classes; ++i)
3890 int32_t equiv_class_idx = cset->equiv_classes[i];
3891 size_t weight_len = weights[idx];
3892 if (weight_len == weights[equiv_class_idx])
3894 int cnt = 0;
3895 while (cnt <= weight_len
3896 && (weights[equiv_class_idx + 1 + cnt]
3897 == weights[idx + 1 + cnt]))
3898 ++cnt;
3899 if (cnt > weight_len)
3901 match_len = elem_len;
3902 goto check_node_accept_bytes_match;
3908 else
3909 # endif /* _LIBC */
3911 /* match with range expression? */
3912 #if __GNUC__ >= 2
3913 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3914 #else
3915 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3916 cmp_buf[2] = wc;
3917 #endif
3918 for (i = 0; i < cset->nranges; ++i)
3920 cmp_buf[0] = cset->range_starts[i];
3921 cmp_buf[4] = cset->range_ends[i];
3922 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3923 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3925 match_len = char_len;
3926 goto check_node_accept_bytes_match;
3930 check_node_accept_bytes_match:
3931 if (!cset->non_match)
3932 return match_len;
3933 else
3935 if (match_len > 0)
3936 return 0;
3937 else
3938 return (elem_len > char_len) ? elem_len : char_len;
3941 return 0;
3944 # ifdef _LIBC
3945 static unsigned int
3946 internal_function
3947 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3949 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3950 if (nrules == 0)
3952 if (mbs_len == 1)
3954 /* No valid character. Match it as a single byte character. */
3955 const unsigned char *collseq = (const unsigned char *)
3956 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3957 return collseq[mbs[0]];
3959 return UINT_MAX;
3961 else
3963 int32_t idx;
3964 const unsigned char *extra = (const unsigned char *)
3965 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3966 int32_t extrasize = (const unsigned char *)
3967 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3969 for (idx = 0; idx < extrasize;)
3971 int mbs_cnt, found = 0;
3972 int32_t elem_mbs_len;
3973 /* Skip the name of collating element name. */
3974 idx = idx + extra[idx] + 1;
3975 elem_mbs_len = extra[idx++];
3976 if (mbs_len == elem_mbs_len)
3978 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3979 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3980 break;
3981 if (mbs_cnt == elem_mbs_len)
3982 /* Found the entry. */
3983 found = 1;
3985 /* Skip the byte sequence of the collating element. */
3986 idx += elem_mbs_len;
3987 /* Adjust for the alignment. */
3988 idx = (idx + 3) & ~3;
3989 /* Skip the collation sequence value. */
3990 idx += sizeof (uint32_t);
3991 /* Skip the wide char sequence of the collating element. */
3992 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3993 /* If we found the entry, return the sequence value. */
3994 if (found)
3995 return *(uint32_t *) (extra + idx);
3996 /* Skip the collation sequence value. */
3997 idx += sizeof (uint32_t);
3999 return UINT_MAX;
4002 # endif /* _LIBC */
4003 #endif /* RE_ENABLE_I18N */
4005 /* Check whether the node accepts the byte which is IDX-th
4006 byte of the INPUT. */
4008 static int
4009 internal_function
4010 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4011 int idx)
4013 unsigned char ch;
4014 ch = re_string_byte_at (&mctx->input, idx);
4015 switch (node->type)
4017 case CHARACTER:
4018 if (node->opr.c != ch)
4019 return 0;
4020 break;
4022 case SIMPLE_BRACKET:
4023 if (!bitset_contain (node->opr.sbcset, ch))
4024 return 0;
4025 break;
4027 #ifdef RE_ENABLE_I18N
4028 case OP_UTF8_PERIOD:
4029 if (ch >= 0x80)
4030 return 0;
4031 /* FALLTHROUGH */
4032 #endif
4033 case OP_PERIOD:
4034 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4035 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4036 return 0;
4037 break;
4039 default:
4040 return 0;
4043 if (node->constraint)
4045 /* The node has constraints. Check whether the current context
4046 satisfies the constraints. */
4047 unsigned int context = re_string_context_at (&mctx->input, idx,
4048 mctx->eflags);
4049 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4050 return 0;
4053 return 1;
4056 /* Extend the buffers, if the buffers have run out. */
4058 static reg_errcode_t
4059 internal_function
4060 extend_buffers (re_match_context_t *mctx)
4062 reg_errcode_t ret;
4063 re_string_t *pstr = &mctx->input;
4065 /* Double the lengthes of the buffers. */
4066 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4067 if (BE (ret != REG_NOERROR, 0))
4068 return ret;
4070 if (mctx->state_log != NULL)
4072 /* And double the length of state_log. */
4073 /* XXX We have no indication of the size of this buffer. If this
4074 allocation fail we have no indication that the state_log array
4075 does not have the right size. */
4076 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4077 pstr->bufs_len + 1);
4078 if (BE (new_array == NULL, 0))
4079 return REG_ESPACE;
4080 mctx->state_log = new_array;
4083 /* Then reconstruct the buffers. */
4084 if (pstr->icase)
4086 #ifdef RE_ENABLE_I18N
4087 if (pstr->mb_cur_max > 1)
4089 ret = build_wcs_upper_buffer (pstr);
4090 if (BE (ret != REG_NOERROR, 0))
4091 return ret;
4093 else
4094 #endif /* RE_ENABLE_I18N */
4095 build_upper_buffer (pstr);
4097 else
4099 #ifdef RE_ENABLE_I18N
4100 if (pstr->mb_cur_max > 1)
4101 build_wcs_buffer (pstr);
4102 else
4103 #endif /* RE_ENABLE_I18N */
4105 if (pstr->trans != NULL)
4106 re_string_translate_buffer (pstr);
4109 return REG_NOERROR;
4113 /* Functions for matching context. */
4115 /* Initialize MCTX. */
4117 static reg_errcode_t
4118 internal_function
4119 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4121 mctx->eflags = eflags;
4122 mctx->match_last = -1;
4123 if (n > 0)
4125 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128 return REG_ESPACE;
4130 /* Already zero-ed by the caller.
4131 else
4132 mctx->bkref_ents = NULL;
4133 mctx->nbkref_ents = 0;
4134 mctx->nsub_tops = 0; */
4135 mctx->abkref_ents = n;
4136 mctx->max_mb_elem_len = 1;
4137 mctx->asub_tops = n;
4138 return REG_NOERROR;
4141 /* Clean the entries which depend on the current input in MCTX.
4142 This function must be invoked when the matcher changes the start index
4143 of the input, or changes the input string. */
4145 static void
4146 internal_function
4147 match_ctx_clean (re_match_context_t *mctx)
4149 int st_idx;
4150 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4152 int sl_idx;
4153 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4154 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4156 re_sub_match_last_t *last = top->lasts[sl_idx];
4157 re_free (last->path.array);
4158 re_free (last);
4160 re_free (top->lasts);
4161 if (top->path)
4163 re_free (top->path->array);
4164 re_free (top->path);
4166 free (top);
4169 mctx->nsub_tops = 0;
4170 mctx->nbkref_ents = 0;
4173 /* Free all the memory associated with MCTX. */
4175 static void
4176 internal_function
4177 match_ctx_free (re_match_context_t *mctx)
4179 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4180 match_ctx_clean (mctx);
4181 re_free (mctx->sub_tops);
4182 re_free (mctx->bkref_ents);
4185 /* Add a new backreference entry to MCTX.
4186 Note that we assume that caller never call this function with duplicate
4187 entry, and call with STR_IDX which isn't smaller than any existing entry.
4190 static reg_errcode_t
4191 internal_function
4192 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4193 int to)
4195 if (mctx->nbkref_ents >= mctx->abkref_ents)
4197 struct re_backref_cache_entry* new_entry;
4198 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4199 mctx->abkref_ents * 2);
4200 if (BE (new_entry == NULL, 0))
4202 re_free (mctx->bkref_ents);
4203 return REG_ESPACE;
4205 mctx->bkref_ents = new_entry;
4206 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4207 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4208 mctx->abkref_ents *= 2;
4210 if (mctx->nbkref_ents > 0
4211 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4212 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4214 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4215 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4216 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4217 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4219 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4220 If bit N is clear, means that this entry won't epsilon-transition to
4221 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4222 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4223 such node.
4225 A backreference does not epsilon-transition unless it is empty, so set
4226 to all zeros if FROM != TO. */
4227 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4228 = (from == to ? ~0 : 0);
4230 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4231 if (mctx->max_mb_elem_len < to - from)
4232 mctx->max_mb_elem_len = to - from;
4233 return REG_NOERROR;
4236 /* Search for the first entry which has the same str_idx, or -1 if none is
4237 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4239 static int
4240 internal_function
4241 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4243 int left, right, mid, last;
4244 last = right = mctx->nbkref_ents;
4245 for (left = 0; left < right;)
4247 mid = (left + right) / 2;
4248 if (mctx->bkref_ents[mid].str_idx < str_idx)
4249 left = mid + 1;
4250 else
4251 right = mid;
4253 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4254 return left;
4255 else
4256 return -1;
4259 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4260 at STR_IDX. */
4262 static reg_errcode_t
4263 internal_function
4264 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4266 #ifdef DEBUG
4267 assert (mctx->sub_tops != NULL);
4268 assert (mctx->asub_tops > 0);
4269 #endif
4270 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4272 int new_asub_tops = mctx->asub_tops * 2;
4273 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4274 re_sub_match_top_t *,
4275 new_asub_tops);
4276 if (BE (new_array == NULL, 0))
4277 return REG_ESPACE;
4278 mctx->sub_tops = new_array;
4279 mctx->asub_tops = new_asub_tops;
4281 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4282 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4283 return REG_ESPACE;
4284 mctx->sub_tops[mctx->nsub_tops]->node = node;
4285 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4286 return REG_NOERROR;
4289 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4290 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4292 static re_sub_match_last_t *
4293 internal_function
4294 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4296 re_sub_match_last_t *new_entry;
4297 if (BE (subtop->nlasts == subtop->alasts, 0))
4299 int new_alasts = 2 * subtop->alasts + 1;
4300 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4301 re_sub_match_last_t *,
4302 new_alasts);
4303 if (BE (new_array == NULL, 0))
4304 return NULL;
4305 subtop->lasts = new_array;
4306 subtop->alasts = new_alasts;
4308 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4309 if (BE (new_entry != NULL, 1))
4311 subtop->lasts[subtop->nlasts] = new_entry;
4312 new_entry->node = node;
4313 new_entry->str_idx = str_idx;
4314 ++subtop->nlasts;
4316 return new_entry;
4319 static void
4320 internal_function
4321 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4322 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4324 sctx->sifted_states = sifted_sts;
4325 sctx->limited_states = limited_sts;
4326 sctx->last_node = last_node;
4327 sctx->last_str_idx = last_str_idx;
4328 re_node_set_init_empty (&sctx->limits);