2005-02-25 Roland McGrath <roland@redhat.com>
[glibc.git] / posix / regexec.c
blob636396e6f7aa6da099ac9922b99380c443b13830
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65 static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71 static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81 static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 #ifdef RE_ENABLE_I18N
88 static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178 static int build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183 # ifdef _LIBC
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186 # endif /* _LIBC */
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192 static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
199 string STRING.
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg, string, nmatch, pmatch, eflags)
214 const regex_t *__restrict preg;
215 const char *__restrict string;
216 size_t nmatch;
217 regmatch_t pmatch[];
218 int eflags;
220 reg_errcode_t err;
221 int start, length;
223 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
224 return REG_BADPAT;
226 if (eflags & REG_STARTEND)
228 start = pmatch[0].rm_so;
229 length = pmatch[0].rm_eo;
231 else
233 start = 0;
234 length = strlen (string);
236 if (preg->no_sub)
237 err = re_search_internal (preg, string, length, start, length - start,
238 length, 0, NULL, eflags);
239 else
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, nmatch, pmatch, eflags);
242 return err != REG_NOERROR;
245 #ifdef _LIBC
246 # include <shlib-compat.h>
247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
250 __typeof__ (__regexec) __compat_regexec;
253 attribute_compat_text_section
254 __compat_regexec (const regex_t *__restrict preg,
255 const char *__restrict string, size_t nmatch,
256 regmatch_t pmatch[], int eflags)
258 return regexec (preg, string, nmatch, pmatch,
259 eflags & (REG_NOTBOL | REG_NOTEOL));
261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
262 # endif
263 #endif
265 /* Entry points for GNU code. */
267 /* re_match, re_search, re_match_2, re_search_2
269 The former two functions operate on STRING with length LENGTH,
270 while the later two operate on concatenation of STRING1 and STRING2
271 with lengths LENGTH1 and LENGTH2, respectively.
273 re_match() matches the compiled pattern in BUFP against the string,
274 starting at index START.
276 re_search() first tries matching at index START, then it tries to match
277 starting from index START + 1, and so on. The last start position tried
278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
279 way as re_match().)
281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
282 the first STOP characters of the concatenation of the strings should be
283 concerned.
285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
287 computed relative to the concatenation, not relative to the individual
288 strings.)
290 On success, re_match* functions return the length of the match, re_search*
291 return the position of the start of the match. Return value -1 means no
292 match was found and -2 indicates an internal error. */
295 re_match (bufp, string, length, start, regs)
296 struct re_pattern_buffer *bufp;
297 const char *string;
298 int length, start;
299 struct re_registers *regs;
301 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
303 #ifdef _LIBC
304 weak_alias (__re_match, re_match)
305 #endif
308 re_search (bufp, string, length, start, range, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start, range;
312 struct re_registers *regs;
314 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
316 #ifdef _LIBC
317 weak_alias (__re_search, re_search)
318 #endif
321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
322 struct re_pattern_buffer *bufp;
323 const char *string1, *string2;
324 int length1, length2, start, stop;
325 struct re_registers *regs;
327 return re_search_2_stub (bufp, string1, length1, string2, length2,
328 start, 0, regs, stop, 1);
330 #ifdef _LIBC
331 weak_alias (__re_match_2, re_match_2)
332 #endif
335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
336 struct re_pattern_buffer *bufp;
337 const char *string1, *string2;
338 int length1, length2, start, range, stop;
339 struct re_registers *regs;
341 return re_search_2_stub (bufp, string1, length1, string2, length2,
342 start, range, regs, stop, 0);
344 #ifdef _LIBC
345 weak_alias (__re_search_2, re_search_2)
346 #endif
348 static int
349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
350 stop, ret_len)
351 struct re_pattern_buffer *bufp;
352 const char *string1, *string2;
353 int length1, length2, start, range, stop, ret_len;
354 struct re_registers *regs;
356 const char *str;
357 int rval;
358 int len = length1 + length2;
359 int free_str = 0;
361 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
362 return -2;
364 /* Concatenate the strings. */
365 if (length2 > 0)
366 if (length1 > 0)
368 char *s = re_malloc (char, len);
370 if (BE (s == NULL, 0))
371 return -2;
372 memcpy (s, string1, length1);
373 memcpy (s + length1, string2, length2);
374 str = s;
375 free_str = 1;
377 else
378 str = string2;
379 else
380 str = string1;
382 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
383 ret_len);
384 if (free_str)
385 re_free ((char *) str);
386 return rval;
389 /* The parameters have the same meaning as those of re_search.
390 Additional parameters:
391 If RET_LEN is nonzero the length of the match is returned (re_match style);
392 otherwise the position of the match is returned. */
394 static int
395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
396 struct re_pattern_buffer *bufp;
397 const char *string;
398 int length, start, range, stop, ret_len;
399 struct re_registers *regs;
401 reg_errcode_t result;
402 regmatch_t *pmatch;
403 int nregs, rval;
404 int eflags = 0;
406 /* Check for out-of-range. */
407 if (BE (start < 0 || start > length, 0))
408 return -1;
409 if (BE (start + range > length, 0))
410 range = length - start;
411 else if (BE (start + range < 0, 0))
412 range = -start;
414 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
415 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417 /* Compile fastmap if we haven't yet. */
418 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
419 re_compile_fastmap (bufp);
421 if (BE (bufp->no_sub, 0))
422 regs = NULL;
424 /* We need at least 1 register. */
425 if (regs == NULL)
426 nregs = 1;
427 else if (BE (bufp->regs_allocated == REGS_FIXED &&
428 regs->num_regs < bufp->re_nsub + 1, 0))
430 nregs = regs->num_regs;
431 if (BE (nregs < 1, 0))
433 /* Nothing can be copied to regs. */
434 regs = NULL;
435 nregs = 1;
438 else
439 nregs = bufp->re_nsub + 1;
440 pmatch = re_malloc (regmatch_t, nregs);
441 if (BE (pmatch == NULL, 0))
442 return -2;
444 result = re_search_internal (bufp, string, length, start, range, stop,
445 nregs, pmatch, eflags);
447 rval = 0;
449 /* I hope we needn't fill ther regs with -1's when no match was found. */
450 if (result != REG_NOERROR)
451 rval = -1;
452 else if (regs != NULL)
454 /* If caller wants register contents data back, copy them. */
455 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
456 bufp->regs_allocated);
457 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
458 rval = -2;
461 if (BE (rval == 0, 1))
463 if (ret_len)
465 assert (pmatch[0].rm_so == start);
466 rval = pmatch[0].rm_eo - start;
468 else
469 rval = pmatch[0].rm_so;
471 re_free (pmatch);
472 return rval;
475 static unsigned
476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
477 struct re_registers *regs;
478 regmatch_t *pmatch;
479 int nregs, regs_allocated;
481 int rval = REGS_REALLOCATE;
482 int i;
483 int need_regs = nregs + 1;
484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
485 uses. */
487 /* Have the register data arrays been allocated? */
488 if (regs_allocated == REGS_UNALLOCATED)
489 { /* No. So allocate them with malloc. */
490 regs->start = re_malloc (regoff_t, need_regs);
491 regs->end = re_malloc (regoff_t, need_regs);
492 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
493 return REGS_UNALLOCATED;
494 regs->num_regs = need_regs;
496 else if (regs_allocated == REGS_REALLOCATE)
497 { /* Yes. If we need more elements than were already
498 allocated, reallocate them. If we need fewer, just
499 leave it alone. */
500 if (BE (need_regs > regs->num_regs, 0))
502 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
503 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
504 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
505 return REGS_UNALLOCATED;
506 regs->start = new_start;
507 regs->end = new_end;
508 regs->num_regs = need_regs;
511 else
513 assert (regs_allocated == REGS_FIXED);
514 /* This function may not be called with REGS_FIXED and nregs too big. */
515 assert (regs->num_regs >= nregs);
516 rval = REGS_FIXED;
519 /* Copy the regs. */
520 for (i = 0; i < nregs; ++i)
522 regs->start[i] = pmatch[i].rm_so;
523 regs->end[i] = pmatch[i].rm_eo;
525 for ( ; i < regs->num_regs; ++i)
526 regs->start[i] = regs->end[i] = -1;
528 return rval;
531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
533 this memory for recording register information. STARTS and ENDS
534 must be allocated using the malloc library routine, and must each
535 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537 If NUM_REGS == 0, then subsequent matches should allocate their own
538 register data.
540 Unless this function is called, the first search or match using
541 PATTERN_BUFFER will allocate its own register data, without
542 freeing the old data. */
544 void
545 re_set_registers (bufp, regs, num_regs, starts, ends)
546 struct re_pattern_buffer *bufp;
547 struct re_registers *regs;
548 unsigned num_regs;
549 regoff_t *starts, *ends;
551 if (num_regs)
553 bufp->regs_allocated = REGS_REALLOCATE;
554 regs->num_regs = num_regs;
555 regs->start = starts;
556 regs->end = ends;
558 else
560 bufp->regs_allocated = REGS_UNALLOCATED;
561 regs->num_regs = 0;
562 regs->start = regs->end = (regoff_t *) 0;
565 #ifdef _LIBC
566 weak_alias (__re_set_registers, re_set_registers)
567 #endif
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
574 # ifdef _LIBC
575 weak_function
576 # endif
577 re_exec (s)
578 const char *s;
580 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
582 #endif /* _REGEX_RE_COMP */
584 /* Internal entry point. */
586 /* Searches for a compiled pattern PREG in the string STRING, whose
587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
588 mingings with regexec. START, and RANGE have the same meanings
589 with re_search.
590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
591 otherwise return the error code.
592 Note: We assume front end functions already check ranges.
593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
595 static reg_errcode_t
596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
597 eflags)
598 const regex_t *preg;
599 const char *string;
600 int length, start, range, stop, eflags;
601 size_t nmatch;
602 regmatch_t pmatch[];
604 reg_errcode_t err;
605 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
606 int left_lim, right_lim, incr;
607 int fl_longest_match, match_first, match_kind, match_last = -1;
608 int extra_nmatch;
609 int sb, ch;
610 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
611 re_match_context_t mctx = { .dfa = dfa };
612 #else
613 re_match_context_t mctx;
614 #endif
615 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
616 && range && !preg->can_be_null) ? preg->fastmap : NULL;
617 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
620 memset (&mctx, '\0', sizeof (re_match_context_t));
621 mctx.dfa = dfa;
622 #endif
624 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
625 nmatch -= extra_nmatch;
627 /* Check if the DFA haven't been compiled. */
628 if (BE (preg->used == 0 || dfa->init_state == NULL
629 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
630 || dfa->init_state_begbuf == NULL, 0))
631 return REG_NOMATCH;
633 #ifdef DEBUG
634 /* We assume front-end functions already check them. */
635 assert (start + range >= 0 && start + range <= length);
636 #endif
638 /* If initial states with non-begbuf contexts have no elements,
639 the regex must be anchored. If preg->newline_anchor is set,
640 we'll never use init_state_nl, so do not check it. */
641 if (dfa->init_state->nodes.nelem == 0
642 && dfa->init_state_word->nodes.nelem == 0
643 && (dfa->init_state_nl->nodes.nelem == 0
644 || !preg->newline_anchor))
646 if (start != 0 && start + range != 0)
647 return REG_NOMATCH;
648 start = range = 0;
651 /* We must check the longest matching, if nmatch > 0. */
652 fl_longest_match = (nmatch != 0 || dfa->nbackref);
654 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
655 preg->translate, preg->syntax & RE_ICASE, dfa);
656 if (BE (err != REG_NOERROR, 0))
657 goto free_return;
658 mctx.input.stop = stop;
659 mctx.input.raw_stop = stop;
660 mctx.input.newline_anchor = preg->newline_anchor;
662 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
663 if (BE (err != REG_NOERROR, 0))
664 goto free_return;
666 /* We will log all the DFA states through which the dfa pass,
667 if nmatch > 1, or this dfa has "multibyte node", which is a
668 back-reference or a node which can accept multibyte character or
669 multi character collating element. */
670 if (nmatch > 1 || dfa->has_mb_node)
672 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
673 if (BE (mctx.state_log == NULL, 0))
675 err = REG_ESPACE;
676 goto free_return;
679 else
680 mctx.state_log = NULL;
682 match_first = start;
683 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
684 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
686 /* Check incrementally whether of not the input string match. */
687 incr = (range < 0) ? -1 : 1;
688 left_lim = (range < 0) ? start + range : start;
689 right_lim = (range < 0) ? start : start + range;
690 sb = dfa->mb_cur_max == 1;
691 match_kind =
692 (fastmap
693 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
694 | (range >= 0 ? 2 : 0)
695 | (t != NULL ? 1 : 0))
696 : 8);
698 for (;; match_first += incr)
700 err = REG_NOMATCH;
701 if (match_first < left_lim || right_lim < match_first)
702 goto free_return;
704 /* Advance as rapidly as possible through the string, until we
705 find a plausible place to start matching. This may be done
706 with varying efficiency, so there are various possibilities:
707 only the most common of them are specialized, in order to
708 save on code size. We use a switch statement for speed. */
709 switch (match_kind)
711 case 8:
712 /* No fastmap. */
713 break;
715 case 7:
716 /* Fastmap with single-byte translation, match forward. */
717 while (BE (match_first < right_lim, 1)
718 && !fastmap[t[(unsigned char) string[match_first]]])
719 ++match_first;
720 goto forward_match_found_start_or_reached_end;
722 case 6:
723 /* Fastmap without translation, match forward. */
724 while (BE (match_first < right_lim, 1)
725 && !fastmap[(unsigned char) string[match_first]])
726 ++match_first;
728 forward_match_found_start_or_reached_end:
729 if (BE (match_first == right_lim, 0))
731 ch = match_first >= length
732 ? 0 : (unsigned char) string[match_first];
733 if (!fastmap[t ? t[ch] : ch])
734 goto free_return;
736 break;
738 case 4:
739 case 5:
740 /* Fastmap without multi-byte translation, match backwards. */
741 while (match_first >= left_lim)
743 ch = match_first >= length
744 ? 0 : (unsigned char) string[match_first];
745 if (fastmap[t ? t[ch] : ch])
746 break;
747 --match_first;
749 if (match_first < left_lim)
750 goto free_return;
751 break;
753 default:
754 /* In this case, we can't determine easily the current byte,
755 since it might be a component byte of a multibyte
756 character. Then we use the constructed buffer instead. */
757 for (;;)
759 /* If MATCH_FIRST is out of the valid range, reconstruct the
760 buffers. */
761 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
762 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
764 err = re_string_reconstruct (&mctx.input, match_first,
765 eflags);
766 if (BE (err != REG_NOERROR, 0))
767 goto free_return;
769 offset = match_first - mctx.input.raw_mbs_idx;
771 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
772 Note that MATCH_FIRST must not be smaller than 0. */
773 ch = (match_first >= length
774 ? 0 : re_string_byte_at (&mctx.input, offset));
775 if (fastmap[ch])
776 break;
777 match_first += incr;
778 if (match_first < left_lim || match_first > right_lim)
780 err = REG_NOMATCH;
781 goto free_return;
784 break;
787 /* Reconstruct the buffers so that the matcher can assume that
788 the matching starts from the beginning of the buffer. */
789 err = re_string_reconstruct (&mctx.input, match_first, eflags);
790 if (BE (err != REG_NOERROR, 0))
791 goto free_return;
793 #ifdef RE_ENABLE_I18N
794 /* Don't consider this char as a possible match start if it part,
795 yet isn't the head, of a multibyte character. */
796 if (!sb && !re_string_first_byte (&mctx.input, 0))
797 continue;
798 #endif
800 /* It seems to be appropriate one, then use the matcher. */
801 /* We assume that the matching starts from 0. */
802 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
803 match_last = check_matching (&mctx, fl_longest_match,
804 range >= 0 ? &match_first : NULL);
805 if (match_last != -1)
807 if (BE (match_last == -2, 0))
809 err = REG_ESPACE;
810 goto free_return;
812 else
814 mctx.match_last = match_last;
815 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
817 re_dfastate_t *pstate = mctx.state_log[match_last];
818 mctx.last_node = check_halt_state_context (&mctx, pstate,
819 match_last);
821 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
822 || dfa->nbackref)
824 err = prune_impossible_nodes (&mctx);
825 if (err == REG_NOERROR)
826 break;
827 if (BE (err != REG_NOMATCH, 0))
828 goto free_return;
829 match_last = -1;
831 else
832 break; /* We found a match. */
836 match_ctx_clean (&mctx);
839 #ifdef DEBUG
840 assert (match_last != -1);
841 assert (err == REG_NOERROR);
842 #endif
844 /* Set pmatch[] if we need. */
845 if (nmatch > 0)
847 int reg_idx;
849 /* Initialize registers. */
850 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
851 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
853 /* Set the points where matching start/end. */
854 pmatch[0].rm_so = 0;
855 pmatch[0].rm_eo = mctx.match_last;
857 if (!preg->no_sub && nmatch > 1)
859 err = set_regs (preg, &mctx, nmatch, pmatch,
860 dfa->has_plural_match && dfa->nbackref > 0);
861 if (BE (err != REG_NOERROR, 0))
862 goto free_return;
865 /* At last, add the offset to the each registers, since we slided
866 the buffers so that we could assume that the matching starts
867 from 0. */
868 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
869 if (pmatch[reg_idx].rm_so != -1)
871 #ifdef RE_ENABLE_I18N
872 if (BE (mctx.input.offsets_needed != 0, 0))
874 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
875 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
876 else
877 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
878 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
879 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
880 else
881 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
883 #else
884 assert (mctx.input.offsets_needed == 0);
885 #endif
886 pmatch[reg_idx].rm_so += match_first;
887 pmatch[reg_idx].rm_eo += match_first;
889 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
891 pmatch[nmatch + reg_idx].rm_so = -1;
892 pmatch[nmatch + reg_idx].rm_eo = -1;
895 if (dfa->subexp_map)
896 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
897 if (dfa->subexp_map[reg_idx] != reg_idx)
899 pmatch[reg_idx + 1].rm_so
900 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
901 pmatch[reg_idx + 1].rm_eo
902 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
906 free_return:
907 re_free (mctx.state_log);
908 if (dfa->nbackref)
909 match_ctx_free (&mctx);
910 re_string_destruct (&mctx.input);
911 return err;
914 static reg_errcode_t
915 prune_impossible_nodes (mctx)
916 re_match_context_t *mctx;
918 re_dfa_t *const dfa = mctx->dfa;
919 int halt_node, match_last;
920 reg_errcode_t ret;
921 re_dfastate_t **sifted_states;
922 re_dfastate_t **lim_states = NULL;
923 re_sift_context_t sctx;
924 #ifdef DEBUG
925 assert (mctx->state_log != NULL);
926 #endif
927 match_last = mctx->match_last;
928 halt_node = mctx->last_node;
929 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
930 if (BE (sifted_states == NULL, 0))
932 ret = REG_ESPACE;
933 goto free_return;
935 if (dfa->nbackref)
937 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
938 if (BE (lim_states == NULL, 0))
940 ret = REG_ESPACE;
941 goto free_return;
943 while (1)
945 memset (lim_states, '\0',
946 sizeof (re_dfastate_t *) * (match_last + 1));
947 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
948 match_last);
949 ret = sift_states_backward (mctx, &sctx);
950 re_node_set_free (&sctx.limits);
951 if (BE (ret != REG_NOERROR, 0))
952 goto free_return;
953 if (sifted_states[0] != NULL || lim_states[0] != NULL)
954 break;
957 --match_last;
958 if (match_last < 0)
960 ret = REG_NOMATCH;
961 goto free_return;
963 } while (mctx->state_log[match_last] == NULL
964 || !mctx->state_log[match_last]->halt);
965 halt_node = check_halt_state_context (mctx,
966 mctx->state_log[match_last],
967 match_last);
969 ret = merge_state_array (dfa, sifted_states, lim_states,
970 match_last + 1);
971 re_free (lim_states);
972 lim_states = NULL;
973 if (BE (ret != REG_NOERROR, 0))
974 goto free_return;
976 else
978 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
979 ret = sift_states_backward (mctx, &sctx);
980 re_node_set_free (&sctx.limits);
981 if (BE (ret != REG_NOERROR, 0))
982 goto free_return;
984 re_free (mctx->state_log);
985 mctx->state_log = sifted_states;
986 sifted_states = NULL;
987 mctx->last_node = halt_node;
988 mctx->match_last = match_last;
989 ret = REG_NOERROR;
990 free_return:
991 re_free (sifted_states);
992 re_free (lim_states);
993 return ret;
996 /* Acquire an initial state and return it.
997 We must select appropriate initial state depending on the context,
998 since initial states may have constraints like "\<", "^", etc.. */
1000 static inline re_dfastate_t *
1001 acquire_init_state_context (err, mctx, idx)
1002 reg_errcode_t *err;
1003 const re_match_context_t *mctx;
1004 int idx;
1006 re_dfa_t *const dfa = mctx->dfa;
1007 if (dfa->init_state->has_constraint)
1009 unsigned int context;
1010 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1011 if (IS_WORD_CONTEXT (context))
1012 return dfa->init_state_word;
1013 else if (IS_ORDINARY_CONTEXT (context))
1014 return dfa->init_state;
1015 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1016 return dfa->init_state_begbuf;
1017 else if (IS_NEWLINE_CONTEXT (context))
1018 return dfa->init_state_nl;
1019 else if (IS_BEGBUF_CONTEXT (context))
1021 /* It is relatively rare case, then calculate on demand. */
1022 return re_acquire_state_context (err, dfa,
1023 dfa->init_state->entrance_nodes,
1024 context);
1026 else
1027 /* Must not happen? */
1028 return dfa->init_state;
1030 else
1031 return dfa->init_state;
1034 /* Check whether the regular expression match input string INPUT or not,
1035 and return the index where the matching end, return -1 if not match,
1036 or return -2 in case of an error.
1037 FL_LONGEST_MATCH means we want the POSIX longest matching.
1038 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1039 next place where we may want to try matching.
1040 Note that the matcher assume that the maching starts from the current
1041 index of the buffer. */
1043 static int
1044 check_matching (mctx, fl_longest_match, p_match_first)
1045 re_match_context_t *mctx;
1046 int fl_longest_match;
1047 int *p_match_first;
1049 re_dfa_t *const dfa = mctx->dfa;
1050 reg_errcode_t err;
1051 int match = 0;
1052 int match_last = -1;
1053 int cur_str_idx = re_string_cur_idx (&mctx->input);
1054 re_dfastate_t *cur_state;
1055 int at_init_state = p_match_first != NULL;
1056 int next_start_idx = cur_str_idx;
1058 err = REG_NOERROR;
1059 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1060 /* An initial state must not be NULL (invalid). */
1061 if (BE (cur_state == NULL, 0))
1063 assert (err == REG_ESPACE);
1064 return -2;
1067 if (mctx->state_log != NULL)
1069 mctx->state_log[cur_str_idx] = cur_state;
1071 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1072 later. E.g. Processing back references. */
1073 if (BE (dfa->nbackref, 0))
1075 at_init_state = 0;
1076 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1077 if (BE (err != REG_NOERROR, 0))
1078 return err;
1080 if (cur_state->has_backref)
1082 err = transit_state_bkref (mctx, &cur_state->nodes);
1083 if (BE (err != REG_NOERROR, 0))
1084 return err;
1089 /* If the RE accepts NULL string. */
1090 if (BE (cur_state->halt, 0))
1092 if (!cur_state->has_constraint
1093 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1095 if (!fl_longest_match)
1096 return cur_str_idx;
1097 else
1099 match_last = cur_str_idx;
1100 match = 1;
1105 while (!re_string_eoi (&mctx->input))
1107 re_dfastate_t *old_state = cur_state;
1108 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1110 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1111 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1112 && mctx->input.valid_len < mctx->input.len))
1114 err = extend_buffers (mctx);
1115 if (BE (err != REG_NOERROR, 0))
1117 assert (err == REG_ESPACE);
1118 return -2;
1122 cur_state = transit_state (&err, mctx, cur_state);
1123 if (mctx->state_log != NULL)
1124 cur_state = merge_state_with_log (&err, mctx, cur_state);
1126 if (cur_state == NULL)
1128 /* Reached the invalid state or an error. Try to recover a valid
1129 state using the state log, if available and if we have not
1130 already found a valid (even if not the longest) match. */
1131 if (BE (err != REG_NOERROR, 0))
1132 return -2;
1134 if (mctx->state_log == NULL
1135 || (match && !fl_longest_match)
1136 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1137 break;
1140 if (BE (at_init_state, 0))
1142 if (old_state == cur_state)
1143 next_start_idx = next_char_idx;
1144 else
1145 at_init_state = 0;
1148 if (cur_state->halt)
1150 /* Reached a halt state.
1151 Check the halt state can satisfy the current context. */
1152 if (!cur_state->has_constraint
1153 || check_halt_state_context (mctx, cur_state,
1154 re_string_cur_idx (&mctx->input)))
1156 /* We found an appropriate halt state. */
1157 match_last = re_string_cur_idx (&mctx->input);
1158 match = 1;
1160 /* We found a match, do not modify match_first below. */
1161 p_match_first = NULL;
1162 if (!fl_longest_match)
1163 break;
1168 if (p_match_first)
1169 *p_match_first += next_start_idx;
1171 return match_last;
1174 /* Check NODE match the current context. */
1176 static int check_halt_node_context (dfa, node, context)
1177 const re_dfa_t *dfa;
1178 int node;
1179 unsigned int context;
1181 re_token_type_t type = dfa->nodes[node].type;
1182 unsigned int constraint = dfa->nodes[node].constraint;
1183 if (type != END_OF_RE)
1184 return 0;
1185 if (!constraint)
1186 return 1;
1187 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1188 return 0;
1189 return 1;
1192 /* Check the halt state STATE match the current context.
1193 Return 0 if not match, if the node, STATE has, is a halt node and
1194 match the context, return the node. */
1196 static int
1197 check_halt_state_context (mctx, state, idx)
1198 const re_match_context_t *mctx;
1199 const re_dfastate_t *state;
1200 int idx;
1202 int i;
1203 unsigned int context;
1204 #ifdef DEBUG
1205 assert (state->halt);
1206 #endif
1207 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1208 for (i = 0; i < state->nodes.nelem; ++i)
1209 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1210 return state->nodes.elems[i];
1211 return 0;
1214 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1215 corresponding to the DFA).
1216 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1217 of errors. */
1219 static int
1220 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1221 const re_match_context_t *mctx;
1222 regmatch_t *regs;
1223 int nregs, *pidx, node;
1224 re_node_set *eps_via_nodes;
1225 struct re_fail_stack_t *fs;
1227 re_dfa_t *const dfa = mctx->dfa;
1228 int i, err, dest_node;
1229 dest_node = -1;
1230 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1232 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1233 re_node_set *edests = &dfa->edests[node];
1234 int dest_node;
1235 err = re_node_set_insert (eps_via_nodes, node);
1236 if (BE (err < 0, 0))
1237 return -2;
1238 /* Pick up a valid destination, or return -1 if none is found. */
1239 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1241 int candidate = edests->elems[i];
1242 if (!re_node_set_contains (cur_nodes, candidate))
1243 continue;
1244 if (dest_node == -1)
1245 dest_node = candidate;
1247 else
1249 /* In order to avoid infinite loop like "(a*)*", return the second
1250 epsilon-transition if the first was already considered. */
1251 if (re_node_set_contains (eps_via_nodes, dest_node))
1252 return candidate;
1254 /* Otherwise, push the second epsilon-transition on the fail stack. */
1255 else if (fs != NULL
1256 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1257 eps_via_nodes))
1258 return -2;
1260 /* We know we are going to exit. */
1261 break;
1264 return dest_node;
1266 else
1268 int naccepted = 0;
1269 re_token_type_t type = dfa->nodes[node].type;
1271 #ifdef RE_ENABLE_I18N
1272 if (dfa->nodes[node].accept_mb)
1273 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1274 else
1275 #endif /* RE_ENABLE_I18N */
1276 if (type == OP_BACK_REF)
1278 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1279 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1280 if (fs != NULL)
1282 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1283 return -1;
1284 else if (naccepted)
1286 char *buf = (char *) re_string_get_buffer (&mctx->input);
1287 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1288 naccepted) != 0)
1289 return -1;
1293 if (naccepted == 0)
1295 err = re_node_set_insert (eps_via_nodes, node);
1296 if (BE (err < 0, 0))
1297 return -2;
1298 dest_node = dfa->edests[node].elems[0];
1299 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1300 dest_node))
1301 return dest_node;
1305 if (naccepted != 0
1306 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1308 dest_node = dfa->nexts[node];
1309 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1310 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1311 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312 dest_node)))
1313 return -1;
1314 re_node_set_empty (eps_via_nodes);
1315 return dest_node;
1318 return -1;
1321 static reg_errcode_t
1322 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1323 struct re_fail_stack_t *fs;
1324 int str_idx, dest_node, nregs;
1325 regmatch_t *regs;
1326 re_node_set *eps_via_nodes;
1328 reg_errcode_t err;
1329 int num = fs->num++;
1330 if (fs->num == fs->alloc)
1332 struct re_fail_stack_ent_t *new_array;
1333 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1334 * fs->alloc * 2));
1335 if (new_array == NULL)
1336 return REG_ESPACE;
1337 fs->alloc *= 2;
1338 fs->stack = new_array;
1340 fs->stack[num].idx = str_idx;
1341 fs->stack[num].node = dest_node;
1342 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1343 if (fs->stack[num].regs == NULL)
1344 return REG_ESPACE;
1345 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1346 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1347 return err;
1350 static int
1351 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1352 struct re_fail_stack_t *fs;
1353 int *pidx, nregs;
1354 regmatch_t *regs;
1355 re_node_set *eps_via_nodes;
1357 int num = --fs->num;
1358 assert (num >= 0);
1359 *pidx = fs->stack[num].idx;
1360 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1361 re_node_set_free (eps_via_nodes);
1362 re_free (fs->stack[num].regs);
1363 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1364 return fs->stack[num].node;
1367 /* Set the positions where the subexpressions are starts/ends to registers
1368 PMATCH.
1369 Note: We assume that pmatch[0] is already set, and
1370 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1372 static reg_errcode_t
1373 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1374 const regex_t *preg;
1375 const re_match_context_t *mctx;
1376 size_t nmatch;
1377 regmatch_t *pmatch;
1378 int fl_backtrack;
1380 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1381 int idx, cur_node;
1382 re_node_set eps_via_nodes;
1383 struct re_fail_stack_t *fs;
1384 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385 regmatch_t *prev_idx_match;
1387 #ifdef DEBUG
1388 assert (nmatch > 1);
1389 assert (mctx->state_log != NULL);
1390 #endif
1391 if (fl_backtrack)
1393 fs = &fs_body;
1394 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1395 if (fs->stack == NULL)
1396 return REG_ESPACE;
1398 else
1399 fs = NULL;
1401 cur_node = dfa->init_node;
1402 re_node_set_init_empty (&eps_via_nodes);
1404 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1405 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1407 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1409 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1411 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1413 int reg_idx;
1414 if (fs)
1416 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1417 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1418 break;
1419 if (reg_idx == nmatch)
1421 re_node_set_free (&eps_via_nodes);
1422 return free_fail_stack_return (fs);
1424 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1425 &eps_via_nodes);
1427 else
1429 re_node_set_free (&eps_via_nodes);
1430 return REG_NOERROR;
1434 /* Proceed to next node. */
1435 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1436 &eps_via_nodes, fs);
1438 if (BE (cur_node < 0, 0))
1440 if (BE (cur_node == -2, 0))
1442 re_node_set_free (&eps_via_nodes);
1443 free_fail_stack_return (fs);
1444 return REG_ESPACE;
1446 if (fs)
1447 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1448 &eps_via_nodes);
1449 else
1451 re_node_set_free (&eps_via_nodes);
1452 return REG_NOMATCH;
1456 re_node_set_free (&eps_via_nodes);
1457 return free_fail_stack_return (fs);
1460 static reg_errcode_t
1461 free_fail_stack_return (fs)
1462 struct re_fail_stack_t *fs;
1464 if (fs)
1466 int fs_idx;
1467 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1469 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1470 re_free (fs->stack[fs_idx].regs);
1472 re_free (fs->stack);
1474 return REG_NOERROR;
1477 static void
1478 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1479 re_dfa_t *dfa;
1480 regmatch_t *pmatch, *prev_idx_match;
1481 int cur_node, cur_idx, nmatch;
1483 int type = dfa->nodes[cur_node].type;
1484 if (type == OP_OPEN_SUBEXP)
1486 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1488 /* We are at the first node of this sub expression. */
1489 if (reg_num < nmatch)
1491 pmatch[reg_num].rm_so = cur_idx;
1492 pmatch[reg_num].rm_eo = -1;
1495 else if (type == OP_CLOSE_SUBEXP)
1497 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1498 if (reg_num < nmatch)
1500 /* We are at the last node of this sub expression. */
1501 if (pmatch[reg_num].rm_so < cur_idx)
1503 pmatch[reg_num].rm_eo = cur_idx;
1504 /* This is a non-empty match or we are not inside an optional
1505 subexpression. Accept this right away. */
1506 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1508 else
1510 if (dfa->nodes[cur_node].opt_subexp
1511 && prev_idx_match[reg_num].rm_so != -1)
1512 /* We transited through an empty match for an optional
1513 subexpression, like (a?)*, and this is not the subexp's
1514 first match. Copy back the old content of the registers
1515 so that matches of an inner subexpression are undone as
1516 well, like in ((a?))*. */
1517 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1518 else
1519 /* We completed a subexpression, but it may be part of
1520 an optional one, so do not update PREV_IDX_MATCH. */
1521 pmatch[reg_num].rm_eo = cur_idx;
1527 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1528 and sift the nodes in each states according to the following rules.
1529 Updated state_log will be wrote to STATE_LOG.
1531 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1533 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534 the LAST_NODE, we throw away the node `a'.
1535 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1536 string `s' and transit to `b':
1537 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1538 away the node `a'.
1539 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1540 thrown away, we throw away the node `a'.
1541 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1542 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1543 node `a'.
1544 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1545 we throw away the node `a'. */
1547 #define STATE_NODE_CONTAINS(state,node) \
1548 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1550 static reg_errcode_t
1551 sift_states_backward (mctx, sctx)
1552 re_match_context_t *mctx;
1553 re_sift_context_t *sctx;
1555 reg_errcode_t err;
1556 int null_cnt = 0;
1557 int str_idx = sctx->last_str_idx;
1558 re_node_set cur_dest;
1560 #ifdef DEBUG
1561 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1562 #endif
1564 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1565 transit to the last_node and the last_node itself. */
1566 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1567 if (BE (err != REG_NOERROR, 0))
1568 return err;
1569 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1570 if (BE (err != REG_NOERROR, 0))
1571 goto free_return;
1573 /* Then check each states in the state_log. */
1574 while (str_idx > 0)
1576 /* Update counters. */
1577 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1578 if (null_cnt > mctx->max_mb_elem_len)
1580 memset (sctx->sifted_states, '\0',
1581 sizeof (re_dfastate_t *) * str_idx);
1582 re_node_set_free (&cur_dest);
1583 return REG_NOERROR;
1585 re_node_set_empty (&cur_dest);
1586 --str_idx;
1588 if (mctx->state_log[str_idx])
1590 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1591 if (BE (err != REG_NOERROR, 0))
1592 goto free_return;
1595 /* Add all the nodes which satisfy the following conditions:
1596 - It can epsilon transit to a node in CUR_DEST.
1597 - It is in CUR_SRC.
1598 And update state_log. */
1599 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1600 if (BE (err != REG_NOERROR, 0))
1601 goto free_return;
1603 err = REG_NOERROR;
1604 free_return:
1605 re_node_set_free (&cur_dest);
1606 return err;
1609 static reg_errcode_t
1610 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1611 re_match_context_t *mctx;
1612 re_sift_context_t *sctx;
1613 int str_idx;
1614 re_node_set *cur_dest;
1616 re_dfa_t *const dfa = mctx->dfa;
1617 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618 int i;
1620 /* Then build the next sifted state.
1621 We build the next sifted state on `cur_dest', and update
1622 `sifted_states[str_idx]' with `cur_dest'.
1623 Note:
1624 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1625 `cur_src' points the node_set of the old `state_log[str_idx]'
1626 (with the epsilon nodes pre-filtered out). */
1627 for (i = 0; i < cur_src->nelem; i++)
1629 int prev_node = cur_src->elems[i];
1630 int naccepted = 0;
1631 int ret;
1633 #ifdef DEBUG
1634 re_token_type_t type = dfa->nodes[prev_node].type;
1635 assert (!IS_EPSILON_NODE (type));
1636 #endif
1637 #ifdef RE_ENABLE_I18N
1638 /* If the node may accept `multi byte'. */
1639 if (dfa->nodes[prev_node].accept_mb)
1640 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1641 str_idx, sctx->last_str_idx);
1642 #endif /* RE_ENABLE_I18N */
1644 /* We don't check backreferences here.
1645 See update_cur_sifted_state(). */
1646 if (!naccepted
1647 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1648 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1649 dfa->nexts[prev_node]))
1650 naccepted = 1;
1652 if (naccepted == 0)
1653 continue;
1655 if (sctx->limits.nelem)
1657 int to_idx = str_idx + naccepted;
1658 if (check_dst_limits (mctx, &sctx->limits,
1659 dfa->nexts[prev_node], to_idx,
1660 prev_node, str_idx))
1661 continue;
1663 ret = re_node_set_insert (cur_dest, prev_node);
1664 if (BE (ret == -1, 0))
1665 return REG_ESPACE;
1668 return REG_NOERROR;
1671 /* Helper functions. */
1673 static reg_errcode_t
1674 clean_state_log_if_needed (mctx, next_state_log_idx)
1675 re_match_context_t *mctx;
1676 int next_state_log_idx;
1678 int top = mctx->state_log_top;
1680 if (next_state_log_idx >= mctx->input.bufs_len
1681 || (next_state_log_idx >= mctx->input.valid_len
1682 && mctx->input.valid_len < mctx->input.len))
1684 reg_errcode_t err;
1685 err = extend_buffers (mctx);
1686 if (BE (err != REG_NOERROR, 0))
1687 return err;
1690 if (top < next_state_log_idx)
1692 memset (mctx->state_log + top + 1, '\0',
1693 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1694 mctx->state_log_top = next_state_log_idx;
1696 return REG_NOERROR;
1699 static reg_errcode_t
1700 merge_state_array (dfa, dst, src, num)
1701 re_dfa_t *dfa;
1702 re_dfastate_t **dst;
1703 re_dfastate_t **src;
1704 int num;
1706 int st_idx;
1707 reg_errcode_t err;
1708 for (st_idx = 0; st_idx < num; ++st_idx)
1710 if (dst[st_idx] == NULL)
1711 dst[st_idx] = src[st_idx];
1712 else if (src[st_idx] != NULL)
1714 re_node_set merged_set;
1715 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1716 &src[st_idx]->nodes);
1717 if (BE (err != REG_NOERROR, 0))
1718 return err;
1719 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1720 re_node_set_free (&merged_set);
1721 if (BE (err != REG_NOERROR, 0))
1722 return err;
1725 return REG_NOERROR;
1728 static reg_errcode_t
1729 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1730 re_match_context_t *mctx;
1731 re_sift_context_t *sctx;
1732 int str_idx;
1733 re_node_set *dest_nodes;
1735 re_dfa_t *const dfa = mctx->dfa;
1736 reg_errcode_t err;
1737 const re_node_set *candidates;
1738 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1739 : &mctx->state_log[str_idx]->nodes);
1741 if (dest_nodes->nelem == 0)
1742 sctx->sifted_states[str_idx] = NULL;
1743 else
1745 if (candidates)
1747 /* At first, add the nodes which can epsilon transit to a node in
1748 DEST_NODE. */
1749 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1750 if (BE (err != REG_NOERROR, 0))
1751 return err;
1753 /* Then, check the limitations in the current sift_context. */
1754 if (sctx->limits.nelem)
1756 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1757 mctx->bkref_ents, str_idx);
1758 if (BE (err != REG_NOERROR, 0))
1759 return err;
1763 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1764 if (BE (err != REG_NOERROR, 0))
1765 return err;
1768 if (candidates && mctx->state_log[str_idx]->has_backref)
1770 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1774 return REG_NOERROR;
1777 static reg_errcode_t
1778 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1779 re_dfa_t *dfa;
1780 re_node_set *dest_nodes;
1781 const re_node_set *candidates;
1783 reg_errcode_t err = REG_NOERROR;
1784 int i;
1786 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1787 if (BE (err != REG_NOERROR, 0))
1788 return err;
1790 if (!state->inveclosure.alloc)
1792 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1793 if (BE (err != REG_NOERROR, 0))
1794 return REG_ESPACE;
1795 for (i = 0; i < dest_nodes->nelem; i++)
1796 re_node_set_merge (&state->inveclosure,
1797 dfa->inveclosures + dest_nodes->elems[i]);
1799 return re_node_set_add_intersect (dest_nodes, candidates,
1800 &state->inveclosure);
1803 static reg_errcode_t
1804 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1805 re_dfa_t *dfa;
1806 int node;
1807 re_node_set *dest_nodes;
1808 const re_node_set *candidates;
1810 int ecl_idx;
1811 reg_errcode_t err;
1812 re_node_set *inv_eclosure = dfa->inveclosures + node;
1813 re_node_set except_nodes;
1814 re_node_set_init_empty (&except_nodes);
1815 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1817 int cur_node = inv_eclosure->elems[ecl_idx];
1818 if (cur_node == node)
1819 continue;
1820 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1822 int edst1 = dfa->edests[cur_node].elems[0];
1823 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1824 ? dfa->edests[cur_node].elems[1] : -1);
1825 if ((!re_node_set_contains (inv_eclosure, edst1)
1826 && re_node_set_contains (dest_nodes, edst1))
1827 || (edst2 > 0
1828 && !re_node_set_contains (inv_eclosure, edst2)
1829 && re_node_set_contains (dest_nodes, edst2)))
1831 err = re_node_set_add_intersect (&except_nodes, candidates,
1832 dfa->inveclosures + cur_node);
1833 if (BE (err != REG_NOERROR, 0))
1835 re_node_set_free (&except_nodes);
1836 return err;
1841 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1843 int cur_node = inv_eclosure->elems[ecl_idx];
1844 if (!re_node_set_contains (&except_nodes, cur_node))
1846 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1847 re_node_set_remove_at (dest_nodes, idx);
1850 re_node_set_free (&except_nodes);
1851 return REG_NOERROR;
1854 static int
1855 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1856 re_match_context_t *mctx;
1857 re_node_set *limits;
1858 int dst_node, dst_idx, src_node, src_idx;
1860 re_dfa_t *const dfa = mctx->dfa;
1861 int lim_idx, src_pos, dst_pos;
1863 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1864 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1865 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1867 int subexp_idx;
1868 struct re_backref_cache_entry *ent;
1869 ent = mctx->bkref_ents + limits->elems[lim_idx];
1870 subexp_idx = dfa->nodes[ent->node].opr.idx;
1872 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1873 subexp_idx, dst_node, dst_idx,
1874 dst_bkref_idx);
1875 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1876 subexp_idx, src_node, src_idx,
1877 src_bkref_idx);
1879 /* In case of:
1880 <src> <dst> ( <subexp> )
1881 ( <subexp> ) <src> <dst>
1882 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1883 if (src_pos == dst_pos)
1884 continue; /* This is unrelated limitation. */
1885 else
1886 return 1;
1888 return 0;
1891 static int
1892 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1893 re_match_context_t *mctx;
1894 int boundaries, subexp_idx, from_node, bkref_idx;
1896 re_dfa_t *const dfa = mctx->dfa;
1897 re_node_set *eclosures = dfa->eclosures + from_node;
1898 int node_idx;
1900 /* Else, we are on the boundary: examine the nodes on the epsilon
1901 closure. */
1902 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1904 int node = eclosures->elems[node_idx];
1905 switch (dfa->nodes[node].type)
1907 case OP_BACK_REF:
1908 if (bkref_idx != -1)
1910 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1913 int dst, cpos;
1915 if (ent->node != node)
1916 continue;
1918 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1919 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1920 continue;
1922 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1923 OP_CLOSE_SUBEXP cases below. But, if the
1924 destination node is the same node as the source
1925 node, don't recurse because it would cause an
1926 infinite loop: a regex that exhibits this behavior
1927 is ()\1*\1* */
1928 dst = dfa->edests[node].elems[0];
1929 if (dst == from_node)
1931 if (boundaries & 1)
1932 return -1;
1933 else /* if (boundaries & 2) */
1934 return 0;
1937 cpos =
1938 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1939 dst, bkref_idx);
1940 if (cpos == -1 /* && (boundaries & 1) */)
1941 return -1;
1942 if (cpos == 0 && (boundaries & 2))
1943 return 0;
1945 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1947 while (ent++->more);
1949 break;
1951 case OP_OPEN_SUBEXP:
1952 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1953 return -1;
1954 break;
1956 case OP_CLOSE_SUBEXP:
1957 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1958 return 0;
1959 break;
1961 default:
1962 break;
1966 return (boundaries & 2) ? 1 : 0;
1969 static int
1970 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1971 re_match_context_t *mctx;
1972 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1974 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1975 int boundaries;
1977 /* If we are outside the range of the subexpression, return -1 or 1. */
1978 if (str_idx < lim->subexp_from)
1979 return -1;
1981 if (lim->subexp_to < str_idx)
1982 return 1;
1984 /* If we are within the subexpression, return 0. */
1985 boundaries = (str_idx == lim->subexp_from);
1986 boundaries |= (str_idx == lim->subexp_to) << 1;
1987 if (boundaries == 0)
1988 return 0;
1990 /* Else, examine epsilon closure. */
1991 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1992 from_node, bkref_idx);
1995 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1996 which are against limitations from DEST_NODES. */
1998 static reg_errcode_t
1999 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2000 re_dfa_t *dfa;
2001 re_node_set *dest_nodes;
2002 const re_node_set *candidates;
2003 re_node_set *limits;
2004 struct re_backref_cache_entry *bkref_ents;
2005 int str_idx;
2007 reg_errcode_t err;
2008 int node_idx, lim_idx;
2010 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2012 int subexp_idx;
2013 struct re_backref_cache_entry *ent;
2014 ent = bkref_ents + limits->elems[lim_idx];
2016 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2017 continue; /* This is unrelated limitation. */
2019 subexp_idx = dfa->nodes[ent->node].opr.idx;
2020 if (ent->subexp_to == str_idx)
2022 int ops_node = -1;
2023 int cls_node = -1;
2024 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2026 int node = dest_nodes->elems[node_idx];
2027 re_token_type_t type = dfa->nodes[node].type;
2028 if (type == OP_OPEN_SUBEXP
2029 && subexp_idx == dfa->nodes[node].opr.idx)
2030 ops_node = node;
2031 else if (type == OP_CLOSE_SUBEXP
2032 && subexp_idx == dfa->nodes[node].opr.idx)
2033 cls_node = node;
2036 /* Check the limitation of the open subexpression. */
2037 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2038 if (ops_node >= 0)
2040 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2041 candidates);
2042 if (BE (err != REG_NOERROR, 0))
2043 return err;
2046 /* Check the limitation of the close subexpression. */
2047 if (cls_node >= 0)
2048 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2050 int node = dest_nodes->elems[node_idx];
2051 if (!re_node_set_contains (dfa->inveclosures + node,
2052 cls_node)
2053 && !re_node_set_contains (dfa->eclosures + node,
2054 cls_node))
2056 /* It is against this limitation.
2057 Remove it form the current sifted state. */
2058 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2059 candidates);
2060 if (BE (err != REG_NOERROR, 0))
2061 return err;
2062 --node_idx;
2066 else /* (ent->subexp_to != str_idx) */
2068 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2070 int node = dest_nodes->elems[node_idx];
2071 re_token_type_t type = dfa->nodes[node].type;
2072 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2074 if (subexp_idx != dfa->nodes[node].opr.idx)
2075 continue;
2076 /* It is against this limitation.
2077 Remove it form the current sifted state. */
2078 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2079 candidates);
2080 if (BE (err != REG_NOERROR, 0))
2081 return err;
2086 return REG_NOERROR;
2089 static reg_errcode_t
2090 sift_states_bkref (mctx, sctx, str_idx, candidates)
2091 re_match_context_t *mctx;
2092 re_sift_context_t *sctx;
2093 int str_idx;
2094 const re_node_set *candidates;
2096 re_dfa_t *const dfa = mctx->dfa;
2097 reg_errcode_t err;
2098 int node_idx, node;
2099 re_sift_context_t local_sctx;
2100 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2102 if (first_idx == -1)
2103 return REG_NOERROR;
2105 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2107 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2109 int enabled_idx;
2110 re_token_type_t type;
2111 struct re_backref_cache_entry *entry;
2112 node = candidates->elems[node_idx];
2113 type = dfa->nodes[node].type;
2114 /* Avoid infinite loop for the REs like "()\1+". */
2115 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2116 continue;
2117 if (type != OP_BACK_REF)
2118 continue;
2120 entry = mctx->bkref_ents + first_idx;
2121 enabled_idx = first_idx;
2124 int subexp_len, to_idx, dst_node;
2125 re_dfastate_t *cur_state;
2127 if (entry->node != node)
2128 continue;
2129 subexp_len = entry->subexp_to - entry->subexp_from;
2130 to_idx = str_idx + subexp_len;
2131 dst_node = (subexp_len ? dfa->nexts[node]
2132 : dfa->edests[node].elems[0]);
2134 if (to_idx > sctx->last_str_idx
2135 || sctx->sifted_states[to_idx] == NULL
2136 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2137 || check_dst_limits (mctx, &sctx->limits, node,
2138 str_idx, dst_node, to_idx))
2139 continue;
2141 if (local_sctx.sifted_states == NULL)
2143 local_sctx = *sctx;
2144 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2145 if (BE (err != REG_NOERROR, 0))
2146 goto free_return;
2148 local_sctx.last_node = node;
2149 local_sctx.last_str_idx = str_idx;
2150 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2151 if (BE (err < 0, 0))
2153 err = REG_ESPACE;
2154 goto free_return;
2156 cur_state = local_sctx.sifted_states[str_idx];
2157 err = sift_states_backward (mctx, &local_sctx);
2158 if (BE (err != REG_NOERROR, 0))
2159 goto free_return;
2160 if (sctx->limited_states != NULL)
2162 err = merge_state_array (dfa, sctx->limited_states,
2163 local_sctx.sifted_states,
2164 str_idx + 1);
2165 if (BE (err != REG_NOERROR, 0))
2166 goto free_return;
2168 local_sctx.sifted_states[str_idx] = cur_state;
2169 re_node_set_remove (&local_sctx.limits, enabled_idx);
2171 /* mctx->bkref_ents may have changed, reload the pointer. */
2172 entry = mctx->bkref_ents + enabled_idx;
2174 while (enabled_idx++, entry++->more);
2176 err = REG_NOERROR;
2177 free_return:
2178 if (local_sctx.sifted_states != NULL)
2180 re_node_set_free (&local_sctx.limits);
2183 return err;
2187 #ifdef RE_ENABLE_I18N
2188 static int
2189 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2190 const re_match_context_t *mctx;
2191 re_sift_context_t *sctx;
2192 int node_idx, str_idx, max_str_idx;
2194 re_dfa_t *const dfa = mctx->dfa;
2195 int naccepted;
2196 /* Check the node can accept `multi byte'. */
2197 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2198 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2199 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2200 dfa->nexts[node_idx]))
2201 /* The node can't accept the `multi byte', or the
2202 destination was already thrown away, then the node
2203 could't accept the current input `multi byte'. */
2204 naccepted = 0;
2205 /* Otherwise, it is sure that the node could accept
2206 `naccepted' bytes input. */
2207 return naccepted;
2209 #endif /* RE_ENABLE_I18N */
2212 /* Functions for state transition. */
2214 /* Return the next state to which the current state STATE will transit by
2215 accepting the current input byte, and update STATE_LOG if necessary.
2216 If STATE can accept a multibyte char/collating element/back reference
2217 update the destination of STATE_LOG. */
2219 static re_dfastate_t *
2220 transit_state (err, mctx, state)
2221 reg_errcode_t *err;
2222 re_match_context_t *mctx;
2223 re_dfastate_t *state;
2225 re_dfastate_t **trtable;
2226 unsigned char ch;
2228 #ifdef RE_ENABLE_I18N
2229 /* If the current state can accept multibyte. */
2230 if (BE (state->accept_mb, 0))
2232 *err = transit_state_mb (mctx, state);
2233 if (BE (*err != REG_NOERROR, 0))
2234 return NULL;
2236 #endif /* RE_ENABLE_I18N */
2238 /* Then decide the next state with the single byte. */
2239 #if 0
2240 if (0)
2241 /* don't use transition table */
2242 return transit_state_sb (err, mctx, state);
2243 #endif
2245 /* Use transition table */
2246 ch = re_string_fetch_byte (&mctx->input);
2247 for (;;)
2249 trtable = state->trtable;
2250 if (BE (trtable != NULL, 1))
2251 return trtable[ch];
2253 trtable = state->word_trtable;
2254 if (BE (trtable != NULL, 1))
2256 unsigned int context;
2257 context
2258 = re_string_context_at (&mctx->input,
2259 re_string_cur_idx (&mctx->input) - 1,
2260 mctx->eflags);
2261 if (IS_WORD_CONTEXT (context))
2262 return trtable[ch + SBC_MAX];
2263 else
2264 return trtable[ch];
2267 if (!build_trtable (mctx->dfa, state))
2269 *err = REG_ESPACE;
2270 return NULL;
2273 /* Retry, we now have a transition table. */
2277 /* Update the state_log if we need */
2278 re_dfastate_t *
2279 merge_state_with_log (err, mctx, next_state)
2280 reg_errcode_t *err;
2281 re_match_context_t *mctx;
2282 re_dfastate_t *next_state;
2284 re_dfa_t *const dfa = mctx->dfa;
2285 int cur_idx = re_string_cur_idx (&mctx->input);
2287 if (cur_idx > mctx->state_log_top)
2289 mctx->state_log[cur_idx] = next_state;
2290 mctx->state_log_top = cur_idx;
2292 else if (mctx->state_log[cur_idx] == 0)
2294 mctx->state_log[cur_idx] = next_state;
2296 else
2298 re_dfastate_t *pstate;
2299 unsigned int context;
2300 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 the destination of a multibyte char/collating element/
2303 back reference. Then the next state is the union set of
2304 these destinations and the results of the transition table. */
2305 pstate = mctx->state_log[cur_idx];
2306 log_nodes = pstate->entrance_nodes;
2307 if (next_state != NULL)
2309 table_nodes = next_state->entrance_nodes;
2310 *err = re_node_set_init_union (&next_nodes, table_nodes,
2311 log_nodes);
2312 if (BE (*err != REG_NOERROR, 0))
2313 return NULL;
2315 else
2316 next_nodes = *log_nodes;
2317 /* Note: We already add the nodes of the initial state,
2318 then we don't need to add them here. */
2320 context = re_string_context_at (&mctx->input,
2321 re_string_cur_idx (&mctx->input) - 1,
2322 mctx->eflags);
2323 next_state = mctx->state_log[cur_idx]
2324 = re_acquire_state_context (err, dfa, &next_nodes, context);
2325 /* We don't need to check errors here, since the return value of
2326 this function is next_state and ERR is already set. */
2328 if (table_nodes != NULL)
2329 re_node_set_free (&next_nodes);
2332 if (BE (dfa->nbackref, 0) && next_state != NULL)
2334 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 later. We must check them here, since the back references in the
2336 next state might use them. */
2337 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338 cur_idx);
2339 if (BE (*err != REG_NOERROR, 0))
2340 return NULL;
2342 /* If the next state has back references. */
2343 if (next_state->has_backref)
2345 *err = transit_state_bkref (mctx, &next_state->nodes);
2346 if (BE (*err != REG_NOERROR, 0))
2347 return NULL;
2348 next_state = mctx->state_log[cur_idx];
2352 return next_state;
2355 /* Skip bytes in the input that correspond to part of a
2356 multi-byte match, then look in the log for a state
2357 from which to restart matching. */
2358 re_dfastate_t *
2359 find_recover_state (err, mctx)
2360 reg_errcode_t *err;
2361 re_match_context_t *mctx;
2363 re_dfastate_t *cur_state = NULL;
2366 int max = mctx->state_log_top;
2367 int cur_str_idx = re_string_cur_idx (&mctx->input);
2371 if (++cur_str_idx > max)
2372 return NULL;
2373 re_string_skip_bytes (&mctx->input, 1);
2375 while (mctx->state_log[cur_str_idx] == NULL);
2377 cur_state = merge_state_with_log (err, mctx, NULL);
2379 while (err == REG_NOERROR && cur_state == NULL);
2380 return cur_state;
2383 /* Helper functions for transit_state. */
2385 /* From the node set CUR_NODES, pick up the nodes whose types are
2386 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2387 expression. And register them to use them later for evaluating the
2388 correspoding back references. */
2390 static reg_errcode_t
2391 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2392 re_match_context_t *mctx;
2393 re_node_set *cur_nodes;
2394 int str_idx;
2396 re_dfa_t *const dfa = mctx->dfa;
2397 int node_idx;
2398 reg_errcode_t err;
2400 /* TODO: This isn't efficient.
2401 Because there might be more than one nodes whose types are
2402 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2403 nodes.
2404 E.g. RE: (a){2} */
2405 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2407 int node = cur_nodes->elems[node_idx];
2408 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2409 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2410 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2412 err = match_ctx_add_subtop (mctx, node, str_idx);
2413 if (BE (err != REG_NOERROR, 0))
2414 return err;
2417 return REG_NOERROR;
2420 #if 0
2421 /* Return the next state to which the current state STATE will transit by
2422 accepting the current input byte. */
2424 static re_dfastate_t *
2425 transit_state_sb (err, mctx, state)
2426 reg_errcode_t *err;
2427 re_match_context_t *mctx;
2428 re_dfastate_t *state;
2430 re_dfa_t *const dfa = mctx->dfa;
2431 re_node_set next_nodes;
2432 re_dfastate_t *next_state;
2433 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2434 unsigned int context;
2436 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2437 if (BE (*err != REG_NOERROR, 0))
2438 return NULL;
2439 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2441 int cur_node = state->nodes.elems[node_cnt];
2442 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2444 *err = re_node_set_merge (&next_nodes,
2445 dfa->eclosures + dfa->nexts[cur_node]);
2446 if (BE (*err != REG_NOERROR, 0))
2448 re_node_set_free (&next_nodes);
2449 return NULL;
2453 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2454 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2455 /* We don't need to check errors here, since the return value of
2456 this function is next_state and ERR is already set. */
2458 re_node_set_free (&next_nodes);
2459 re_string_skip_bytes (&mctx->input, 1);
2460 return next_state;
2462 #endif
2464 #ifdef RE_ENABLE_I18N
2465 static reg_errcode_t
2466 transit_state_mb (mctx, pstate)
2467 re_match_context_t *mctx;
2468 re_dfastate_t *pstate;
2470 re_dfa_t *const dfa = mctx->dfa;
2471 reg_errcode_t err;
2472 int i;
2474 for (i = 0; i < pstate->nodes.nelem; ++i)
2476 re_node_set dest_nodes, *new_nodes;
2477 int cur_node_idx = pstate->nodes.elems[i];
2478 int naccepted, dest_idx;
2479 unsigned int context;
2480 re_dfastate_t *dest_state;
2482 if (!dfa->nodes[cur_node_idx].accept_mb)
2483 continue;
2485 if (dfa->nodes[cur_node_idx].constraint)
2487 context = re_string_context_at (&mctx->input,
2488 re_string_cur_idx (&mctx->input),
2489 mctx->eflags);
2490 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2491 context))
2492 continue;
2495 /* How many bytes the node can accept? */
2496 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2497 re_string_cur_idx (&mctx->input));
2498 if (naccepted == 0)
2499 continue;
2501 /* The node can accepts `naccepted' bytes. */
2502 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2503 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2504 : mctx->max_mb_elem_len);
2505 err = clean_state_log_if_needed (mctx, dest_idx);
2506 if (BE (err != REG_NOERROR, 0))
2507 return err;
2508 #ifdef DEBUG
2509 assert (dfa->nexts[cur_node_idx] != -1);
2510 #endif
2511 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2513 dest_state = mctx->state_log[dest_idx];
2514 if (dest_state == NULL)
2515 dest_nodes = *new_nodes;
2516 else
2518 err = re_node_set_init_union (&dest_nodes,
2519 dest_state->entrance_nodes, new_nodes);
2520 if (BE (err != REG_NOERROR, 0))
2521 return err;
2523 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2524 mctx->state_log[dest_idx]
2525 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2526 if (dest_state != NULL)
2527 re_node_set_free (&dest_nodes);
2528 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2529 return err;
2531 return REG_NOERROR;
2533 #endif /* RE_ENABLE_I18N */
2535 static reg_errcode_t
2536 transit_state_bkref (mctx, nodes)
2537 re_match_context_t *mctx;
2538 const re_node_set *nodes;
2540 re_dfa_t *const dfa = mctx->dfa;
2541 reg_errcode_t err;
2542 int i;
2543 int cur_str_idx = re_string_cur_idx (&mctx->input);
2545 for (i = 0; i < nodes->nelem; ++i)
2547 int dest_str_idx, prev_nelem, bkc_idx;
2548 int node_idx = nodes->elems[i];
2549 unsigned int context;
2550 const re_token_t *node = dfa->nodes + node_idx;
2551 re_node_set *new_dest_nodes;
2553 /* Check whether `node' is a backreference or not. */
2554 if (node->type != OP_BACK_REF)
2555 continue;
2557 if (node->constraint)
2559 context = re_string_context_at (&mctx->input, cur_str_idx,
2560 mctx->eflags);
2561 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2562 continue;
2565 /* `node' is a backreference.
2566 Check the substring which the substring matched. */
2567 bkc_idx = mctx->nbkref_ents;
2568 err = get_subexp (mctx, node_idx, cur_str_idx);
2569 if (BE (err != REG_NOERROR, 0))
2570 goto free_return;
2572 /* And add the epsilon closures (which is `new_dest_nodes') of
2573 the backreference to appropriate state_log. */
2574 #ifdef DEBUG
2575 assert (dfa->nexts[node_idx] != -1);
2576 #endif
2577 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2579 int subexp_len;
2580 re_dfastate_t *dest_state;
2581 struct re_backref_cache_entry *bkref_ent;
2582 bkref_ent = mctx->bkref_ents + bkc_idx;
2583 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2584 continue;
2585 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2586 new_dest_nodes = (subexp_len == 0
2587 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2588 : dfa->eclosures + dfa->nexts[node_idx]);
2589 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2590 - bkref_ent->subexp_from);
2591 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2592 mctx->eflags);
2593 dest_state = mctx->state_log[dest_str_idx];
2594 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2595 : mctx->state_log[cur_str_idx]->nodes.nelem);
2596 /* Add `new_dest_node' to state_log. */
2597 if (dest_state == NULL)
2599 mctx->state_log[dest_str_idx]
2600 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2601 context);
2602 if (BE (mctx->state_log[dest_str_idx] == NULL
2603 && err != REG_NOERROR, 0))
2604 goto free_return;
2606 else
2608 re_node_set dest_nodes;
2609 err = re_node_set_init_union (&dest_nodes,
2610 dest_state->entrance_nodes,
2611 new_dest_nodes);
2612 if (BE (err != REG_NOERROR, 0))
2614 re_node_set_free (&dest_nodes);
2615 goto free_return;
2617 mctx->state_log[dest_str_idx]
2618 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2619 re_node_set_free (&dest_nodes);
2620 if (BE (mctx->state_log[dest_str_idx] == NULL
2621 && err != REG_NOERROR, 0))
2622 goto free_return;
2624 /* We need to check recursively if the backreference can epsilon
2625 transit. */
2626 if (subexp_len == 0
2627 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2629 err = check_subexp_matching_top (mctx, new_dest_nodes,
2630 cur_str_idx);
2631 if (BE (err != REG_NOERROR, 0))
2632 goto free_return;
2633 err = transit_state_bkref (mctx, new_dest_nodes);
2634 if (BE (err != REG_NOERROR, 0))
2635 goto free_return;
2639 err = REG_NOERROR;
2640 free_return:
2641 return err;
2644 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2645 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2646 Note that we might collect inappropriate candidates here.
2647 However, the cost of checking them strictly here is too high, then we
2648 delay these checking for prune_impossible_nodes(). */
2650 static reg_errcode_t
2651 get_subexp (mctx, bkref_node, bkref_str_idx)
2652 re_match_context_t *mctx;
2653 int bkref_node, bkref_str_idx;
2655 re_dfa_t *const dfa = mctx->dfa;
2656 int subexp_num, sub_top_idx;
2657 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2658 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2659 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2660 if (cache_idx != -1)
2662 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2664 if (entry->node == bkref_node)
2665 return REG_NOERROR; /* We already checked it. */
2666 while (entry++->more);
2669 subexp_num = dfa->nodes[bkref_node].opr.idx;
2671 /* For each sub expression */
2672 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2674 reg_errcode_t err;
2675 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2676 re_sub_match_last_t *sub_last;
2677 int sub_last_idx, sl_str, bkref_str_off;
2679 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2680 continue; /* It isn't related. */
2682 sl_str = sub_top->str_idx;
2683 bkref_str_off = bkref_str_idx;
2684 /* At first, check the last node of sub expressions we already
2685 evaluated. */
2686 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2688 int sl_str_diff;
2689 sub_last = sub_top->lasts[sub_last_idx];
2690 sl_str_diff = sub_last->str_idx - sl_str;
2691 /* The matched string by the sub expression match with the substring
2692 at the back reference? */
2693 if (sl_str_diff > 0)
2695 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2697 /* Not enough chars for a successful match. */
2698 if (bkref_str_off + sl_str_diff > mctx->input.len)
2699 break;
2701 err = clean_state_log_if_needed (mctx,
2702 bkref_str_off
2703 + sl_str_diff);
2704 if (BE (err != REG_NOERROR, 0))
2705 return err;
2706 buf = (const char *) re_string_get_buffer (&mctx->input);
2708 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2709 break; /* We don't need to search this sub expression any more. */
2711 bkref_str_off += sl_str_diff;
2712 sl_str += sl_str_diff;
2713 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2714 bkref_str_idx);
2716 /* Reload buf, since the preceding call might have reallocated
2717 the buffer. */
2718 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 if (err == REG_NOMATCH)
2721 continue;
2722 if (BE (err != REG_NOERROR, 0))
2723 return err;
2726 if (sub_last_idx < sub_top->nlasts)
2727 continue;
2728 if (sub_last_idx > 0)
2729 ++sl_str;
2730 /* Then, search for the other last nodes of the sub expression. */
2731 for (; sl_str <= bkref_str_idx; ++sl_str)
2733 int cls_node, sl_str_off;
2734 const re_node_set *nodes;
2735 sl_str_off = sl_str - sub_top->str_idx;
2736 /* The matched string by the sub expression match with the substring
2737 at the back reference? */
2738 if (sl_str_off > 0)
2740 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2742 /* If we are at the end of the input, we cannot match. */
2743 if (bkref_str_off >= mctx->input.len)
2744 break;
2746 err = extend_buffers (mctx);
2747 if (BE (err != REG_NOERROR, 0))
2748 return err;
2750 buf = (const char *) re_string_get_buffer (&mctx->input);
2752 if (buf [bkref_str_off++] != buf[sl_str - 1])
2753 break; /* We don't need to search this sub expression
2754 any more. */
2756 if (mctx->state_log[sl_str] == NULL)
2757 continue;
2758 /* Does this state have a ')' of the sub expression? */
2759 nodes = &mctx->state_log[sl_str]->nodes;
2760 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2761 if (cls_node == -1)
2762 continue; /* No. */
2763 if (sub_top->path == NULL)
2765 sub_top->path = calloc (sizeof (state_array_t),
2766 sl_str - sub_top->str_idx + 1);
2767 if (sub_top->path == NULL)
2768 return REG_ESPACE;
2770 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2771 in the current context? */
2772 err = check_arrival (mctx, sub_top->path, sub_top->node,
2773 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2774 if (err == REG_NOMATCH)
2775 continue;
2776 if (BE (err != REG_NOERROR, 0))
2777 return err;
2778 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2779 if (BE (sub_last == NULL, 0))
2780 return REG_ESPACE;
2781 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2782 bkref_str_idx);
2783 if (err == REG_NOMATCH)
2784 continue;
2787 return REG_NOERROR;
2790 /* Helper functions for get_subexp(). */
2792 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2793 If it can arrive, register the sub expression expressed with SUB_TOP
2794 and SUB_LAST. */
2796 static reg_errcode_t
2797 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2798 re_match_context_t *mctx;
2799 const re_sub_match_top_t *sub_top;
2800 re_sub_match_last_t *sub_last;
2801 int bkref_node, bkref_str;
2803 reg_errcode_t err;
2804 int to_idx;
2805 /* Can the subexpression arrive the back reference? */
2806 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2807 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2808 if (err != REG_NOERROR)
2809 return err;
2810 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2811 sub_last->str_idx);
2812 if (BE (err != REG_NOERROR, 0))
2813 return err;
2814 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2815 return clean_state_log_if_needed (mctx, to_idx);
2818 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2819 Search '(' if FL_OPEN, or search ')' otherwise.
2820 TODO: This function isn't efficient...
2821 Because there might be more than one nodes whose types are
2822 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2823 nodes.
2824 E.g. RE: (a){2} */
2826 static int
2827 find_subexp_node (dfa, nodes, subexp_idx, type)
2828 const re_dfa_t *dfa;
2829 const re_node_set *nodes;
2830 int subexp_idx, type;
2832 int cls_idx;
2833 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2835 int cls_node = nodes->elems[cls_idx];
2836 const re_token_t *node = dfa->nodes + cls_node;
2837 if (node->type == type
2838 && node->opr.idx == subexp_idx)
2839 return cls_node;
2841 return -1;
2844 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2845 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2846 heavily reused.
2847 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2849 static reg_errcode_t
2850 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2851 type)
2852 re_match_context_t *mctx;
2853 state_array_t *path;
2854 int top_node, top_str, last_node, last_str, type;
2856 re_dfa_t *const dfa = mctx->dfa;
2857 reg_errcode_t err;
2858 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2859 re_dfastate_t *cur_state = NULL;
2860 re_node_set *cur_nodes, next_nodes;
2861 re_dfastate_t **backup_state_log;
2862 unsigned int context;
2864 subexp_num = dfa->nodes[top_node].opr.idx;
2865 /* Extend the buffer if we need. */
2866 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2868 re_dfastate_t **new_array;
2869 int old_alloc = path->alloc;
2870 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2871 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2872 if (new_array == NULL)
2874 path->alloc = old_alloc;
2875 return REG_ESPACE;
2877 path->array = new_array;
2878 memset (new_array + old_alloc, '\0',
2879 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2882 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2884 /* Temporary modify MCTX. */
2885 backup_state_log = mctx->state_log;
2886 backup_cur_idx = mctx->input.cur_idx;
2887 mctx->state_log = path->array;
2888 mctx->input.cur_idx = str_idx;
2890 /* Setup initial node set. */
2891 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2892 if (str_idx == top_str)
2894 err = re_node_set_init_1 (&next_nodes, top_node);
2895 if (BE (err != REG_NOERROR, 0))
2896 return err;
2897 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2898 if (BE (err != REG_NOERROR, 0))
2900 re_node_set_free (&next_nodes);
2901 return err;
2904 else
2906 cur_state = mctx->state_log[str_idx];
2907 if (cur_state && cur_state->has_backref)
2909 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2910 if (BE ( err != REG_NOERROR, 0))
2911 return err;
2913 else
2914 re_node_set_init_empty (&next_nodes);
2916 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2918 if (next_nodes.nelem)
2920 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2921 subexp_num, type);
2922 if (BE ( err != REG_NOERROR, 0))
2924 re_node_set_free (&next_nodes);
2925 return err;
2928 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2929 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2931 re_node_set_free (&next_nodes);
2932 return err;
2934 mctx->state_log[str_idx] = cur_state;
2937 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2939 re_node_set_empty (&next_nodes);
2940 if (mctx->state_log[str_idx + 1])
2942 err = re_node_set_merge (&next_nodes,
2943 &mctx->state_log[str_idx + 1]->nodes);
2944 if (BE (err != REG_NOERROR, 0))
2946 re_node_set_free (&next_nodes);
2947 return err;
2950 if (cur_state)
2952 err = check_arrival_add_next_nodes (mctx, str_idx,
2953 &cur_state->non_eps_nodes, &next_nodes);
2954 if (BE (err != REG_NOERROR, 0))
2956 re_node_set_free (&next_nodes);
2957 return err;
2960 ++str_idx;
2961 if (next_nodes.nelem)
2963 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2964 if (BE (err != REG_NOERROR, 0))
2966 re_node_set_free (&next_nodes);
2967 return err;
2969 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970 subexp_num, type);
2971 if (BE ( err != REG_NOERROR, 0))
2973 re_node_set_free (&next_nodes);
2974 return err;
2977 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2978 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2979 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2981 re_node_set_free (&next_nodes);
2982 return err;
2984 mctx->state_log[str_idx] = cur_state;
2985 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2987 re_node_set_free (&next_nodes);
2988 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2989 : &mctx->state_log[last_str]->nodes);
2990 path->next_idx = str_idx;
2992 /* Fix MCTX. */
2993 mctx->state_log = backup_state_log;
2994 mctx->input.cur_idx = backup_cur_idx;
2996 /* Then check the current node set has the node LAST_NODE. */
2997 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2998 return REG_NOERROR;
3000 return REG_NOMATCH;
3003 /* Helper functions for check_arrival. */
3005 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3006 to NEXT_NODES.
3007 TODO: This function is similar to the functions transit_state*(),
3008 however this function has many additional works.
3009 Can't we unify them? */
3011 static reg_errcode_t
3012 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3013 re_match_context_t *mctx;
3014 int str_idx;
3015 re_node_set *cur_nodes, *next_nodes;
3017 re_dfa_t *const dfa = mctx->dfa;
3018 int result;
3019 int cur_idx;
3020 reg_errcode_t err;
3021 re_node_set union_set;
3022 re_node_set_init_empty (&union_set);
3023 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3025 int naccepted = 0;
3026 int cur_node = cur_nodes->elems[cur_idx];
3027 #ifdef DEBUG
3028 re_token_type_t type = dfa->nodes[cur_node].type;
3029 assert (!IS_EPSILON_NODE (type));
3030 #endif
3031 #ifdef RE_ENABLE_I18N
3032 /* If the node may accept `multi byte'. */
3033 if (dfa->nodes[cur_node].accept_mb)
3035 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3036 str_idx);
3037 if (naccepted > 1)
3039 re_dfastate_t *dest_state;
3040 int next_node = dfa->nexts[cur_node];
3041 int next_idx = str_idx + naccepted;
3042 dest_state = mctx->state_log[next_idx];
3043 re_node_set_empty (&union_set);
3044 if (dest_state)
3046 err = re_node_set_merge (&union_set, &dest_state->nodes);
3047 if (BE (err != REG_NOERROR, 0))
3049 re_node_set_free (&union_set);
3050 return err;
3053 result = re_node_set_insert (&union_set, next_node);
3054 if (BE (result < 0, 0))
3056 re_node_set_free (&union_set);
3057 return REG_ESPACE;
3059 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3060 &union_set);
3061 if (BE (mctx->state_log[next_idx] == NULL
3062 && err != REG_NOERROR, 0))
3064 re_node_set_free (&union_set);
3065 return err;
3069 #endif /* RE_ENABLE_I18N */
3070 if (naccepted
3071 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3073 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3074 if (BE (result < 0, 0))
3076 re_node_set_free (&union_set);
3077 return REG_ESPACE;
3081 re_node_set_free (&union_set);
3082 return REG_NOERROR;
3085 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3086 CUR_NODES, however exclude the nodes which are:
3087 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3088 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3091 static reg_errcode_t
3092 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3093 re_dfa_t *dfa;
3094 re_node_set *cur_nodes;
3095 int ex_subexp, type;
3097 reg_errcode_t err;
3098 int idx, outside_node;
3099 re_node_set new_nodes;
3100 #ifdef DEBUG
3101 assert (cur_nodes->nelem);
3102 #endif
3103 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3104 if (BE (err != REG_NOERROR, 0))
3105 return err;
3106 /* Create a new node set NEW_NODES with the nodes which are epsilon
3107 closures of the node in CUR_NODES. */
3109 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3111 int cur_node = cur_nodes->elems[idx];
3112 re_node_set *eclosure = dfa->eclosures + cur_node;
3113 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3114 if (outside_node == -1)
3116 /* There are no problematic nodes, just merge them. */
3117 err = re_node_set_merge (&new_nodes, eclosure);
3118 if (BE (err != REG_NOERROR, 0))
3120 re_node_set_free (&new_nodes);
3121 return err;
3124 else
3126 /* There are problematic nodes, re-calculate incrementally. */
3127 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3128 ex_subexp, type);
3129 if (BE (err != REG_NOERROR, 0))
3131 re_node_set_free (&new_nodes);
3132 return err;
3136 re_node_set_free (cur_nodes);
3137 *cur_nodes = new_nodes;
3138 return REG_NOERROR;
3141 /* Helper function for check_arrival_expand_ecl.
3142 Check incrementally the epsilon closure of TARGET, and if it isn't
3143 problematic append it to DST_NODES. */
3145 static reg_errcode_t
3146 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3147 re_dfa_t *dfa;
3148 int target, ex_subexp, type;
3149 re_node_set *dst_nodes;
3151 int cur_node;
3152 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3154 int err;
3156 if (dfa->nodes[cur_node].type == type
3157 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3159 if (type == OP_CLOSE_SUBEXP)
3161 err = re_node_set_insert (dst_nodes, cur_node);
3162 if (BE (err == -1, 0))
3163 return REG_ESPACE;
3165 break;
3167 err = re_node_set_insert (dst_nodes, cur_node);
3168 if (BE (err == -1, 0))
3169 return REG_ESPACE;
3170 if (dfa->edests[cur_node].nelem == 0)
3171 break;
3172 if (dfa->edests[cur_node].nelem == 2)
3174 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3175 dfa->edests[cur_node].elems[1],
3176 ex_subexp, type);
3177 if (BE (err != REG_NOERROR, 0))
3178 return err;
3180 cur_node = dfa->edests[cur_node].elems[0];
3182 return REG_NOERROR;
3186 /* For all the back references in the current state, calculate the
3187 destination of the back references by the appropriate entry
3188 in MCTX->BKREF_ENTS. */
3190 static reg_errcode_t
3191 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3192 type)
3193 re_match_context_t *mctx;
3194 int cur_str, subexp_num, type;
3195 re_node_set *cur_nodes;
3197 re_dfa_t *const dfa = mctx->dfa;
3198 reg_errcode_t err;
3199 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3200 struct re_backref_cache_entry *ent;
3202 if (cache_idx_start == -1)
3203 return REG_NOERROR;
3205 restart:
3206 ent = mctx->bkref_ents + cache_idx_start;
3209 int to_idx, next_node;
3211 /* Is this entry ENT is appropriate? */
3212 if (!re_node_set_contains (cur_nodes, ent->node))
3213 continue; /* No. */
3215 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3216 /* Calculate the destination of the back reference, and append it
3217 to MCTX->STATE_LOG. */
3218 if (to_idx == cur_str)
3220 /* The backreference did epsilon transit, we must re-check all the
3221 node in the current state. */
3222 re_node_set new_dests;
3223 reg_errcode_t err2, err3;
3224 next_node = dfa->edests[ent->node].elems[0];
3225 if (re_node_set_contains (cur_nodes, next_node))
3226 continue;
3227 err = re_node_set_init_1 (&new_dests, next_node);
3228 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3229 err3 = re_node_set_merge (cur_nodes, &new_dests);
3230 re_node_set_free (&new_dests);
3231 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3232 || err3 != REG_NOERROR, 0))
3234 err = (err != REG_NOERROR ? err
3235 : (err2 != REG_NOERROR ? err2 : err3));
3236 return err;
3238 /* TODO: It is still inefficient... */
3239 goto restart;
3241 else
3243 re_node_set union_set;
3244 next_node = dfa->nexts[ent->node];
3245 if (mctx->state_log[to_idx])
3247 int ret;
3248 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3249 next_node))
3250 continue;
3251 err = re_node_set_init_copy (&union_set,
3252 &mctx->state_log[to_idx]->nodes);
3253 ret = re_node_set_insert (&union_set, next_node);
3254 if (BE (err != REG_NOERROR || ret < 0, 0))
3256 re_node_set_free (&union_set);
3257 err = err != REG_NOERROR ? err : REG_ESPACE;
3258 return err;
3261 else
3263 err = re_node_set_init_1 (&union_set, next_node);
3264 if (BE (err != REG_NOERROR, 0))
3265 return err;
3267 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3268 re_node_set_free (&union_set);
3269 if (BE (mctx->state_log[to_idx] == NULL
3270 && err != REG_NOERROR, 0))
3271 return err;
3274 while (ent++->more);
3275 return REG_NOERROR;
3278 /* Build transition table for the state.
3279 Return 1 if succeeded, otherwise return NULL. */
3281 static int
3282 build_trtable (dfa, state)
3283 re_dfa_t *dfa;
3284 re_dfastate_t *state;
3286 reg_errcode_t err;
3287 int i, j, ch, need_word_trtable = 0;
3288 unsigned int elem, mask;
3289 int dests_node_malloced = 0, dest_states_malloced = 0;
3290 int ndests; /* Number of the destination states from `state'. */
3291 re_dfastate_t **trtable;
3292 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3293 re_node_set follows, *dests_node;
3294 bitset *dests_ch;
3295 bitset acceptable;
3297 /* We build DFA states which corresponds to the destination nodes
3298 from `state'. `dests_node[i]' represents the nodes which i-th
3299 destination state contains, and `dests_ch[i]' represents the
3300 characters which i-th destination state accepts. */
3301 #ifdef _LIBC
3302 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3303 dests_node = (re_node_set *)
3304 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3305 else
3306 #endif
3308 dests_node = (re_node_set *)
3309 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3310 if (BE (dests_node == NULL, 0))
3311 return 0;
3312 dests_node_malloced = 1;
3314 dests_ch = (bitset *) (dests_node + SBC_MAX);
3316 /* Initialize transiton table. */
3317 state->word_trtable = state->trtable = NULL;
3319 /* At first, group all nodes belonging to `state' into several
3320 destinations. */
3321 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3322 if (BE (ndests <= 0, 0))
3324 if (dests_node_malloced)
3325 free (dests_node);
3326 /* Return 0 in case of an error, 1 otherwise. */
3327 if (ndests == 0)
3329 state->trtable = (re_dfastate_t **)
3330 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3331 return 1;
3333 return 0;
3336 err = re_node_set_alloc (&follows, ndests + 1);
3337 if (BE (err != REG_NOERROR, 0))
3338 goto out_free;
3340 #ifdef _LIBC
3341 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3342 + ndests * 3 * sizeof (re_dfastate_t *)))
3343 dest_states = (re_dfastate_t **)
3344 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3345 else
3346 #endif
3348 dest_states = (re_dfastate_t **)
3349 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3350 if (BE (dest_states == NULL, 0))
3352 out_free:
3353 if (dest_states_malloced)
3354 free (dest_states);
3355 re_node_set_free (&follows);
3356 for (i = 0; i < ndests; ++i)
3357 re_node_set_free (dests_node + i);
3358 if (dests_node_malloced)
3359 free (dests_node);
3360 return 0;
3362 dest_states_malloced = 1;
3364 dest_states_word = dest_states + ndests;
3365 dest_states_nl = dest_states_word + ndests;
3366 bitset_empty (acceptable);
3368 /* Then build the states for all destinations. */
3369 for (i = 0; i < ndests; ++i)
3371 int next_node;
3372 re_node_set_empty (&follows);
3373 /* Merge the follows of this destination states. */
3374 for (j = 0; j < dests_node[i].nelem; ++j)
3376 next_node = dfa->nexts[dests_node[i].elems[j]];
3377 if (next_node != -1)
3379 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3380 if (BE (err != REG_NOERROR, 0))
3381 goto out_free;
3384 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3385 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3386 goto out_free;
3387 /* If the new state has context constraint,
3388 build appropriate states for these contexts. */
3389 if (dest_states[i]->has_constraint)
3391 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3392 CONTEXT_WORD);
3393 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3394 goto out_free;
3396 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3397 need_word_trtable = 1;
3399 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3400 CONTEXT_NEWLINE);
3401 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3402 goto out_free;
3404 else
3406 dest_states_word[i] = dest_states[i];
3407 dest_states_nl[i] = dest_states[i];
3409 bitset_merge (acceptable, dests_ch[i]);
3412 if (!BE (need_word_trtable, 0))
3414 /* We don't care about whether the following character is a word
3415 character, or we are in a single-byte character set so we can
3416 discern by looking at the character code: allocate a
3417 256-entry transition table. */
3418 trtable = state->trtable =
3419 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3420 if (BE (trtable == NULL, 0))
3421 goto out_free;
3423 /* For all characters ch...: */
3424 for (i = 0; i < BITSET_UINTS; ++i)
3425 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3426 elem;
3427 mask <<= 1, elem >>= 1, ++ch)
3428 if (BE (elem & 1, 0))
3430 /* There must be exactly one destination which accepts
3431 character ch. See group_nodes_into_DFAstates. */
3432 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3435 /* j-th destination accepts the word character ch. */
3436 if (dfa->word_char[i] & mask)
3437 trtable[ch] = dest_states_word[j];
3438 else
3439 trtable[ch] = dest_states[j];
3442 else
3444 /* We care about whether the following character is a word
3445 character, and we are in a multi-byte character set: discern
3446 by looking at the character code: build two 256-entry
3447 transition tables, one starting at trtable[0] and one
3448 starting at trtable[SBC_MAX]. */
3449 trtable = state->word_trtable =
3450 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3451 if (BE (trtable == NULL, 0))
3452 goto out_free;
3454 /* For all characters ch...: */
3455 for (i = 0; i < BITSET_UINTS; ++i)
3456 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3457 elem;
3458 mask <<= 1, elem >>= 1, ++ch)
3459 if (BE (elem & 1, 0))
3461 /* There must be exactly one destination which accepts
3462 character ch. See group_nodes_into_DFAstates. */
3463 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3466 /* j-th destination accepts the word character ch. */
3467 trtable[ch] = dest_states[j];
3468 trtable[ch + SBC_MAX] = dest_states_word[j];
3472 /* new line */
3473 if (bitset_contain (acceptable, NEWLINE_CHAR))
3475 /* The current state accepts newline character. */
3476 for (j = 0; j < ndests; ++j)
3477 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3479 /* k-th destination accepts newline character. */
3480 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3481 if (need_word_trtable)
3482 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3483 /* There must be only one destination which accepts
3484 newline. See group_nodes_into_DFAstates. */
3485 break;
3489 if (dest_states_malloced)
3490 free (dest_states);
3492 re_node_set_free (&follows);
3493 for (i = 0; i < ndests; ++i)
3494 re_node_set_free (dests_node + i);
3496 if (dests_node_malloced)
3497 free (dests_node);
3499 return 1;
3502 /* Group all nodes belonging to STATE into several destinations.
3503 Then for all destinations, set the nodes belonging to the destination
3504 to DESTS_NODE[i] and set the characters accepted by the destination
3505 to DEST_CH[i]. This function return the number of destinations. */
3507 static int
3508 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3509 re_dfa_t *dfa;
3510 const re_dfastate_t *state;
3511 re_node_set *dests_node;
3512 bitset *dests_ch;
3514 reg_errcode_t err;
3515 int result;
3516 int i, j, k;
3517 int ndests; /* Number of the destinations from `state'. */
3518 bitset accepts; /* Characters a node can accept. */
3519 const re_node_set *cur_nodes = &state->nodes;
3520 bitset_empty (accepts);
3521 ndests = 0;
3523 /* For all the nodes belonging to `state', */
3524 for (i = 0; i < cur_nodes->nelem; ++i)
3526 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527 re_token_type_t type = node->type;
3528 unsigned int constraint = node->constraint;
3530 /* Enumerate all single byte character this node can accept. */
3531 if (type == CHARACTER)
3532 bitset_set (accepts, node->opr.c);
3533 else if (type == SIMPLE_BRACKET)
3535 bitset_merge (accepts, node->opr.sbcset);
3537 else if (type == OP_PERIOD)
3539 #ifdef RE_ENABLE_I18N
3540 if (dfa->mb_cur_max > 1)
3541 bitset_merge (accepts, dfa->sb_char);
3542 else
3543 #endif
3544 bitset_set_all (accepts);
3545 if (!(dfa->syntax & RE_DOT_NEWLINE))
3546 bitset_clear (accepts, '\n');
3547 if (dfa->syntax & RE_DOT_NOT_NULL)
3548 bitset_clear (accepts, '\0');
3550 #ifdef RE_ENABLE_I18N
3551 else if (type == OP_UTF8_PERIOD)
3553 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3554 if (!(dfa->syntax & RE_DOT_NEWLINE))
3555 bitset_clear (accepts, '\n');
3556 if (dfa->syntax & RE_DOT_NOT_NULL)
3557 bitset_clear (accepts, '\0');
3559 #endif
3560 else
3561 continue;
3563 /* Check the `accepts' and sift the characters which are not
3564 match it the context. */
3565 if (constraint)
3567 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3569 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3570 bitset_empty (accepts);
3571 if (accepts_newline)
3572 bitset_set (accepts, NEWLINE_CHAR);
3573 else
3574 continue;
3576 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3578 bitset_empty (accepts);
3579 continue;
3582 if (constraint & NEXT_WORD_CONSTRAINT)
3584 unsigned int any_set = 0;
3585 if (type == CHARACTER && !node->word_char)
3587 bitset_empty (accepts);
3588 continue;
3590 #ifdef RE_ENABLE_I18N
3591 if (dfa->mb_cur_max > 1)
3592 for (j = 0; j < BITSET_UINTS; ++j)
3593 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3594 else
3595 #endif
3596 for (j = 0; j < BITSET_UINTS; ++j)
3597 any_set |= (accepts[j] &= dfa->word_char[j]);
3598 if (!any_set)
3599 continue;
3601 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3603 unsigned int any_set = 0;
3604 if (type == CHARACTER && node->word_char)
3606 bitset_empty (accepts);
3607 continue;
3609 #ifdef RE_ENABLE_I18N
3610 if (dfa->mb_cur_max > 1)
3611 for (j = 0; j < BITSET_UINTS; ++j)
3612 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3613 else
3614 #endif
3615 for (j = 0; j < BITSET_UINTS; ++j)
3616 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3617 if (!any_set)
3618 continue;
3622 /* Then divide `accepts' into DFA states, or create a new
3623 state. Above, we make sure that accepts is not empty. */
3624 for (j = 0; j < ndests; ++j)
3626 bitset intersec; /* Intersection sets, see below. */
3627 bitset remains;
3628 /* Flags, see below. */
3629 int has_intersec, not_subset, not_consumed;
3631 /* Optimization, skip if this state doesn't accept the character. */
3632 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3633 continue;
3635 /* Enumerate the intersection set of this state and `accepts'. */
3636 has_intersec = 0;
3637 for (k = 0; k < BITSET_UINTS; ++k)
3638 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3639 /* And skip if the intersection set is empty. */
3640 if (!has_intersec)
3641 continue;
3643 /* Then check if this state is a subset of `accepts'. */
3644 not_subset = not_consumed = 0;
3645 for (k = 0; k < BITSET_UINTS; ++k)
3647 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3648 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3651 /* If this state isn't a subset of `accepts', create a
3652 new group state, which has the `remains'. */
3653 if (not_subset)
3655 bitset_copy (dests_ch[ndests], remains);
3656 bitset_copy (dests_ch[j], intersec);
3657 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3658 if (BE (err != REG_NOERROR, 0))
3659 goto error_return;
3660 ++ndests;
3663 /* Put the position in the current group. */
3664 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3665 if (BE (result < 0, 0))
3666 goto error_return;
3668 /* If all characters are consumed, go to next node. */
3669 if (!not_consumed)
3670 break;
3672 /* Some characters remain, create a new group. */
3673 if (j == ndests)
3675 bitset_copy (dests_ch[ndests], accepts);
3676 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3677 if (BE (err != REG_NOERROR, 0))
3678 goto error_return;
3679 ++ndests;
3680 bitset_empty (accepts);
3683 return ndests;
3684 error_return:
3685 for (j = 0; j < ndests; ++j)
3686 re_node_set_free (dests_node + j);
3687 return -1;
3690 #ifdef RE_ENABLE_I18N
3691 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3692 Return the number of the bytes the node accepts.
3693 STR_IDX is the current index of the input string.
3695 This function handles the nodes which can accept one character, or
3696 one collating element like '.', '[a-z]', opposite to the other nodes
3697 can only accept one byte. */
3699 static int
3700 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3701 re_dfa_t *dfa;
3702 int node_idx, str_idx;
3703 const re_string_t *input;
3705 const re_token_t *node = dfa->nodes + node_idx;
3706 int char_len, elem_len;
3707 int i;
3709 if (BE (node->type == OP_UTF8_PERIOD, 0))
3711 unsigned char c = re_string_byte_at (input, str_idx), d;
3712 if (BE (c < 0xc2, 1))
3713 return 0;
3715 if (str_idx + 2 > input->len)
3716 return 0;
3718 d = re_string_byte_at (input, str_idx + 1);
3719 if (c < 0xe0)
3720 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3721 else if (c < 0xf0)
3723 char_len = 3;
3724 if (c == 0xe0 && d < 0xa0)
3725 return 0;
3727 else if (c < 0xf8)
3729 char_len = 4;
3730 if (c == 0xf0 && d < 0x90)
3731 return 0;
3733 else if (c < 0xfc)
3735 char_len = 5;
3736 if (c == 0xf8 && d < 0x88)
3737 return 0;
3739 else if (c < 0xfe)
3741 char_len = 6;
3742 if (c == 0xfc && d < 0x84)
3743 return 0;
3745 else
3746 return 0;
3748 if (str_idx + char_len > input->len)
3749 return 0;
3751 for (i = 1; i < char_len; ++i)
3753 d = re_string_byte_at (input, str_idx + i);
3754 if (d < 0x80 || d > 0xbf)
3755 return 0;
3757 return char_len;
3760 char_len = re_string_char_size_at (input, str_idx);
3761 if (node->type == OP_PERIOD)
3763 if (char_len <= 1)
3764 return 0;
3765 /* FIXME: I don't think this if is needed, as both '\n'
3766 and '\0' are char_len == 1. */
3767 /* '.' accepts any one character except the following two cases. */
3768 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3769 re_string_byte_at (input, str_idx) == '\n') ||
3770 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3771 re_string_byte_at (input, str_idx) == '\0'))
3772 return 0;
3773 return char_len;
3776 elem_len = re_string_elem_size_at (input, str_idx);
3777 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3778 return 0;
3780 if (node->type == COMPLEX_BRACKET)
3782 const re_charset_t *cset = node->opr.mbcset;
3783 # ifdef _LIBC
3784 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3785 + str_idx);
3786 int j;
3787 uint32_t nrules;
3788 # endif /* _LIBC */
3789 int match_len = 0;
3790 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3791 ? re_string_wchar_at (input, str_idx) : 0);
3793 /* match with multibyte character? */
3794 for (i = 0; i < cset->nmbchars; ++i)
3795 if (wc == cset->mbchars[i])
3797 match_len = char_len;
3798 goto check_node_accept_bytes_match;
3800 /* match with character_class? */
3801 for (i = 0; i < cset->nchar_classes; ++i)
3803 wctype_t wt = cset->char_classes[i];
3804 if (__iswctype (wc, wt))
3806 match_len = char_len;
3807 goto check_node_accept_bytes_match;
3811 # ifdef _LIBC
3812 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3813 if (nrules != 0)
3815 unsigned int in_collseq = 0;
3816 const int32_t *table, *indirect;
3817 const unsigned char *weights, *extra;
3818 const char *collseqwc;
3819 int32_t idx;
3820 /* This #include defines a local function! */
3821 # include <locale/weight.h>
3823 /* match with collating_symbol? */
3824 if (cset->ncoll_syms)
3825 extra = (const unsigned char *)
3826 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3827 for (i = 0; i < cset->ncoll_syms; ++i)
3829 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3830 /* Compare the length of input collating element and
3831 the length of current collating element. */
3832 if (*coll_sym != elem_len)
3833 continue;
3834 /* Compare each bytes. */
3835 for (j = 0; j < *coll_sym; j++)
3836 if (pin[j] != coll_sym[1 + j])
3837 break;
3838 if (j == *coll_sym)
3840 /* Match if every bytes is equal. */
3841 match_len = j;
3842 goto check_node_accept_bytes_match;
3846 if (cset->nranges)
3848 if (elem_len <= char_len)
3850 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3851 in_collseq = __collseq_table_lookup (collseqwc, wc);
3853 else
3854 in_collseq = find_collation_sequence_value (pin, elem_len);
3856 /* match with range expression? */
3857 for (i = 0; i < cset->nranges; ++i)
3858 if (cset->range_starts[i] <= in_collseq
3859 && in_collseq <= cset->range_ends[i])
3861 match_len = elem_len;
3862 goto check_node_accept_bytes_match;
3865 /* match with equivalence_class? */
3866 if (cset->nequiv_classes)
3868 const unsigned char *cp = pin;
3869 table = (const int32_t *)
3870 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3871 weights = (const unsigned char *)
3872 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3873 extra = (const unsigned char *)
3874 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3875 indirect = (const int32_t *)
3876 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3877 idx = findidx (&cp);
3878 if (idx > 0)
3879 for (i = 0; i < cset->nequiv_classes; ++i)
3881 int32_t equiv_class_idx = cset->equiv_classes[i];
3882 size_t weight_len = weights[idx];
3883 if (weight_len == weights[equiv_class_idx])
3885 int cnt = 0;
3886 while (cnt <= weight_len
3887 && (weights[equiv_class_idx + 1 + cnt]
3888 == weights[idx + 1 + cnt]))
3889 ++cnt;
3890 if (cnt > weight_len)
3892 match_len = elem_len;
3893 goto check_node_accept_bytes_match;
3899 else
3900 # endif /* _LIBC */
3902 /* match with range expression? */
3903 #if __GNUC__ >= 2
3904 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3905 #else
3906 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3907 cmp_buf[2] = wc;
3908 #endif
3909 for (i = 0; i < cset->nranges; ++i)
3911 cmp_buf[0] = cset->range_starts[i];
3912 cmp_buf[4] = cset->range_ends[i];
3913 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3914 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3916 match_len = char_len;
3917 goto check_node_accept_bytes_match;
3921 check_node_accept_bytes_match:
3922 if (!cset->non_match)
3923 return match_len;
3924 else
3926 if (match_len > 0)
3927 return 0;
3928 else
3929 return (elem_len > char_len) ? elem_len : char_len;
3932 return 0;
3935 # ifdef _LIBC
3936 static unsigned int
3937 find_collation_sequence_value (mbs, mbs_len)
3938 const unsigned char *mbs;
3939 size_t mbs_len;
3941 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3942 if (nrules == 0)
3944 if (mbs_len == 1)
3946 /* No valid character. Match it as a single byte character. */
3947 const unsigned char *collseq = (const unsigned char *)
3948 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3949 return collseq[mbs[0]];
3951 return UINT_MAX;
3953 else
3955 int32_t idx;
3956 const unsigned char *extra = (const unsigned char *)
3957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3958 int32_t extrasize = (const unsigned char *)
3959 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3961 for (idx = 0; idx < extrasize;)
3963 int mbs_cnt, found = 0;
3964 int32_t elem_mbs_len;
3965 /* Skip the name of collating element name. */
3966 idx = idx + extra[idx] + 1;
3967 elem_mbs_len = extra[idx++];
3968 if (mbs_len == elem_mbs_len)
3970 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3971 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3972 break;
3973 if (mbs_cnt == elem_mbs_len)
3974 /* Found the entry. */
3975 found = 1;
3977 /* Skip the byte sequence of the collating element. */
3978 idx += elem_mbs_len;
3979 /* Adjust for the alignment. */
3980 idx = (idx + 3) & ~3;
3981 /* Skip the collation sequence value. */
3982 idx += sizeof (uint32_t);
3983 /* Skip the wide char sequence of the collating element. */
3984 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3985 /* If we found the entry, return the sequence value. */
3986 if (found)
3987 return *(uint32_t *) (extra + idx);
3988 /* Skip the collation sequence value. */
3989 idx += sizeof (uint32_t);
3991 return UINT_MAX;
3994 # endif /* _LIBC */
3995 #endif /* RE_ENABLE_I18N */
3997 /* Check whether the node accepts the byte which is IDX-th
3998 byte of the INPUT. */
4000 static int
4001 check_node_accept (mctx, node, idx)
4002 const re_match_context_t *mctx;
4003 const re_token_t *node;
4004 int idx;
4006 unsigned char ch;
4007 ch = re_string_byte_at (&mctx->input, idx);
4008 switch (node->type)
4010 case CHARACTER:
4011 if (node->opr.c != ch)
4012 return 0;
4013 break;
4015 case SIMPLE_BRACKET:
4016 if (!bitset_contain (node->opr.sbcset, ch))
4017 return 0;
4018 break;
4020 #ifdef RE_ENABLE_I18N
4021 case OP_UTF8_PERIOD:
4022 if (ch >= 0x80)
4023 return 0;
4024 /* FALLTHROUGH */
4025 #endif
4026 case OP_PERIOD:
4027 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4028 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4029 return 0;
4030 break;
4032 default:
4033 return 0;
4036 if (node->constraint)
4038 /* The node has constraints. Check whether the current context
4039 satisfies the constraints. */
4040 unsigned int context = re_string_context_at (&mctx->input, idx,
4041 mctx->eflags);
4042 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4043 return 0;
4046 return 1;
4049 /* Extend the buffers, if the buffers have run out. */
4051 static reg_errcode_t
4052 extend_buffers (mctx)
4053 re_match_context_t *mctx;
4055 reg_errcode_t ret;
4056 re_string_t *pstr = &mctx->input;
4058 /* Double the lengthes of the buffers. */
4059 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4060 if (BE (ret != REG_NOERROR, 0))
4061 return ret;
4063 if (mctx->state_log != NULL)
4065 /* And double the length of state_log. */
4066 /* XXX We have no indication of the size of this buffer. If this
4067 allocation fail we have no indication that the state_log array
4068 does not have the right size. */
4069 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070 pstr->bufs_len + 1);
4071 if (BE (new_array == NULL, 0))
4072 return REG_ESPACE;
4073 mctx->state_log = new_array;
4076 /* Then reconstruct the buffers. */
4077 if (pstr->icase)
4079 #ifdef RE_ENABLE_I18N
4080 if (pstr->mb_cur_max > 1)
4082 ret = build_wcs_upper_buffer (pstr);
4083 if (BE (ret != REG_NOERROR, 0))
4084 return ret;
4086 else
4087 #endif /* RE_ENABLE_I18N */
4088 build_upper_buffer (pstr);
4090 else
4092 #ifdef RE_ENABLE_I18N
4093 if (pstr->mb_cur_max > 1)
4094 build_wcs_buffer (pstr);
4095 else
4096 #endif /* RE_ENABLE_I18N */
4098 if (pstr->trans != NULL)
4099 re_string_translate_buffer (pstr);
4102 return REG_NOERROR;
4106 /* Functions for matching context. */
4108 /* Initialize MCTX. */
4110 static reg_errcode_t
4111 match_ctx_init (mctx, eflags, n)
4112 re_match_context_t *mctx;
4113 int eflags, n;
4115 mctx->eflags = eflags;
4116 mctx->match_last = -1;
4117 if (n > 0)
4119 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4120 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4121 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4122 return REG_ESPACE;
4124 /* Already zero-ed by the caller.
4125 else
4126 mctx->bkref_ents = NULL;
4127 mctx->nbkref_ents = 0;
4128 mctx->nsub_tops = 0; */
4129 mctx->abkref_ents = n;
4130 mctx->max_mb_elem_len = 1;
4131 mctx->asub_tops = n;
4132 return REG_NOERROR;
4135 /* Clean the entries which depend on the current input in MCTX.
4136 This function must be invoked when the matcher changes the start index
4137 of the input, or changes the input string. */
4139 static void
4140 match_ctx_clean (mctx)
4141 re_match_context_t *mctx;
4143 int st_idx;
4144 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4146 int sl_idx;
4147 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4148 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4150 re_sub_match_last_t *last = top->lasts[sl_idx];
4151 re_free (last->path.array);
4152 re_free (last);
4154 re_free (top->lasts);
4155 if (top->path)
4157 re_free (top->path->array);
4158 re_free (top->path);
4160 free (top);
4163 mctx->nsub_tops = 0;
4164 mctx->nbkref_ents = 0;
4167 /* Free all the memory associated with MCTX. */
4169 static void
4170 match_ctx_free (mctx)
4171 re_match_context_t *mctx;
4173 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4174 match_ctx_clean (mctx);
4175 re_free (mctx->sub_tops);
4176 re_free (mctx->bkref_ents);
4179 /* Add a new backreference entry to MCTX.
4180 Note that we assume that caller never call this function with duplicate
4181 entry, and call with STR_IDX which isn't smaller than any existing entry.
4184 static reg_errcode_t
4185 match_ctx_add_entry (mctx, node, str_idx, from, to)
4186 re_match_context_t *mctx;
4187 int node, str_idx, from, to;
4189 if (mctx->nbkref_ents >= mctx->abkref_ents)
4191 struct re_backref_cache_entry* new_entry;
4192 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4193 mctx->abkref_ents * 2);
4194 if (BE (new_entry == NULL, 0))
4196 re_free (mctx->bkref_ents);
4197 return REG_ESPACE;
4199 mctx->bkref_ents = new_entry;
4200 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4201 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4202 mctx->abkref_ents *= 2;
4204 if (mctx->nbkref_ents > 0
4205 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4206 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4208 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4209 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4210 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4211 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4213 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4214 If bit N is clear, means that this entry won't epsilon-transition to
4215 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4216 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4217 such node.
4219 A backreference does not epsilon-transition unless it is empty, so set
4220 to all zeros if FROM != TO. */
4221 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4222 = (from == to ? ~0 : 0);
4224 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4225 if (mctx->max_mb_elem_len < to - from)
4226 mctx->max_mb_elem_len = to - from;
4227 return REG_NOERROR;
4230 /* Search for the first entry which has the same str_idx, or -1 if none is
4231 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4233 static int
4234 search_cur_bkref_entry (mctx, str_idx)
4235 re_match_context_t *mctx;
4236 int str_idx;
4238 int left, right, mid, last;
4239 last = right = mctx->nbkref_ents;
4240 for (left = 0; left < right;)
4242 mid = (left + right) / 2;
4243 if (mctx->bkref_ents[mid].str_idx < str_idx)
4244 left = mid + 1;
4245 else
4246 right = mid;
4248 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4249 return left;
4250 else
4251 return -1;
4254 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4255 at STR_IDX. */
4257 static reg_errcode_t
4258 match_ctx_add_subtop (mctx, node, str_idx)
4259 re_match_context_t *mctx;
4260 int node, str_idx;
4262 #ifdef DEBUG
4263 assert (mctx->sub_tops != NULL);
4264 assert (mctx->asub_tops > 0);
4265 #endif
4266 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4268 int new_asub_tops = mctx->asub_tops * 2;
4269 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4270 re_sub_match_top_t *,
4271 new_asub_tops);
4272 if (BE (new_array == NULL, 0))
4273 return REG_ESPACE;
4274 mctx->sub_tops = new_array;
4275 mctx->asub_tops = new_asub_tops;
4277 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4278 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4279 return REG_ESPACE;
4280 mctx->sub_tops[mctx->nsub_tops]->node = node;
4281 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4282 return REG_NOERROR;
4285 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4286 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4288 static re_sub_match_last_t *
4289 match_ctx_add_sublast (subtop, node, str_idx)
4290 re_sub_match_top_t *subtop;
4291 int node, str_idx;
4293 re_sub_match_last_t *new_entry;
4294 if (BE (subtop->nlasts == subtop->alasts, 0))
4296 int new_alasts = 2 * subtop->alasts + 1;
4297 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4298 re_sub_match_last_t *,
4299 new_alasts);
4300 if (BE (new_array == NULL, 0))
4301 return NULL;
4302 subtop->lasts = new_array;
4303 subtop->alasts = new_alasts;
4305 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4306 if (BE (new_entry != NULL, 1))
4308 subtop->lasts[subtop->nlasts] = new_entry;
4309 new_entry->node = node;
4310 new_entry->str_idx = str_idx;
4311 ++subtop->nlasts;
4313 return new_entry;
4316 static void
4317 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4318 re_sift_context_t *sctx;
4319 re_dfastate_t **sifted_sts, **limited_sts;
4320 int last_node, last_str_idx;
4322 sctx->sifted_states = sifted_sts;
4323 sctx->limited_states = limited_sts;
4324 sctx->last_node = last_node;
4325 sctx->last_str_idx = last_str_idx;
4326 re_node_set_init_empty (&sctx->limits);