(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[glibc.git] / posix / regexec.c
blob91b48dd4a205f413d044aa476e2f0c81cf87c3ea
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65 static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71 static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81 static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 #ifdef RE_ENABLE_I18N
88 static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183 # ifdef _LIBC
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186 # endif /* _LIBC */
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192 static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
199 string STRING.
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg, string, nmatch, pmatch, eflags)
214 const regex_t *__restrict preg;
215 const char *__restrict string;
216 size_t nmatch;
217 regmatch_t pmatch[];
218 int eflags;
220 reg_errcode_t err;
221 int start, length;
223 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
224 return REG_BADPAT;
226 if (eflags & REG_STARTEND)
228 start = pmatch[0].rm_so;
229 length = pmatch[0].rm_eo;
231 else
233 start = 0;
234 length = strlen (string);
236 if (preg->no_sub)
237 err = re_search_internal (preg, string, length, start, length - start,
238 length, 0, NULL, eflags);
239 else
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, nmatch, pmatch, eflags);
242 return err != REG_NOERROR;
245 #ifdef _LIBC
246 # include <shlib-compat.h>
247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
250 __typeof__ (__regexec) __compat_regexec;
253 attribute_compat_text_section
254 __compat_regexec (const regex_t *__restrict preg,
255 const char *__restrict string, size_t nmatch,
256 regmatch_t pmatch[], int eflags)
258 return regexec (preg, string, nmatch, pmatch,
259 eflags & (REG_NOTBOL | REG_NOTEOL));
261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
262 # endif
263 #endif
265 /* Entry points for GNU code. */
267 /* re_match, re_search, re_match_2, re_search_2
269 The former two functions operate on STRING with length LENGTH,
270 while the later two operate on concatenation of STRING1 and STRING2
271 with lengths LENGTH1 and LENGTH2, respectively.
273 re_match() matches the compiled pattern in BUFP against the string,
274 starting at index START.
276 re_search() first tries matching at index START, then it tries to match
277 starting from index START + 1, and so on. The last start position tried
278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
279 way as re_match().)
281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
282 the first STOP characters of the concatenation of the strings should be
283 concerned.
285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
287 computed relative to the concatenation, not relative to the individual
288 strings.)
290 On success, re_match* functions return the length of the match, re_search*
291 return the position of the start of the match. Return value -1 means no
292 match was found and -2 indicates an internal error. */
295 re_match (bufp, string, length, start, regs)
296 struct re_pattern_buffer *bufp;
297 const char *string;
298 int length, start;
299 struct re_registers *regs;
301 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
303 #ifdef _LIBC
304 weak_alias (__re_match, re_match)
305 #endif
308 re_search (bufp, string, length, start, range, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start, range;
312 struct re_registers *regs;
314 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
316 #ifdef _LIBC
317 weak_alias (__re_search, re_search)
318 #endif
321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
322 struct re_pattern_buffer *bufp;
323 const char *string1, *string2;
324 int length1, length2, start, stop;
325 struct re_registers *regs;
327 return re_search_2_stub (bufp, string1, length1, string2, length2,
328 start, 0, regs, stop, 1);
330 #ifdef _LIBC
331 weak_alias (__re_match_2, re_match_2)
332 #endif
335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
336 struct re_pattern_buffer *bufp;
337 const char *string1, *string2;
338 int length1, length2, start, range, stop;
339 struct re_registers *regs;
341 return re_search_2_stub (bufp, string1, length1, string2, length2,
342 start, range, regs, stop, 0);
344 #ifdef _LIBC
345 weak_alias (__re_search_2, re_search_2)
346 #endif
348 static int
349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
350 stop, ret_len)
351 struct re_pattern_buffer *bufp;
352 const char *string1, *string2;
353 int length1, length2, start, range, stop, ret_len;
354 struct re_registers *regs;
356 const char *str;
357 int rval;
358 int len = length1 + length2;
359 int free_str = 0;
361 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
362 return -2;
364 /* Concatenate the strings. */
365 if (length2 > 0)
366 if (length1 > 0)
368 char *s = re_malloc (char, len);
370 if (BE (s == NULL, 0))
371 return -2;
372 memcpy (s, string1, length1);
373 memcpy (s + length1, string2, length2);
374 str = s;
375 free_str = 1;
377 else
378 str = string2;
379 else
380 str = string1;
382 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
383 ret_len);
384 if (free_str)
385 re_free ((char *) str);
386 return rval;
389 /* The parameters have the same meaning as those of re_search.
390 Additional parameters:
391 If RET_LEN is nonzero the length of the match is returned (re_match style);
392 otherwise the position of the match is returned. */
394 static int
395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
396 struct re_pattern_buffer *bufp;
397 const char *string;
398 int length, start, range, stop, ret_len;
399 struct re_registers *regs;
401 reg_errcode_t result;
402 regmatch_t *pmatch;
403 int nregs, rval;
404 int eflags = 0;
406 /* Check for out-of-range. */
407 if (BE (start < 0 || start > length, 0))
408 return -1;
409 if (BE (start + range > length, 0))
410 range = length - start;
411 else if (BE (start + range < 0, 0))
412 range = -start;
414 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
415 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417 /* Compile fastmap if we haven't yet. */
418 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
419 re_compile_fastmap (bufp);
421 if (BE (bufp->no_sub, 0))
422 regs = NULL;
424 /* We need at least 1 register. */
425 if (regs == NULL)
426 nregs = 1;
427 else if (BE (bufp->regs_allocated == REGS_FIXED &&
428 regs->num_regs < bufp->re_nsub + 1, 0))
430 nregs = regs->num_regs;
431 if (BE (nregs < 1, 0))
433 /* Nothing can be copied to regs. */
434 regs = NULL;
435 nregs = 1;
438 else
439 nregs = bufp->re_nsub + 1;
440 pmatch = re_malloc (regmatch_t, nregs);
441 if (BE (pmatch == NULL, 0))
442 return -2;
444 result = re_search_internal (bufp, string, length, start, range, stop,
445 nregs, pmatch, eflags);
447 rval = 0;
449 /* I hope we needn't fill ther regs with -1's when no match was found. */
450 if (result != REG_NOERROR)
451 rval = -1;
452 else if (regs != NULL)
454 /* If caller wants register contents data back, copy them. */
455 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
456 bufp->regs_allocated);
457 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
458 rval = -2;
461 if (BE (rval == 0, 1))
463 if (ret_len)
465 assert (pmatch[0].rm_so == start);
466 rval = pmatch[0].rm_eo - start;
468 else
469 rval = pmatch[0].rm_so;
471 re_free (pmatch);
472 return rval;
475 static unsigned
476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
477 struct re_registers *regs;
478 regmatch_t *pmatch;
479 int nregs, regs_allocated;
481 int rval = REGS_REALLOCATE;
482 int i;
483 int need_regs = nregs + 1;
484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
485 uses. */
487 /* Have the register data arrays been allocated? */
488 if (regs_allocated == REGS_UNALLOCATED)
489 { /* No. So allocate them with malloc. */
490 regs->start = re_malloc (regoff_t, need_regs);
491 regs->end = re_malloc (regoff_t, need_regs);
492 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
493 return REGS_UNALLOCATED;
494 regs->num_regs = need_regs;
496 else if (regs_allocated == REGS_REALLOCATE)
497 { /* Yes. If we need more elements than were already
498 allocated, reallocate them. If we need fewer, just
499 leave it alone. */
500 if (BE (need_regs > regs->num_regs, 0))
502 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
503 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
504 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
505 return REGS_UNALLOCATED;
506 regs->start = new_start;
507 regs->end = new_end;
508 regs->num_regs = need_regs;
511 else
513 assert (regs_allocated == REGS_FIXED);
514 /* This function may not be called with REGS_FIXED and nregs too big. */
515 assert (regs->num_regs >= nregs);
516 rval = REGS_FIXED;
519 /* Copy the regs. */
520 for (i = 0; i < nregs; ++i)
522 regs->start[i] = pmatch[i].rm_so;
523 regs->end[i] = pmatch[i].rm_eo;
525 for ( ; i < regs->num_regs; ++i)
526 regs->start[i] = regs->end[i] = -1;
528 return rval;
531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
533 this memory for recording register information. STARTS and ENDS
534 must be allocated using the malloc library routine, and must each
535 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537 If NUM_REGS == 0, then subsequent matches should allocate their own
538 register data.
540 Unless this function is called, the first search or match using
541 PATTERN_BUFFER will allocate its own register data, without
542 freeing the old data. */
544 void
545 re_set_registers (bufp, regs, num_regs, starts, ends)
546 struct re_pattern_buffer *bufp;
547 struct re_registers *regs;
548 unsigned num_regs;
549 regoff_t *starts, *ends;
551 if (num_regs)
553 bufp->regs_allocated = REGS_REALLOCATE;
554 regs->num_regs = num_regs;
555 regs->start = starts;
556 regs->end = ends;
558 else
560 bufp->regs_allocated = REGS_UNALLOCATED;
561 regs->num_regs = 0;
562 regs->start = regs->end = (regoff_t *) 0;
565 #ifdef _LIBC
566 weak_alias (__re_set_registers, re_set_registers)
567 #endif
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
574 # ifdef _LIBC
575 weak_function
576 # endif
577 re_exec (s)
578 const char *s;
580 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
582 #endif /* _REGEX_RE_COMP */
584 /* Internal entry point. */
586 /* Searches for a compiled pattern PREG in the string STRING, whose
587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
588 mingings with regexec. START, and RANGE have the same meanings
589 with re_search.
590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
591 otherwise return the error code.
592 Note: We assume front end functions already check ranges.
593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
595 static reg_errcode_t
596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
597 eflags)
598 const regex_t *preg;
599 const char *string;
600 int length, start, range, stop, eflags;
601 size_t nmatch;
602 regmatch_t pmatch[];
604 reg_errcode_t err;
605 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
606 int left_lim, right_lim, incr;
607 int fl_longest_match, match_first, match_kind, match_last = -1;
608 int sb, ch;
609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
610 re_match_context_t mctx = { .dfa = dfa };
611 #else
612 re_match_context_t mctx;
613 #endif
614 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
615 && range && !preg->can_be_null) ? preg->fastmap : NULL;
616 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
618 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
619 memset (&mctx, '\0', sizeof (re_match_context_t));
620 mctx.dfa = dfa;
621 #endif
623 /* Check if the DFA haven't been compiled. */
624 if (BE (preg->used == 0 || dfa->init_state == NULL
625 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
626 || dfa->init_state_begbuf == NULL, 0))
627 return REG_NOMATCH;
629 #ifdef DEBUG
630 /* We assume front-end functions already check them. */
631 assert (start + range >= 0 && start + range <= length);
632 #endif
634 /* If initial states with non-begbuf contexts have no elements,
635 the regex must be anchored. If preg->newline_anchor is set,
636 we'll never use init_state_nl, so do not check it. */
637 if (dfa->init_state->nodes.nelem == 0
638 && dfa->init_state_word->nodes.nelem == 0
639 && (dfa->init_state_nl->nodes.nelem == 0
640 || !preg->newline_anchor))
642 if (start != 0 && start + range != 0)
643 return REG_NOMATCH;
644 start = range = 0;
647 /* We must check the longest matching, if nmatch > 0. */
648 fl_longest_match = (nmatch != 0 || dfa->nbackref);
650 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
651 preg->translate, preg->syntax & RE_ICASE, dfa);
652 if (BE (err != REG_NOERROR, 0))
653 goto free_return;
654 mctx.input.stop = stop;
655 mctx.input.raw_stop = stop;
656 mctx.input.newline_anchor = preg->newline_anchor;
658 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
659 if (BE (err != REG_NOERROR, 0))
660 goto free_return;
662 /* We will log all the DFA states through which the dfa pass,
663 if nmatch > 1, or this dfa has "multibyte node", which is a
664 back-reference or a node which can accept multibyte character or
665 multi character collating element. */
666 if (nmatch > 1 || dfa->has_mb_node)
668 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
669 if (BE (mctx.state_log == NULL, 0))
671 err = REG_ESPACE;
672 goto free_return;
675 else
676 mctx.state_log = NULL;
678 match_first = start;
679 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
680 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
682 /* Check incrementally whether of not the input string match. */
683 incr = (range < 0) ? -1 : 1;
684 left_lim = (range < 0) ? start + range : start;
685 right_lim = (range < 0) ? start : start + range;
686 sb = dfa->mb_cur_max == 1;
687 match_kind =
688 (fastmap
689 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
690 | (range >= 0 ? 2 : 0)
691 | (t != NULL ? 1 : 0))
692 : 8);
694 for (;; match_first += incr)
696 err = REG_NOMATCH;
697 if (match_first < left_lim || right_lim < match_first)
698 goto free_return;
700 /* Advance as rapidly as possible through the string, until we
701 find a plausible place to start matching. This may be done
702 with varying efficiency, so there are various possibilities:
703 only the most common of them are specialized, in order to
704 save on code size. We use a switch statement for speed. */
705 switch (match_kind)
707 case 8:
708 /* No fastmap. */
709 break;
711 case 7:
712 /* Fastmap with single-byte translation, match forward. */
713 while (BE (match_first < right_lim, 1)
714 && !fastmap[t[(unsigned char) string[match_first]]])
715 ++match_first;
716 goto forward_match_found_start_or_reached_end;
718 case 6:
719 /* Fastmap without translation, match forward. */
720 while (BE (match_first < right_lim, 1)
721 && !fastmap[(unsigned char) string[match_first]])
722 ++match_first;
724 forward_match_found_start_or_reached_end:
725 if (BE (match_first == right_lim, 0))
727 ch = match_first >= length
728 ? 0 : (unsigned char) string[match_first];
729 if (!fastmap[t ? t[ch] : ch])
730 goto free_return;
732 break;
734 case 4:
735 case 5:
736 /* Fastmap without multi-byte translation, match backwards. */
737 while (match_first >= left_lim)
739 ch = match_first >= length
740 ? 0 : (unsigned char) string[match_first];
741 if (fastmap[t ? t[ch] : ch])
742 break;
743 --match_first;
745 if (match_first < left_lim)
746 goto free_return;
747 break;
749 default:
750 /* In this case, we can't determine easily the current byte,
751 since it might be a component byte of a multibyte
752 character. Then we use the constructed buffer instead. */
753 for (;;)
755 /* If MATCH_FIRST is out of the valid range, reconstruct the
756 buffers. */
757 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
758 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
760 err = re_string_reconstruct (&mctx.input, match_first,
761 eflags);
762 if (BE (err != REG_NOERROR, 0))
763 goto free_return;
765 offset = match_first - mctx.input.raw_mbs_idx;
767 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
768 Note that MATCH_FIRST must not be smaller than 0. */
769 ch = (match_first >= length
770 ? 0 : re_string_byte_at (&mctx.input, offset));
771 if (fastmap[ch])
772 break;
773 match_first += incr;
774 if (match_first < left_lim || match_first > right_lim)
776 err = REG_NOMATCH;
777 goto free_return;
780 break;
783 /* Reconstruct the buffers so that the matcher can assume that
784 the matching starts from the beginning of the buffer. */
785 err = re_string_reconstruct (&mctx.input, match_first, eflags);
786 if (BE (err != REG_NOERROR, 0))
787 goto free_return;
789 #ifdef RE_ENABLE_I18N
790 /* Don't consider this char as a possible match start if it part,
791 yet isn't the head, of a multibyte character. */
792 if (!sb && !re_string_first_byte (&mctx.input, 0))
793 continue;
794 #endif
796 /* It seems to be appropriate one, then use the matcher. */
797 /* We assume that the matching starts from 0. */
798 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
799 match_last = check_matching (&mctx, fl_longest_match,
800 range >= 0 ? &match_first : NULL);
801 if (match_last != -1)
803 if (BE (match_last == -2, 0))
805 err = REG_ESPACE;
806 goto free_return;
808 else
810 mctx.match_last = match_last;
811 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
813 re_dfastate_t *pstate = mctx.state_log[match_last];
814 mctx.last_node = check_halt_state_context (&mctx, pstate,
815 match_last);
817 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
818 || dfa->nbackref)
820 err = prune_impossible_nodes (&mctx);
821 if (err == REG_NOERROR)
822 break;
823 if (BE (err != REG_NOMATCH, 0))
824 goto free_return;
825 match_last = -1;
827 else
828 break; /* We found a match. */
832 match_ctx_clean (&mctx);
835 #ifdef DEBUG
836 assert (match_last != -1);
837 assert (err == REG_NOERROR);
838 #endif
840 /* Set pmatch[] if we need. */
841 if (nmatch > 0)
843 int reg_idx;
845 /* Initialize registers. */
846 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
847 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
849 /* Set the points where matching start/end. */
850 pmatch[0].rm_so = 0;
851 pmatch[0].rm_eo = mctx.match_last;
853 if (!preg->no_sub && nmatch > 1)
855 err = set_regs (preg, &mctx, nmatch, pmatch,
856 dfa->has_plural_match && dfa->nbackref > 0);
857 if (BE (err != REG_NOERROR, 0))
858 goto free_return;
861 /* At last, add the offset to the each registers, since we slided
862 the buffers so that we could assume that the matching starts
863 from 0. */
864 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
865 if (pmatch[reg_idx].rm_so != -1)
867 #ifdef RE_ENABLE_I18N
868 if (BE (mctx.input.offsets_needed != 0, 0))
870 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
871 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
872 else
873 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
874 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
875 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
876 else
877 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
879 #else
880 assert (mctx.input.offsets_needed == 0);
881 #endif
882 pmatch[reg_idx].rm_so += match_first;
883 pmatch[reg_idx].rm_eo += match_first;
886 if (dfa->subexp_map)
887 for (reg_idx = 0;
888 reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
889 reg_idx++)
890 if (dfa->subexp_map[reg_idx] != reg_idx)
892 pmatch[reg_idx + 1].rm_so
893 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
894 pmatch[reg_idx + 1].rm_eo
895 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
899 free_return:
900 re_free (mctx.state_log);
901 if (dfa->nbackref)
902 match_ctx_free (&mctx);
903 re_string_destruct (&mctx.input);
904 return err;
907 static reg_errcode_t
908 prune_impossible_nodes (mctx)
909 re_match_context_t *mctx;
911 re_dfa_t *const dfa = mctx->dfa;
912 int halt_node, match_last;
913 reg_errcode_t ret;
914 re_dfastate_t **sifted_states;
915 re_dfastate_t **lim_states = NULL;
916 re_sift_context_t sctx;
917 #ifdef DEBUG
918 assert (mctx->state_log != NULL);
919 #endif
920 match_last = mctx->match_last;
921 halt_node = mctx->last_node;
922 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
923 if (BE (sifted_states == NULL, 0))
925 ret = REG_ESPACE;
926 goto free_return;
928 if (dfa->nbackref)
930 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
931 if (BE (lim_states == NULL, 0))
933 ret = REG_ESPACE;
934 goto free_return;
936 while (1)
938 memset (lim_states, '\0',
939 sizeof (re_dfastate_t *) * (match_last + 1));
940 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
941 match_last);
942 ret = sift_states_backward (mctx, &sctx);
943 re_node_set_free (&sctx.limits);
944 if (BE (ret != REG_NOERROR, 0))
945 goto free_return;
946 if (sifted_states[0] != NULL || lim_states[0] != NULL)
947 break;
950 --match_last;
951 if (match_last < 0)
953 ret = REG_NOMATCH;
954 goto free_return;
956 } while (mctx->state_log[match_last] == NULL
957 || !mctx->state_log[match_last]->halt);
958 halt_node = check_halt_state_context (mctx,
959 mctx->state_log[match_last],
960 match_last);
962 ret = merge_state_array (dfa, sifted_states, lim_states,
963 match_last + 1);
964 re_free (lim_states);
965 lim_states = NULL;
966 if (BE (ret != REG_NOERROR, 0))
967 goto free_return;
969 else
971 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
972 ret = sift_states_backward (mctx, &sctx);
973 re_node_set_free (&sctx.limits);
974 if (BE (ret != REG_NOERROR, 0))
975 goto free_return;
977 re_free (mctx->state_log);
978 mctx->state_log = sifted_states;
979 sifted_states = NULL;
980 mctx->last_node = halt_node;
981 mctx->match_last = match_last;
982 ret = REG_NOERROR;
983 free_return:
984 re_free (sifted_states);
985 re_free (lim_states);
986 return ret;
989 /* Acquire an initial state and return it.
990 We must select appropriate initial state depending on the context,
991 since initial states may have constraints like "\<", "^", etc.. */
993 static inline re_dfastate_t *
994 acquire_init_state_context (err, mctx, idx)
995 reg_errcode_t *err;
996 const re_match_context_t *mctx;
997 int idx;
999 re_dfa_t *const dfa = mctx->dfa;
1000 if (dfa->init_state->has_constraint)
1002 unsigned int context;
1003 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1004 if (IS_WORD_CONTEXT (context))
1005 return dfa->init_state_word;
1006 else if (IS_ORDINARY_CONTEXT (context))
1007 return dfa->init_state;
1008 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1009 return dfa->init_state_begbuf;
1010 else if (IS_NEWLINE_CONTEXT (context))
1011 return dfa->init_state_nl;
1012 else if (IS_BEGBUF_CONTEXT (context))
1014 /* It is relatively rare case, then calculate on demand. */
1015 return re_acquire_state_context (err, dfa,
1016 dfa->init_state->entrance_nodes,
1017 context);
1019 else
1020 /* Must not happen? */
1021 return dfa->init_state;
1023 else
1024 return dfa->init_state;
1027 /* Check whether the regular expression match input string INPUT or not,
1028 and return the index where the matching end, return -1 if not match,
1029 or return -2 in case of an error.
1030 FL_LONGEST_MATCH means we want the POSIX longest matching.
1031 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1032 next place where we may want to try matching.
1033 Note that the matcher assume that the maching starts from the current
1034 index of the buffer. */
1036 static int
1037 check_matching (mctx, fl_longest_match, p_match_first)
1038 re_match_context_t *mctx;
1039 int fl_longest_match;
1040 int *p_match_first;
1042 re_dfa_t *const dfa = mctx->dfa;
1043 reg_errcode_t err;
1044 int match = 0;
1045 int match_last = -1;
1046 int cur_str_idx = re_string_cur_idx (&mctx->input);
1047 re_dfastate_t *cur_state;
1048 int at_init_state = p_match_first != NULL;
1049 int next_start_idx = cur_str_idx;
1051 err = REG_NOERROR;
1052 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1053 /* An initial state must not be NULL (invalid). */
1054 if (BE (cur_state == NULL, 0))
1056 assert (err == REG_ESPACE);
1057 return -2;
1060 if (mctx->state_log != NULL)
1062 mctx->state_log[cur_str_idx] = cur_state;
1064 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1065 later. E.g. Processing back references. */
1066 if (BE (dfa->nbackref, 0))
1068 at_init_state = 0;
1069 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1070 if (BE (err != REG_NOERROR, 0))
1071 return err;
1073 if (cur_state->has_backref)
1075 err = transit_state_bkref (mctx, &cur_state->nodes);
1076 if (BE (err != REG_NOERROR, 0))
1077 return err;
1082 /* If the RE accepts NULL string. */
1083 if (BE (cur_state->halt, 0))
1085 if (!cur_state->has_constraint
1086 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1088 if (!fl_longest_match)
1089 return cur_str_idx;
1090 else
1092 match_last = cur_str_idx;
1093 match = 1;
1098 while (!re_string_eoi (&mctx->input))
1100 re_dfastate_t *old_state = cur_state;
1101 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1103 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1104 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1105 && mctx->input.valid_len < mctx->input.len))
1107 err = extend_buffers (mctx);
1108 if (BE (err != REG_NOERROR, 0))
1110 assert (err == REG_ESPACE);
1111 return -2;
1115 cur_state = transit_state (&err, mctx, cur_state);
1116 if (mctx->state_log != NULL)
1117 cur_state = merge_state_with_log (&err, mctx, cur_state);
1119 if (cur_state == NULL)
1121 /* Reached the invalid state or an error. Try to recover a valid
1122 state using the state log, if available and if we have not
1123 already found a valid (even if not the longest) match. */
1124 if (BE (err != REG_NOERROR, 0))
1125 return -2;
1127 if (mctx->state_log == NULL
1128 || (match && !fl_longest_match)
1129 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1130 break;
1133 if (BE (at_init_state, 0))
1135 if (old_state == cur_state)
1136 next_start_idx = next_char_idx;
1137 else
1138 at_init_state = 0;
1141 if (cur_state->halt)
1143 /* Reached a halt state.
1144 Check the halt state can satisfy the current context. */
1145 if (!cur_state->has_constraint
1146 || check_halt_state_context (mctx, cur_state,
1147 re_string_cur_idx (&mctx->input)))
1149 /* We found an appropriate halt state. */
1150 match_last = re_string_cur_idx (&mctx->input);
1151 match = 1;
1153 /* We found a match, do not modify match_first below. */
1154 p_match_first = NULL;
1155 if (!fl_longest_match)
1156 break;
1161 if (p_match_first)
1162 *p_match_first += next_start_idx;
1164 return match_last;
1167 /* Check NODE match the current context. */
1169 static int check_halt_node_context (dfa, node, context)
1170 const re_dfa_t *dfa;
1171 int node;
1172 unsigned int context;
1174 re_token_type_t type = dfa->nodes[node].type;
1175 unsigned int constraint = dfa->nodes[node].constraint;
1176 if (type != END_OF_RE)
1177 return 0;
1178 if (!constraint)
1179 return 1;
1180 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1181 return 0;
1182 return 1;
1185 /* Check the halt state STATE match the current context.
1186 Return 0 if not match, if the node, STATE has, is a halt node and
1187 match the context, return the node. */
1189 static int
1190 check_halt_state_context (mctx, state, idx)
1191 const re_match_context_t *mctx;
1192 const re_dfastate_t *state;
1193 int idx;
1195 int i;
1196 unsigned int context;
1197 #ifdef DEBUG
1198 assert (state->halt);
1199 #endif
1200 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201 for (i = 0; i < state->nodes.nelem; ++i)
1202 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203 return state->nodes.elems[i];
1204 return 0;
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208 corresponding to the DFA).
1209 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1210 of errors. */
1212 static int
1213 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1214 const re_match_context_t *mctx;
1215 regmatch_t *regs;
1216 int nregs, *pidx, node;
1217 re_node_set *eps_via_nodes;
1218 struct re_fail_stack_t *fs;
1220 re_dfa_t *const dfa = mctx->dfa;
1221 int i, err, dest_node;
1222 dest_node = -1;
1223 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1225 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1226 re_node_set *edests = &dfa->edests[node];
1227 int dest_node;
1228 err = re_node_set_insert (eps_via_nodes, node);
1229 if (BE (err < 0, 0))
1230 return -2;
1231 /* Pick up a valid destination, or return -1 if none is found. */
1232 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1234 int candidate = edests->elems[i];
1235 if (!re_node_set_contains (cur_nodes, candidate))
1236 continue;
1237 if (dest_node == -1)
1238 dest_node = candidate;
1240 else
1242 /* In order to avoid infinite loop like "(a*)*", return the second
1243 epsilon-transition if the first was already considered. */
1244 if (re_node_set_contains (eps_via_nodes, dest_node))
1245 return candidate;
1247 /* Otherwise, push the second epsilon-transition on the fail stack. */
1248 else if (fs != NULL
1249 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1250 eps_via_nodes))
1251 return -2;
1253 /* We know we are going to exit. */
1254 break;
1257 return dest_node;
1259 else
1261 int naccepted = 0;
1262 re_token_type_t type = dfa->nodes[node].type;
1264 #ifdef RE_ENABLE_I18N
1265 if (ACCEPT_MB_NODE (type))
1266 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1267 else
1268 #endif /* RE_ENABLE_I18N */
1269 if (type == OP_BACK_REF)
1271 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1272 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1273 if (fs != NULL)
1275 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1276 return -1;
1277 else if (naccepted)
1279 char *buf = (char *) re_string_get_buffer (&mctx->input);
1280 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1281 naccepted) != 0)
1282 return -1;
1286 if (naccepted == 0)
1288 err = re_node_set_insert (eps_via_nodes, node);
1289 if (BE (err < 0, 0))
1290 return -2;
1291 dest_node = dfa->edests[node].elems[0];
1292 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1293 dest_node))
1294 return dest_node;
1298 if (naccepted != 0
1299 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1301 dest_node = dfa->nexts[node];
1302 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1303 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1304 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1305 dest_node)))
1306 return -1;
1307 re_node_set_empty (eps_via_nodes);
1308 return dest_node;
1311 return -1;
1314 static reg_errcode_t
1315 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1316 struct re_fail_stack_t *fs;
1317 int str_idx, dest_node, nregs;
1318 regmatch_t *regs;
1319 re_node_set *eps_via_nodes;
1321 reg_errcode_t err;
1322 int num = fs->num++;
1323 if (fs->num == fs->alloc)
1325 struct re_fail_stack_ent_t *new_array;
1326 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1327 * fs->alloc * 2));
1328 if (new_array == NULL)
1329 return REG_ESPACE;
1330 fs->alloc *= 2;
1331 fs->stack = new_array;
1333 fs->stack[num].idx = str_idx;
1334 fs->stack[num].node = dest_node;
1335 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1336 if (fs->stack[num].regs == NULL)
1337 return REG_ESPACE;
1338 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1339 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1340 return err;
1343 static int
1344 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1345 struct re_fail_stack_t *fs;
1346 int *pidx, nregs;
1347 regmatch_t *regs;
1348 re_node_set *eps_via_nodes;
1350 int num = --fs->num;
1351 assert (num >= 0);
1352 *pidx = fs->stack[num].idx;
1353 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1354 re_node_set_free (eps_via_nodes);
1355 re_free (fs->stack[num].regs);
1356 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1357 return fs->stack[num].node;
1360 /* Set the positions where the subexpressions are starts/ends to registers
1361 PMATCH.
1362 Note: We assume that pmatch[0] is already set, and
1363 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1365 static reg_errcode_t
1366 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1367 const regex_t *preg;
1368 const re_match_context_t *mctx;
1369 size_t nmatch;
1370 regmatch_t *pmatch;
1371 int fl_backtrack;
1373 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1374 int idx, cur_node, real_nmatch;
1375 re_node_set eps_via_nodes;
1376 struct re_fail_stack_t *fs;
1377 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1378 regmatch_t *prev_idx_match;
1380 #ifdef DEBUG
1381 assert (nmatch > 1);
1382 assert (mctx->state_log != NULL);
1383 #endif
1384 if (fl_backtrack)
1386 fs = &fs_body;
1387 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1388 if (fs->stack == NULL)
1389 return REG_ESPACE;
1391 else
1392 fs = NULL;
1394 cur_node = dfa->init_node;
1395 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1396 re_node_set_init_empty (&eps_via_nodes);
1398 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1399 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1401 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1403 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1405 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1407 int reg_idx;
1408 if (fs)
1410 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1411 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1412 break;
1413 if (reg_idx == nmatch)
1415 re_node_set_free (&eps_via_nodes);
1416 return free_fail_stack_return (fs);
1418 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1419 &eps_via_nodes);
1421 else
1423 re_node_set_free (&eps_via_nodes);
1424 return REG_NOERROR;
1428 /* Proceed to next node. */
1429 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1430 &eps_via_nodes, fs);
1432 if (BE (cur_node < 0, 0))
1434 if (BE (cur_node == -2, 0))
1436 re_node_set_free (&eps_via_nodes);
1437 free_fail_stack_return (fs);
1438 return REG_ESPACE;
1440 if (fs)
1441 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1442 &eps_via_nodes);
1443 else
1445 re_node_set_free (&eps_via_nodes);
1446 return REG_NOMATCH;
1450 re_node_set_free (&eps_via_nodes);
1451 return free_fail_stack_return (fs);
1454 static reg_errcode_t
1455 free_fail_stack_return (fs)
1456 struct re_fail_stack_t *fs;
1458 if (fs)
1460 int fs_idx;
1461 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1463 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1464 re_free (fs->stack[fs_idx].regs);
1466 re_free (fs->stack);
1468 return REG_NOERROR;
1471 static void
1472 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1473 re_dfa_t *dfa;
1474 regmatch_t *pmatch, *prev_idx_match;
1475 int cur_node, cur_idx, nmatch;
1477 int type = dfa->nodes[cur_node].type;
1478 if (type == OP_OPEN_SUBEXP)
1480 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1482 /* We are at the first node of this sub expression. */
1483 if (reg_num < nmatch)
1485 pmatch[reg_num].rm_so = cur_idx;
1486 pmatch[reg_num].rm_eo = -1;
1489 else if (type == OP_CLOSE_SUBEXP)
1491 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1492 if (reg_num < nmatch)
1494 /* We are at the last node of this sub expression. */
1495 if (pmatch[reg_num].rm_so < cur_idx)
1497 pmatch[reg_num].rm_eo = cur_idx;
1498 /* This is a non-empty match or we are not inside an optional
1499 subexpression. Accept this right away. */
1500 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1502 else
1504 if (dfa->nodes[cur_node].opt_subexp
1505 && prev_idx_match[reg_num].rm_so != -1)
1506 /* We transited through an empty match for an optional
1507 subexpression, like (a?)*, and this is not the subexp's
1508 first match. Copy back the old content of the registers
1509 so that matches of an inner subexpression are undone as
1510 well, like in ((a?))*. */
1511 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1512 else
1513 /* We completed a subexpression, but it may be part of
1514 an optional one, so do not update PREV_IDX_MATCH. */
1515 pmatch[reg_num].rm_eo = cur_idx;
1521 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1522 and sift the nodes in each states according to the following rules.
1523 Updated state_log will be wrote to STATE_LOG.
1525 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1526 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1527 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1528 the LAST_NODE, we throw away the node `a'.
1529 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1530 string `s' and transit to `b':
1531 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1532 away the node `a'.
1533 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1534 thrown away, we throw away the node `a'.
1535 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1536 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1537 node `a'.
1538 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1539 we throw away the node `a'. */
1541 #define STATE_NODE_CONTAINS(state,node) \
1542 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1544 static reg_errcode_t
1545 sift_states_backward (mctx, sctx)
1546 re_match_context_t *mctx;
1547 re_sift_context_t *sctx;
1549 reg_errcode_t err;
1550 int null_cnt = 0;
1551 int str_idx = sctx->last_str_idx;
1552 re_node_set cur_dest;
1554 #ifdef DEBUG
1555 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1556 #endif
1558 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1559 transit to the last_node and the last_node itself. */
1560 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1561 if (BE (err != REG_NOERROR, 0))
1562 return err;
1563 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1564 if (BE (err != REG_NOERROR, 0))
1565 goto free_return;
1567 /* Then check each states in the state_log. */
1568 while (str_idx > 0)
1570 /* Update counters. */
1571 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1572 if (null_cnt > mctx->max_mb_elem_len)
1574 memset (sctx->sifted_states, '\0',
1575 sizeof (re_dfastate_t *) * str_idx);
1576 re_node_set_free (&cur_dest);
1577 return REG_NOERROR;
1579 re_node_set_empty (&cur_dest);
1580 --str_idx;
1582 if (mctx->state_log[str_idx])
1584 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1585 if (BE (err != REG_NOERROR, 0))
1586 goto free_return;
1589 /* Add all the nodes which satisfy the following conditions:
1590 - It can epsilon transit to a node in CUR_DEST.
1591 - It is in CUR_SRC.
1592 And update state_log. */
1593 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1594 if (BE (err != REG_NOERROR, 0))
1595 goto free_return;
1597 err = REG_NOERROR;
1598 free_return:
1599 re_node_set_free (&cur_dest);
1600 return err;
1603 static reg_errcode_t
1604 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1605 re_match_context_t *mctx;
1606 re_sift_context_t *sctx;
1607 int str_idx;
1608 re_node_set *cur_dest;
1610 re_dfa_t *const dfa = mctx->dfa;
1611 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1612 int i;
1614 /* Then build the next sifted state.
1615 We build the next sifted state on `cur_dest', and update
1616 `sifted_states[str_idx]' with `cur_dest'.
1617 Note:
1618 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1619 `cur_src' points the node_set of the old `state_log[str_idx]'
1620 (with the epsilon nodes pre-filtered out). */
1621 for (i = 0; i < cur_src->nelem; i++)
1623 int prev_node = cur_src->elems[i];
1624 int naccepted = 0;
1625 int ret;
1627 #if defined DEBUG || defined RE_ENABLE_I18N
1628 re_token_type_t type = dfa->nodes[prev_node].type;
1629 #endif
1630 #ifdef DEBUG
1631 assert (!IS_EPSILON_NODE (type));
1632 #endif
1633 #ifdef RE_ENABLE_I18N
1634 /* If the node may accept `multi byte'. */
1635 if (ACCEPT_MB_NODE (type))
1636 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1637 str_idx, sctx->last_str_idx);
1638 #endif /* RE_ENABLE_I18N */
1640 /* We don't check backreferences here.
1641 See update_cur_sifted_state(). */
1642 if (!naccepted
1643 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1644 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1645 dfa->nexts[prev_node]))
1646 naccepted = 1;
1648 if (naccepted == 0)
1649 continue;
1651 if (sctx->limits.nelem)
1653 int to_idx = str_idx + naccepted;
1654 if (check_dst_limits (mctx, &sctx->limits,
1655 dfa->nexts[prev_node], to_idx,
1656 prev_node, str_idx))
1657 continue;
1659 ret = re_node_set_insert (cur_dest, prev_node);
1660 if (BE (ret == -1, 0))
1661 return REG_ESPACE;
1664 return REG_NOERROR;
1667 /* Helper functions. */
1669 static reg_errcode_t
1670 clean_state_log_if_needed (mctx, next_state_log_idx)
1671 re_match_context_t *mctx;
1672 int next_state_log_idx;
1674 int top = mctx->state_log_top;
1676 if (next_state_log_idx >= mctx->input.bufs_len
1677 || (next_state_log_idx >= mctx->input.valid_len
1678 && mctx->input.valid_len < mctx->input.len))
1680 reg_errcode_t err;
1681 err = extend_buffers (mctx);
1682 if (BE (err != REG_NOERROR, 0))
1683 return err;
1686 if (top < next_state_log_idx)
1688 memset (mctx->state_log + top + 1, '\0',
1689 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690 mctx->state_log_top = next_state_log_idx;
1692 return REG_NOERROR;
1695 static reg_errcode_t
1696 merge_state_array (dfa, dst, src, num)
1697 re_dfa_t *dfa;
1698 re_dfastate_t **dst;
1699 re_dfastate_t **src;
1700 int num;
1702 int st_idx;
1703 reg_errcode_t err;
1704 for (st_idx = 0; st_idx < num; ++st_idx)
1706 if (dst[st_idx] == NULL)
1707 dst[st_idx] = src[st_idx];
1708 else if (src[st_idx] != NULL)
1710 re_node_set merged_set;
1711 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1712 &src[st_idx]->nodes);
1713 if (BE (err != REG_NOERROR, 0))
1714 return err;
1715 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1716 re_node_set_free (&merged_set);
1717 if (BE (err != REG_NOERROR, 0))
1718 return err;
1721 return REG_NOERROR;
1724 static reg_errcode_t
1725 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1726 re_match_context_t *mctx;
1727 re_sift_context_t *sctx;
1728 int str_idx;
1729 re_node_set *dest_nodes;
1731 re_dfa_t *const dfa = mctx->dfa;
1732 reg_errcode_t err;
1733 const re_node_set *candidates;
1734 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1735 : &mctx->state_log[str_idx]->nodes);
1737 if (dest_nodes->nelem == 0)
1738 sctx->sifted_states[str_idx] = NULL;
1739 else
1741 if (candidates)
1743 /* At first, add the nodes which can epsilon transit to a node in
1744 DEST_NODE. */
1745 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1746 if (BE (err != REG_NOERROR, 0))
1747 return err;
1749 /* Then, check the limitations in the current sift_context. */
1750 if (sctx->limits.nelem)
1752 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1753 mctx->bkref_ents, str_idx);
1754 if (BE (err != REG_NOERROR, 0))
1755 return err;
1759 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1760 if (BE (err != REG_NOERROR, 0))
1761 return err;
1764 if (candidates && mctx->state_log[str_idx]->has_backref)
1766 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1767 if (BE (err != REG_NOERROR, 0))
1768 return err;
1770 return REG_NOERROR;
1773 static reg_errcode_t
1774 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1775 re_dfa_t *dfa;
1776 re_node_set *dest_nodes;
1777 const re_node_set *candidates;
1779 reg_errcode_t err = REG_NOERROR;
1780 int i;
1782 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1783 if (BE (err != REG_NOERROR, 0))
1784 return err;
1786 if (!state->inveclosure.alloc)
1788 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1789 if (BE (err != REG_NOERROR, 0))
1790 return REG_ESPACE;
1791 for (i = 0; i < dest_nodes->nelem; i++)
1792 re_node_set_merge (&state->inveclosure,
1793 dfa->inveclosures + dest_nodes->elems[i]);
1795 return re_node_set_add_intersect (dest_nodes, candidates,
1796 &state->inveclosure);
1799 static reg_errcode_t
1800 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1801 re_dfa_t *dfa;
1802 int node;
1803 re_node_set *dest_nodes;
1804 const re_node_set *candidates;
1806 int ecl_idx;
1807 reg_errcode_t err;
1808 re_node_set *inv_eclosure = dfa->inveclosures + node;
1809 re_node_set except_nodes;
1810 re_node_set_init_empty (&except_nodes);
1811 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1813 int cur_node = inv_eclosure->elems[ecl_idx];
1814 if (cur_node == node)
1815 continue;
1816 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1818 int edst1 = dfa->edests[cur_node].elems[0];
1819 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1820 ? dfa->edests[cur_node].elems[1] : -1);
1821 if ((!re_node_set_contains (inv_eclosure, edst1)
1822 && re_node_set_contains (dest_nodes, edst1))
1823 || (edst2 > 0
1824 && !re_node_set_contains (inv_eclosure, edst2)
1825 && re_node_set_contains (dest_nodes, edst2)))
1827 err = re_node_set_add_intersect (&except_nodes, candidates,
1828 dfa->inveclosures + cur_node);
1829 if (BE (err != REG_NOERROR, 0))
1831 re_node_set_free (&except_nodes);
1832 return err;
1837 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1839 int cur_node = inv_eclosure->elems[ecl_idx];
1840 if (!re_node_set_contains (&except_nodes, cur_node))
1842 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1843 re_node_set_remove_at (dest_nodes, idx);
1846 re_node_set_free (&except_nodes);
1847 return REG_NOERROR;
1850 static int
1851 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1852 re_match_context_t *mctx;
1853 re_node_set *limits;
1854 int dst_node, dst_idx, src_node, src_idx;
1856 re_dfa_t *const dfa = mctx->dfa;
1857 int lim_idx, src_pos, dst_pos;
1859 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1860 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1861 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1863 int subexp_idx;
1864 struct re_backref_cache_entry *ent;
1865 ent = mctx->bkref_ents + limits->elems[lim_idx];
1866 subexp_idx = dfa->nodes[ent->node].opr.idx;
1868 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1869 subexp_idx, dst_node, dst_idx,
1870 dst_bkref_idx);
1871 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1872 subexp_idx, src_node, src_idx,
1873 src_bkref_idx);
1875 /* In case of:
1876 <src> <dst> ( <subexp> )
1877 ( <subexp> ) <src> <dst>
1878 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1879 if (src_pos == dst_pos)
1880 continue; /* This is unrelated limitation. */
1881 else
1882 return 1;
1884 return 0;
1887 static int
1888 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1889 re_match_context_t *mctx;
1890 int boundaries, subexp_idx, from_node, bkref_idx;
1892 re_dfa_t *const dfa = mctx->dfa;
1893 re_node_set *eclosures = dfa->eclosures + from_node;
1894 int node_idx;
1896 /* Else, we are on the boundary: examine the nodes on the epsilon
1897 closure. */
1898 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1900 int node = eclosures->elems[node_idx];
1901 switch (dfa->nodes[node].type)
1903 case OP_BACK_REF:
1904 if (bkref_idx != -1)
1906 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1909 int dst, cpos;
1911 if (ent->node != node)
1912 continue;
1914 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1915 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1916 continue;
1918 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1919 OP_CLOSE_SUBEXP cases below. But, if the
1920 destination node is the same node as the source
1921 node, don't recurse because it would cause an
1922 infinite loop: a regex that exhibits this behavior
1923 is ()\1*\1* */
1924 dst = dfa->edests[node].elems[0];
1925 if (dst == from_node)
1927 if (boundaries & 1)
1928 return -1;
1929 else /* if (boundaries & 2) */
1930 return 0;
1933 cpos =
1934 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1935 dst, bkref_idx);
1936 if (cpos == -1 /* && (boundaries & 1) */)
1937 return -1;
1938 if (cpos == 0 && (boundaries & 2))
1939 return 0;
1941 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1943 while (ent++->more);
1945 break;
1947 case OP_OPEN_SUBEXP:
1948 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1949 return -1;
1950 break;
1952 case OP_CLOSE_SUBEXP:
1953 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1954 return 0;
1955 break;
1957 default:
1958 break;
1962 return (boundaries & 2) ? 1 : 0;
1965 static int
1966 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1967 re_match_context_t *mctx;
1968 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1970 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1971 int boundaries;
1973 /* If we are outside the range of the subexpression, return -1 or 1. */
1974 if (str_idx < lim->subexp_from)
1975 return -1;
1977 if (lim->subexp_to < str_idx)
1978 return 1;
1980 /* If we are within the subexpression, return 0. */
1981 boundaries = (str_idx == lim->subexp_from);
1982 boundaries |= (str_idx == lim->subexp_to) << 1;
1983 if (boundaries == 0)
1984 return 0;
1986 /* Else, examine epsilon closure. */
1987 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988 from_node, bkref_idx);
1991 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1992 which are against limitations from DEST_NODES. */
1994 static reg_errcode_t
1995 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1996 re_dfa_t *dfa;
1997 re_node_set *dest_nodes;
1998 const re_node_set *candidates;
1999 re_node_set *limits;
2000 struct re_backref_cache_entry *bkref_ents;
2001 int str_idx;
2003 reg_errcode_t err;
2004 int node_idx, lim_idx;
2006 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2008 int subexp_idx;
2009 struct re_backref_cache_entry *ent;
2010 ent = bkref_ents + limits->elems[lim_idx];
2012 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2013 continue; /* This is unrelated limitation. */
2015 subexp_idx = dfa->nodes[ent->node].opr.idx;
2016 if (ent->subexp_to == str_idx)
2018 int ops_node = -1;
2019 int cls_node = -1;
2020 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2022 int node = dest_nodes->elems[node_idx];
2023 re_token_type_t type = dfa->nodes[node].type;
2024 if (type == OP_OPEN_SUBEXP
2025 && subexp_idx == dfa->nodes[node].opr.idx)
2026 ops_node = node;
2027 else if (type == OP_CLOSE_SUBEXP
2028 && subexp_idx == dfa->nodes[node].opr.idx)
2029 cls_node = node;
2032 /* Check the limitation of the open subexpression. */
2033 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2034 if (ops_node >= 0)
2036 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2037 candidates);
2038 if (BE (err != REG_NOERROR, 0))
2039 return err;
2042 /* Check the limitation of the close subexpression. */
2043 if (cls_node >= 0)
2044 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2046 int node = dest_nodes->elems[node_idx];
2047 if (!re_node_set_contains (dfa->inveclosures + node,
2048 cls_node)
2049 && !re_node_set_contains (dfa->eclosures + node,
2050 cls_node))
2052 /* It is against this limitation.
2053 Remove it form the current sifted state. */
2054 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2055 candidates);
2056 if (BE (err != REG_NOERROR, 0))
2057 return err;
2058 --node_idx;
2062 else /* (ent->subexp_to != str_idx) */
2064 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2066 int node = dest_nodes->elems[node_idx];
2067 re_token_type_t type = dfa->nodes[node].type;
2068 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2070 if (subexp_idx != dfa->nodes[node].opr.idx)
2071 continue;
2072 /* It is against this limitation.
2073 Remove it form the current sifted state. */
2074 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2075 candidates);
2076 if (BE (err != REG_NOERROR, 0))
2077 return err;
2082 return REG_NOERROR;
2085 static reg_errcode_t
2086 sift_states_bkref (mctx, sctx, str_idx, candidates)
2087 re_match_context_t *mctx;
2088 re_sift_context_t *sctx;
2089 int str_idx;
2090 const re_node_set *candidates;
2092 re_dfa_t *const dfa = mctx->dfa;
2093 reg_errcode_t err;
2094 int node_idx, node;
2095 re_sift_context_t local_sctx;
2096 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2098 if (first_idx == -1)
2099 return REG_NOERROR;
2101 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2103 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2105 int enabled_idx;
2106 re_token_type_t type;
2107 struct re_backref_cache_entry *entry;
2108 node = candidates->elems[node_idx];
2109 type = dfa->nodes[node].type;
2110 /* Avoid infinite loop for the REs like "()\1+". */
2111 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2112 continue;
2113 if (type != OP_BACK_REF)
2114 continue;
2116 entry = mctx->bkref_ents + first_idx;
2117 enabled_idx = first_idx;
2120 int subexp_len, to_idx, dst_node;
2121 re_dfastate_t *cur_state;
2123 if (entry->node != node)
2124 continue;
2125 subexp_len = entry->subexp_to - entry->subexp_from;
2126 to_idx = str_idx + subexp_len;
2127 dst_node = (subexp_len ? dfa->nexts[node]
2128 : dfa->edests[node].elems[0]);
2130 if (to_idx > sctx->last_str_idx
2131 || sctx->sifted_states[to_idx] == NULL
2132 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2133 || check_dst_limits (mctx, &sctx->limits, node,
2134 str_idx, dst_node, to_idx))
2135 continue;
2137 if (local_sctx.sifted_states == NULL)
2139 local_sctx = *sctx;
2140 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2141 if (BE (err != REG_NOERROR, 0))
2142 goto free_return;
2144 local_sctx.last_node = node;
2145 local_sctx.last_str_idx = str_idx;
2146 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2147 if (BE (err < 0, 0))
2149 err = REG_ESPACE;
2150 goto free_return;
2152 cur_state = local_sctx.sifted_states[str_idx];
2153 err = sift_states_backward (mctx, &local_sctx);
2154 if (BE (err != REG_NOERROR, 0))
2155 goto free_return;
2156 if (sctx->limited_states != NULL)
2158 err = merge_state_array (dfa, sctx->limited_states,
2159 local_sctx.sifted_states,
2160 str_idx + 1);
2161 if (BE (err != REG_NOERROR, 0))
2162 goto free_return;
2164 local_sctx.sifted_states[str_idx] = cur_state;
2165 re_node_set_remove (&local_sctx.limits, enabled_idx);
2167 /* mctx->bkref_ents may have changed, reload the pointer. */
2168 entry = mctx->bkref_ents + enabled_idx;
2170 while (enabled_idx++, entry++->more);
2172 err = REG_NOERROR;
2173 free_return:
2174 if (local_sctx.sifted_states != NULL)
2176 re_node_set_free (&local_sctx.limits);
2179 return err;
2183 #ifdef RE_ENABLE_I18N
2184 static int
2185 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2186 const re_match_context_t *mctx;
2187 re_sift_context_t *sctx;
2188 int node_idx, str_idx, max_str_idx;
2190 re_dfa_t *const dfa = mctx->dfa;
2191 int naccepted;
2192 /* Check the node can accept `multi byte'. */
2193 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2194 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2195 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2196 dfa->nexts[node_idx]))
2197 /* The node can't accept the `multi byte', or the
2198 destination was already thrown away, then the node
2199 could't accept the current input `multi byte'. */
2200 naccepted = 0;
2201 /* Otherwise, it is sure that the node could accept
2202 `naccepted' bytes input. */
2203 return naccepted;
2205 #endif /* RE_ENABLE_I18N */
2208 /* Functions for state transition. */
2210 /* Return the next state to which the current state STATE will transit by
2211 accepting the current input byte, and update STATE_LOG if necessary.
2212 If STATE can accept a multibyte char/collating element/back reference
2213 update the destination of STATE_LOG. */
2215 static re_dfastate_t *
2216 transit_state (err, mctx, state)
2217 reg_errcode_t *err;
2218 re_match_context_t *mctx;
2219 re_dfastate_t *state;
2221 re_dfa_t *const dfa = mctx->dfa;
2222 re_dfastate_t **trtable;
2223 unsigned char ch;
2225 #ifdef RE_ENABLE_I18N
2226 /* If the current state can accept multibyte. */
2227 if (BE (state->accept_mb, 0))
2229 *err = transit_state_mb (mctx, state);
2230 if (BE (*err != REG_NOERROR, 0))
2231 return NULL;
2233 #endif /* RE_ENABLE_I18N */
2235 /* Then decide the next state with the single byte. */
2236 if (1)
2238 /* Use transition table */
2239 ch = re_string_fetch_byte (&mctx->input);
2240 trtable = state->trtable;
2241 if (trtable == NULL)
2243 trtable = build_trtable (dfa, state);
2244 if (trtable == NULL)
2246 *err = REG_ESPACE;
2247 return NULL;
2250 if (BE (state->word_trtable, 0))
2252 unsigned int context;
2253 context
2254 = re_string_context_at (&mctx->input,
2255 re_string_cur_idx (&mctx->input) - 1,
2256 mctx->eflags);
2257 if (IS_WORD_CONTEXT (context))
2258 return trtable[ch + SBC_MAX];
2259 else
2260 return trtable[ch];
2262 else
2263 return trtable[ch];
2265 #if 0
2266 else
2267 /* don't use transition table */
2268 return transit_state_sb (err, mctx, state);
2269 #endif
2272 /* Update the state_log if we need */
2273 re_dfastate_t *
2274 merge_state_with_log (err, mctx, next_state)
2275 reg_errcode_t *err;
2276 re_match_context_t *mctx;
2277 re_dfastate_t *next_state;
2279 re_dfa_t *const dfa = mctx->dfa;
2280 int cur_idx = re_string_cur_idx (&mctx->input);
2282 if (cur_idx > mctx->state_log_top)
2284 mctx->state_log[cur_idx] = next_state;
2285 mctx->state_log_top = cur_idx;
2287 else if (mctx->state_log[cur_idx] == 0)
2289 mctx->state_log[cur_idx] = next_state;
2291 else
2293 re_dfastate_t *pstate;
2294 unsigned int context;
2295 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2296 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2297 the destination of a multibyte char/collating element/
2298 back reference. Then the next state is the union set of
2299 these destinations and the results of the transition table. */
2300 pstate = mctx->state_log[cur_idx];
2301 log_nodes = pstate->entrance_nodes;
2302 if (next_state != NULL)
2304 table_nodes = next_state->entrance_nodes;
2305 *err = re_node_set_init_union (&next_nodes, table_nodes,
2306 log_nodes);
2307 if (BE (*err != REG_NOERROR, 0))
2308 return NULL;
2310 else
2311 next_nodes = *log_nodes;
2312 /* Note: We already add the nodes of the initial state,
2313 then we don't need to add them here. */
2315 context = re_string_context_at (&mctx->input,
2316 re_string_cur_idx (&mctx->input) - 1,
2317 mctx->eflags);
2318 next_state = mctx->state_log[cur_idx]
2319 = re_acquire_state_context (err, dfa, &next_nodes, context);
2320 /* We don't need to check errors here, since the return value of
2321 this function is next_state and ERR is already set. */
2323 if (table_nodes != NULL)
2324 re_node_set_free (&next_nodes);
2327 if (BE (dfa->nbackref, 0) && next_state != NULL)
2329 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2330 later. We must check them here, since the back references in the
2331 next state might use them. */
2332 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2333 cur_idx);
2334 if (BE (*err != REG_NOERROR, 0))
2335 return NULL;
2337 /* If the next state has back references. */
2338 if (next_state->has_backref)
2340 *err = transit_state_bkref (mctx, &next_state->nodes);
2341 if (BE (*err != REG_NOERROR, 0))
2342 return NULL;
2343 next_state = mctx->state_log[cur_idx];
2347 return next_state;
2350 /* Skip bytes in the input that correspond to part of a
2351 multi-byte match, then look in the log for a state
2352 from which to restart matching. */
2353 re_dfastate_t *
2354 find_recover_state (err, mctx)
2355 reg_errcode_t *err;
2356 re_match_context_t *mctx;
2358 re_dfastate_t *cur_state = NULL;
2361 int max = mctx->state_log_top;
2362 int cur_str_idx = re_string_cur_idx (&mctx->input);
2366 if (++cur_str_idx > max)
2367 return NULL;
2368 re_string_skip_bytes (&mctx->input, 1);
2370 while (mctx->state_log[cur_str_idx] == NULL);
2372 cur_state = merge_state_with_log (err, mctx, NULL);
2374 while (err == REG_NOERROR && cur_state == NULL);
2375 return cur_state;
2378 /* Helper functions for transit_state. */
2380 /* From the node set CUR_NODES, pick up the nodes whose types are
2381 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2382 expression. And register them to use them later for evaluating the
2383 correspoding back references. */
2385 static reg_errcode_t
2386 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2387 re_match_context_t *mctx;
2388 re_node_set *cur_nodes;
2389 int str_idx;
2391 re_dfa_t *const dfa = mctx->dfa;
2392 int node_idx;
2393 reg_errcode_t err;
2395 /* TODO: This isn't efficient.
2396 Because there might be more than one nodes whose types are
2397 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2398 nodes.
2399 E.g. RE: (a){2} */
2400 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402 int node = cur_nodes->elems[node_idx];
2403 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2404 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2405 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2407 err = match_ctx_add_subtop (mctx, node, str_idx);
2408 if (BE (err != REG_NOERROR, 0))
2409 return err;
2412 return REG_NOERROR;
2415 #if 0
2416 /* Return the next state to which the current state STATE will transit by
2417 accepting the current input byte. */
2419 static re_dfastate_t *
2420 transit_state_sb (err, mctx, state)
2421 reg_errcode_t *err;
2422 re_match_context_t *mctx;
2423 re_dfastate_t *state;
2425 re_dfa_t *const dfa = mctx->dfa;
2426 re_node_set next_nodes;
2427 re_dfastate_t *next_state;
2428 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429 unsigned int context;
2431 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432 if (BE (*err != REG_NOERROR, 0))
2433 return NULL;
2434 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2436 int cur_node = state->nodes.elems[node_cnt];
2437 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2439 *err = re_node_set_merge (&next_nodes,
2440 dfa->eclosures + dfa->nexts[cur_node]);
2441 if (BE (*err != REG_NOERROR, 0))
2443 re_node_set_free (&next_nodes);
2444 return NULL;
2448 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450 /* We don't need to check errors here, since the return value of
2451 this function is next_state and ERR is already set. */
2453 re_node_set_free (&next_nodes);
2454 re_string_skip_bytes (&mctx->input, 1);
2455 return next_state;
2457 #endif
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2461 transit_state_mb (mctx, pstate)
2462 re_match_context_t *mctx;
2463 re_dfastate_t *pstate;
2465 re_dfa_t *const dfa = mctx->dfa;
2466 reg_errcode_t err;
2467 int i;
2469 for (i = 0; i < pstate->nodes.nelem; ++i)
2471 re_node_set dest_nodes, *new_nodes;
2472 int cur_node_idx = pstate->nodes.elems[i];
2473 int naccepted = 0, dest_idx;
2474 unsigned int context;
2475 re_dfastate_t *dest_state;
2477 if (dfa->nodes[cur_node_idx].constraint)
2479 context = re_string_context_at (&mctx->input,
2480 re_string_cur_idx (&mctx->input),
2481 mctx->eflags);
2482 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2483 context))
2484 continue;
2487 /* How many bytes the node can accept? */
2488 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2489 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2490 re_string_cur_idx (&mctx->input));
2491 if (naccepted == 0)
2492 continue;
2494 /* The node can accepts `naccepted' bytes. */
2495 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2496 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2497 : mctx->max_mb_elem_len);
2498 err = clean_state_log_if_needed (mctx, dest_idx);
2499 if (BE (err != REG_NOERROR, 0))
2500 return err;
2501 #ifdef DEBUG
2502 assert (dfa->nexts[cur_node_idx] != -1);
2503 #endif
2504 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2505 then we use pstate->nodes.elems[i] instead. */
2506 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2508 dest_state = mctx->state_log[dest_idx];
2509 if (dest_state == NULL)
2510 dest_nodes = *new_nodes;
2511 else
2513 err = re_node_set_init_union (&dest_nodes,
2514 dest_state->entrance_nodes, new_nodes);
2515 if (BE (err != REG_NOERROR, 0))
2516 return err;
2518 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2519 mctx->state_log[dest_idx]
2520 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521 if (dest_state != NULL)
2522 re_node_set_free (&dest_nodes);
2523 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524 return err;
2526 return REG_NOERROR;
2528 #endif /* RE_ENABLE_I18N */
2530 static reg_errcode_t
2531 transit_state_bkref (mctx, nodes)
2532 re_match_context_t *mctx;
2533 const re_node_set *nodes;
2535 re_dfa_t *const dfa = mctx->dfa;
2536 reg_errcode_t err;
2537 int i;
2538 int cur_str_idx = re_string_cur_idx (&mctx->input);
2540 for (i = 0; i < nodes->nelem; ++i)
2542 int dest_str_idx, prev_nelem, bkc_idx;
2543 int node_idx = nodes->elems[i];
2544 unsigned int context;
2545 const re_token_t *node = dfa->nodes + node_idx;
2546 re_node_set *new_dest_nodes;
2548 /* Check whether `node' is a backreference or not. */
2549 if (node->type != OP_BACK_REF)
2550 continue;
2552 if (node->constraint)
2554 context = re_string_context_at (&mctx->input, cur_str_idx,
2555 mctx->eflags);
2556 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2557 continue;
2560 /* `node' is a backreference.
2561 Check the substring which the substring matched. */
2562 bkc_idx = mctx->nbkref_ents;
2563 err = get_subexp (mctx, node_idx, cur_str_idx);
2564 if (BE (err != REG_NOERROR, 0))
2565 goto free_return;
2567 /* And add the epsilon closures (which is `new_dest_nodes') of
2568 the backreference to appropriate state_log. */
2569 #ifdef DEBUG
2570 assert (dfa->nexts[node_idx] != -1);
2571 #endif
2572 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2574 int subexp_len;
2575 re_dfastate_t *dest_state;
2576 struct re_backref_cache_entry *bkref_ent;
2577 bkref_ent = mctx->bkref_ents + bkc_idx;
2578 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2579 continue;
2580 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2581 new_dest_nodes = (subexp_len == 0
2582 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2583 : dfa->eclosures + dfa->nexts[node_idx]);
2584 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2585 - bkref_ent->subexp_from);
2586 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2587 mctx->eflags);
2588 dest_state = mctx->state_log[dest_str_idx];
2589 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2590 : mctx->state_log[cur_str_idx]->nodes.nelem);
2591 /* Add `new_dest_node' to state_log. */
2592 if (dest_state == NULL)
2594 mctx->state_log[dest_str_idx]
2595 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2596 context);
2597 if (BE (mctx->state_log[dest_str_idx] == NULL
2598 && err != REG_NOERROR, 0))
2599 goto free_return;
2601 else
2603 re_node_set dest_nodes;
2604 err = re_node_set_init_union (&dest_nodes,
2605 dest_state->entrance_nodes,
2606 new_dest_nodes);
2607 if (BE (err != REG_NOERROR, 0))
2609 re_node_set_free (&dest_nodes);
2610 goto free_return;
2612 mctx->state_log[dest_str_idx]
2613 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2614 re_node_set_free (&dest_nodes);
2615 if (BE (mctx->state_log[dest_str_idx] == NULL
2616 && err != REG_NOERROR, 0))
2617 goto free_return;
2619 /* We need to check recursively if the backreference can epsilon
2620 transit. */
2621 if (subexp_len == 0
2622 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2624 err = check_subexp_matching_top (mctx, new_dest_nodes,
2625 cur_str_idx);
2626 if (BE (err != REG_NOERROR, 0))
2627 goto free_return;
2628 err = transit_state_bkref (mctx, new_dest_nodes);
2629 if (BE (err != REG_NOERROR, 0))
2630 goto free_return;
2634 err = REG_NOERROR;
2635 free_return:
2636 return err;
2639 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2640 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2641 Note that we might collect inappropriate candidates here.
2642 However, the cost of checking them strictly here is too high, then we
2643 delay these checking for prune_impossible_nodes(). */
2645 static reg_errcode_t
2646 get_subexp (mctx, bkref_node, bkref_str_idx)
2647 re_match_context_t *mctx;
2648 int bkref_node, bkref_str_idx;
2650 re_dfa_t *const dfa = mctx->dfa;
2651 int subexp_num, sub_top_idx;
2652 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2653 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2654 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2655 if (cache_idx != -1)
2657 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2659 if (entry->node == bkref_node)
2660 return REG_NOERROR; /* We already checked it. */
2661 while (entry++->more);
2664 subexp_num = dfa->nodes[bkref_node].opr.idx;
2666 /* For each sub expression */
2667 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2669 reg_errcode_t err;
2670 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2671 re_sub_match_last_t *sub_last;
2672 int sub_last_idx, sl_str, bkref_str_off;
2674 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2675 continue; /* It isn't related. */
2677 sl_str = sub_top->str_idx;
2678 bkref_str_off = bkref_str_idx;
2679 /* At first, check the last node of sub expressions we already
2680 evaluated. */
2681 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2683 int sl_str_diff;
2684 sub_last = sub_top->lasts[sub_last_idx];
2685 sl_str_diff = sub_last->str_idx - sl_str;
2686 /* The matched string by the sub expression match with the substring
2687 at the back reference? */
2688 if (sl_str_diff > 0)
2690 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2692 /* Not enough chars for a successful match. */
2693 if (bkref_str_off + sl_str_diff > mctx->input.len)
2694 break;
2696 err = clean_state_log_if_needed (mctx,
2697 bkref_str_off
2698 + sl_str_diff);
2699 if (BE (err != REG_NOERROR, 0))
2700 return err;
2701 buf = (const char *) re_string_get_buffer (&mctx->input);
2703 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2704 break; /* We don't need to search this sub expression any more. */
2706 bkref_str_off += sl_str_diff;
2707 sl_str += sl_str_diff;
2708 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2709 bkref_str_idx);
2711 /* Reload buf, since the preceding call might have reallocated
2712 the buffer. */
2713 buf = (const char *) re_string_get_buffer (&mctx->input);
2715 if (err == REG_NOMATCH)
2716 continue;
2717 if (BE (err != REG_NOERROR, 0))
2718 return err;
2721 if (sub_last_idx < sub_top->nlasts)
2722 continue;
2723 if (sub_last_idx > 0)
2724 ++sl_str;
2725 /* Then, search for the other last nodes of the sub expression. */
2726 for (; sl_str <= bkref_str_idx; ++sl_str)
2728 int cls_node, sl_str_off;
2729 const re_node_set *nodes;
2730 sl_str_off = sl_str - sub_top->str_idx;
2731 /* The matched string by the sub expression match with the substring
2732 at the back reference? */
2733 if (sl_str_off > 0)
2735 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2737 /* If we are at the end of the input, we cannot match. */
2738 if (bkref_str_off >= mctx->input.len)
2739 break;
2741 err = extend_buffers (mctx);
2742 if (BE (err != REG_NOERROR, 0))
2743 return err;
2745 buf = (const char *) re_string_get_buffer (&mctx->input);
2747 if (buf [bkref_str_off++] != buf[sl_str - 1])
2748 break; /* We don't need to search this sub expression
2749 any more. */
2751 if (mctx->state_log[sl_str] == NULL)
2752 continue;
2753 /* Does this state have a ')' of the sub expression? */
2754 nodes = &mctx->state_log[sl_str]->nodes;
2755 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2756 if (cls_node == -1)
2757 continue; /* No. */
2758 if (sub_top->path == NULL)
2760 sub_top->path = calloc (sizeof (state_array_t),
2761 sl_str - sub_top->str_idx + 1);
2762 if (sub_top->path == NULL)
2763 return REG_ESPACE;
2765 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2766 in the current context? */
2767 err = check_arrival (mctx, sub_top->path, sub_top->node,
2768 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2769 if (err == REG_NOMATCH)
2770 continue;
2771 if (BE (err != REG_NOERROR, 0))
2772 return err;
2773 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2774 if (BE (sub_last == NULL, 0))
2775 return REG_ESPACE;
2776 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2777 bkref_str_idx);
2778 if (err == REG_NOMATCH)
2779 continue;
2782 return REG_NOERROR;
2785 /* Helper functions for get_subexp(). */
2787 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2788 If it can arrive, register the sub expression expressed with SUB_TOP
2789 and SUB_LAST. */
2791 static reg_errcode_t
2792 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2793 re_match_context_t *mctx;
2794 const re_sub_match_top_t *sub_top;
2795 re_sub_match_last_t *sub_last;
2796 int bkref_node, bkref_str;
2798 reg_errcode_t err;
2799 int to_idx;
2800 /* Can the subexpression arrive the back reference? */
2801 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2802 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2803 if (err != REG_NOERROR)
2804 return err;
2805 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806 sub_last->str_idx);
2807 if (BE (err != REG_NOERROR, 0))
2808 return err;
2809 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810 return clean_state_log_if_needed (mctx, to_idx);
2813 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814 Search '(' if FL_OPEN, or search ')' otherwise.
2815 TODO: This function isn't efficient...
2816 Because there might be more than one nodes whose types are
2817 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818 nodes.
2819 E.g. RE: (a){2} */
2821 static int
2822 find_subexp_node (dfa, nodes, subexp_idx, type)
2823 const re_dfa_t *dfa;
2824 const re_node_set *nodes;
2825 int subexp_idx, type;
2827 int cls_idx;
2828 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2830 int cls_node = nodes->elems[cls_idx];
2831 const re_token_t *node = dfa->nodes + cls_node;
2832 if (node->type == type
2833 && node->opr.idx == subexp_idx)
2834 return cls_node;
2836 return -1;
2839 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2840 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2841 heavily reused.
2842 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2844 static reg_errcode_t
2845 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2846 type)
2847 re_match_context_t *mctx;
2848 state_array_t *path;
2849 int top_node, top_str, last_node, last_str, type;
2851 re_dfa_t *const dfa = mctx->dfa;
2852 reg_errcode_t err;
2853 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2854 re_dfastate_t *cur_state = NULL;
2855 re_node_set *cur_nodes, next_nodes;
2856 re_dfastate_t **backup_state_log;
2857 unsigned int context;
2859 subexp_num = dfa->nodes[top_node].opr.idx;
2860 /* Extend the buffer if we need. */
2861 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2863 re_dfastate_t **new_array;
2864 int old_alloc = path->alloc;
2865 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2866 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2867 if (new_array == NULL)
2869 path->alloc = old_alloc;
2870 return REG_ESPACE;
2872 path->array = new_array;
2873 memset (new_array + old_alloc, '\0',
2874 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2877 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2879 /* Temporary modify MCTX. */
2880 backup_state_log = mctx->state_log;
2881 backup_cur_idx = mctx->input.cur_idx;
2882 mctx->state_log = path->array;
2883 mctx->input.cur_idx = str_idx;
2885 /* Setup initial node set. */
2886 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887 if (str_idx == top_str)
2889 err = re_node_set_init_1 (&next_nodes, top_node);
2890 if (BE (err != REG_NOERROR, 0))
2891 return err;
2892 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893 if (BE (err != REG_NOERROR, 0))
2895 re_node_set_free (&next_nodes);
2896 return err;
2899 else
2901 cur_state = mctx->state_log[str_idx];
2902 if (cur_state && cur_state->has_backref)
2904 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905 if (BE ( err != REG_NOERROR, 0))
2906 return err;
2908 else
2909 re_node_set_init_empty (&next_nodes);
2911 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2913 if (next_nodes.nelem)
2915 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916 subexp_num, type);
2917 if (BE ( err != REG_NOERROR, 0))
2919 re_node_set_free (&next_nodes);
2920 return err;
2923 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2926 re_node_set_free (&next_nodes);
2927 return err;
2929 mctx->state_log[str_idx] = cur_state;
2932 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2934 re_node_set_empty (&next_nodes);
2935 if (mctx->state_log[str_idx + 1])
2937 err = re_node_set_merge (&next_nodes,
2938 &mctx->state_log[str_idx + 1]->nodes);
2939 if (BE (err != REG_NOERROR, 0))
2941 re_node_set_free (&next_nodes);
2942 return err;
2945 if (cur_state)
2947 err = check_arrival_add_next_nodes (mctx, str_idx,
2948 &cur_state->non_eps_nodes, &next_nodes);
2949 if (BE (err != REG_NOERROR, 0))
2951 re_node_set_free (&next_nodes);
2952 return err;
2955 ++str_idx;
2956 if (next_nodes.nelem)
2958 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2959 if (BE (err != REG_NOERROR, 0))
2961 re_node_set_free (&next_nodes);
2962 return err;
2964 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2965 subexp_num, type);
2966 if (BE ( err != REG_NOERROR, 0))
2968 re_node_set_free (&next_nodes);
2969 return err;
2972 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2973 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2974 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976 re_node_set_free (&next_nodes);
2977 return err;
2979 mctx->state_log[str_idx] = cur_state;
2980 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982 re_node_set_free (&next_nodes);
2983 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2984 : &mctx->state_log[last_str]->nodes);
2985 path->next_idx = str_idx;
2987 /* Fix MCTX. */
2988 mctx->state_log = backup_state_log;
2989 mctx->input.cur_idx = backup_cur_idx;
2991 /* Then check the current node set has the node LAST_NODE. */
2992 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2993 return REG_NOERROR;
2995 return REG_NOMATCH;
2998 /* Helper functions for check_arrival. */
3000 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3001 to NEXT_NODES.
3002 TODO: This function is similar to the functions transit_state*(),
3003 however this function has many additional works.
3004 Can't we unify them? */
3006 static reg_errcode_t
3007 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3008 re_match_context_t *mctx;
3009 int str_idx;
3010 re_node_set *cur_nodes, *next_nodes;
3012 re_dfa_t *const dfa = mctx->dfa;
3013 int result;
3014 int cur_idx;
3015 reg_errcode_t err;
3016 re_node_set union_set;
3017 re_node_set_init_empty (&union_set);
3018 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3020 int naccepted = 0;
3021 int cur_node = cur_nodes->elems[cur_idx];
3022 #if defined DEBUG || defined RE_ENABLE_I18N
3023 re_token_type_t type = dfa->nodes[cur_node].type;
3024 #endif
3025 #ifdef DEBUG
3026 assert (!IS_EPSILON_NODE (type));
3027 #endif
3028 #ifdef RE_ENABLE_I18N
3029 /* If the node may accept `multi byte'. */
3030 if (ACCEPT_MB_NODE (type))
3032 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033 str_idx);
3034 if (naccepted > 1)
3036 re_dfastate_t *dest_state;
3037 int next_node = dfa->nexts[cur_node];
3038 int next_idx = str_idx + naccepted;
3039 dest_state = mctx->state_log[next_idx];
3040 re_node_set_empty (&union_set);
3041 if (dest_state)
3043 err = re_node_set_merge (&union_set, &dest_state->nodes);
3044 if (BE (err != REG_NOERROR, 0))
3046 re_node_set_free (&union_set);
3047 return err;
3050 result = re_node_set_insert (&union_set, next_node);
3051 if (BE (result < 0, 0))
3053 re_node_set_free (&union_set);
3054 return REG_ESPACE;
3056 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057 &union_set);
3058 if (BE (mctx->state_log[next_idx] == NULL
3059 && err != REG_NOERROR, 0))
3061 re_node_set_free (&union_set);
3062 return err;
3066 #endif /* RE_ENABLE_I18N */
3067 if (naccepted
3068 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3070 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071 if (BE (result < 0, 0))
3073 re_node_set_free (&union_set);
3074 return REG_ESPACE;
3078 re_node_set_free (&union_set);
3079 return REG_NOERROR;
3082 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083 CUR_NODES, however exclude the nodes which are:
3084 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3088 static reg_errcode_t
3089 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3090 re_dfa_t *dfa;
3091 re_node_set *cur_nodes;
3092 int ex_subexp, type;
3094 reg_errcode_t err;
3095 int idx, outside_node;
3096 re_node_set new_nodes;
3097 #ifdef DEBUG
3098 assert (cur_nodes->nelem);
3099 #endif
3100 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3101 if (BE (err != REG_NOERROR, 0))
3102 return err;
3103 /* Create a new node set NEW_NODES with the nodes which are epsilon
3104 closures of the node in CUR_NODES. */
3106 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3108 int cur_node = cur_nodes->elems[idx];
3109 re_node_set *eclosure = dfa->eclosures + cur_node;
3110 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3111 if (outside_node == -1)
3113 /* There are no problematic nodes, just merge them. */
3114 err = re_node_set_merge (&new_nodes, eclosure);
3115 if (BE (err != REG_NOERROR, 0))
3117 re_node_set_free (&new_nodes);
3118 return err;
3121 else
3123 /* There are problematic nodes, re-calculate incrementally. */
3124 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3125 ex_subexp, type);
3126 if (BE (err != REG_NOERROR, 0))
3128 re_node_set_free (&new_nodes);
3129 return err;
3133 re_node_set_free (cur_nodes);
3134 *cur_nodes = new_nodes;
3135 return REG_NOERROR;
3138 /* Helper function for check_arrival_expand_ecl.
3139 Check incrementally the epsilon closure of TARGET, and if it isn't
3140 problematic append it to DST_NODES. */
3142 static reg_errcode_t
3143 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3144 re_dfa_t *dfa;
3145 int target, ex_subexp, type;
3146 re_node_set *dst_nodes;
3148 int cur_node;
3149 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3151 int err;
3153 if (dfa->nodes[cur_node].type == type
3154 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3156 if (type == OP_CLOSE_SUBEXP)
3158 err = re_node_set_insert (dst_nodes, cur_node);
3159 if (BE (err == -1, 0))
3160 return REG_ESPACE;
3162 break;
3164 err = re_node_set_insert (dst_nodes, cur_node);
3165 if (BE (err == -1, 0))
3166 return REG_ESPACE;
3167 if (dfa->edests[cur_node].nelem == 0)
3168 break;
3169 if (dfa->edests[cur_node].nelem == 2)
3171 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3172 dfa->edests[cur_node].elems[1],
3173 ex_subexp, type);
3174 if (BE (err != REG_NOERROR, 0))
3175 return err;
3177 cur_node = dfa->edests[cur_node].elems[0];
3179 return REG_NOERROR;
3183 /* For all the back references in the current state, calculate the
3184 destination of the back references by the appropriate entry
3185 in MCTX->BKREF_ENTS. */
3187 static reg_errcode_t
3188 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3189 type)
3190 re_match_context_t *mctx;
3191 int cur_str, subexp_num, type;
3192 re_node_set *cur_nodes;
3194 re_dfa_t *const dfa = mctx->dfa;
3195 reg_errcode_t err;
3196 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3197 struct re_backref_cache_entry *ent;
3199 if (cache_idx_start == -1)
3200 return REG_NOERROR;
3202 restart:
3203 ent = mctx->bkref_ents + cache_idx_start;
3206 int to_idx, next_node;
3208 /* Is this entry ENT is appropriate? */
3209 if (!re_node_set_contains (cur_nodes, ent->node))
3210 continue; /* No. */
3212 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3213 /* Calculate the destination of the back reference, and append it
3214 to MCTX->STATE_LOG. */
3215 if (to_idx == cur_str)
3217 /* The backreference did epsilon transit, we must re-check all the
3218 node in the current state. */
3219 re_node_set new_dests;
3220 reg_errcode_t err2, err3;
3221 next_node = dfa->edests[ent->node].elems[0];
3222 if (re_node_set_contains (cur_nodes, next_node))
3223 continue;
3224 err = re_node_set_init_1 (&new_dests, next_node);
3225 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3226 err3 = re_node_set_merge (cur_nodes, &new_dests);
3227 re_node_set_free (&new_dests);
3228 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3229 || err3 != REG_NOERROR, 0))
3231 err = (err != REG_NOERROR ? err
3232 : (err2 != REG_NOERROR ? err2 : err3));
3233 return err;
3235 /* TODO: It is still inefficient... */
3236 goto restart;
3238 else
3240 re_node_set union_set;
3241 next_node = dfa->nexts[ent->node];
3242 if (mctx->state_log[to_idx])
3244 int ret;
3245 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3246 next_node))
3247 continue;
3248 err = re_node_set_init_copy (&union_set,
3249 &mctx->state_log[to_idx]->nodes);
3250 ret = re_node_set_insert (&union_set, next_node);
3251 if (BE (err != REG_NOERROR || ret < 0, 0))
3253 re_node_set_free (&union_set);
3254 err = err != REG_NOERROR ? err : REG_ESPACE;
3255 return err;
3258 else
3260 err = re_node_set_init_1 (&union_set, next_node);
3261 if (BE (err != REG_NOERROR, 0))
3262 return err;
3264 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3265 re_node_set_free (&union_set);
3266 if (BE (mctx->state_log[to_idx] == NULL
3267 && err != REG_NOERROR, 0))
3268 return err;
3271 while (ent++->more);
3272 return REG_NOERROR;
3275 /* Build transition table for the state.
3276 Return the new table if succeeded, otherwise return NULL. */
3278 static re_dfastate_t **
3279 build_trtable (dfa, state)
3280 re_dfa_t *dfa;
3281 re_dfastate_t *state;
3283 reg_errcode_t err;
3284 int i, j, ch;
3285 unsigned int elem, mask;
3286 int dests_node_malloced = 0, dest_states_malloced = 0;
3287 int ndests; /* Number of the destination states from `state'. */
3288 re_dfastate_t **trtable;
3289 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3290 re_node_set follows, *dests_node;
3291 bitset *dests_ch;
3292 bitset acceptable;
3294 /* We build DFA states which corresponds to the destination nodes
3295 from `state'. `dests_node[i]' represents the nodes which i-th
3296 destination state contains, and `dests_ch[i]' represents the
3297 characters which i-th destination state accepts. */
3298 #ifdef _LIBC
3299 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3300 dests_node = (re_node_set *)
3301 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3302 else
3303 #endif
3305 dests_node = (re_node_set *)
3306 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3307 if (BE (dests_node == NULL, 0))
3308 return NULL;
3309 dests_node_malloced = 1;
3311 dests_ch = (bitset *) (dests_node + SBC_MAX);
3313 /* Initialize transiton table. */
3314 state->word_trtable = 0;
3316 /* At first, group all nodes belonging to `state' into several
3317 destinations. */
3318 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319 if (BE (ndests <= 0, 0))
3321 if (dests_node_malloced)
3322 free (dests_node);
3323 /* Return NULL in case of an error, trtable otherwise. */
3324 if (ndests == 0)
3326 state->trtable = (re_dfastate_t **)
3327 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3328 return state->trtable;
3330 return NULL;
3333 err = re_node_set_alloc (&follows, ndests + 1);
3334 if (BE (err != REG_NOERROR, 0))
3335 goto out_free;
3337 #ifdef _LIBC
3338 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3339 + ndests * 3 * sizeof (re_dfastate_t *)))
3340 dest_states = (re_dfastate_t **)
3341 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3342 else
3343 #endif
3345 dest_states = (re_dfastate_t **)
3346 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3347 if (BE (dest_states == NULL, 0))
3349 out_free:
3350 if (dest_states_malloced)
3351 free (dest_states);
3352 re_node_set_free (&follows);
3353 for (i = 0; i < ndests; ++i)
3354 re_node_set_free (dests_node + i);
3355 if (dests_node_malloced)
3356 free (dests_node);
3357 return NULL;
3359 dest_states_malloced = 1;
3361 dest_states_word = dest_states + ndests;
3362 dest_states_nl = dest_states_word + ndests;
3363 bitset_empty (acceptable);
3365 /* Then build the states for all destinations. */
3366 for (i = 0; i < ndests; ++i)
3368 int next_node;
3369 re_node_set_empty (&follows);
3370 /* Merge the follows of this destination states. */
3371 for (j = 0; j < dests_node[i].nelem; ++j)
3373 next_node = dfa->nexts[dests_node[i].elems[j]];
3374 if (next_node != -1)
3376 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3377 if (BE (err != REG_NOERROR, 0))
3378 goto out_free;
3381 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3382 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3383 goto out_free;
3384 /* If the new state has context constraint,
3385 build appropriate states for these contexts. */
3386 if (dest_states[i]->has_constraint)
3388 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3389 CONTEXT_WORD);
3390 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3391 goto out_free;
3393 if (dest_states[i] != dest_states_word[i]
3394 && dfa->mb_cur_max > 1)
3395 state->word_trtable = 1;
3397 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3398 CONTEXT_NEWLINE);
3399 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3400 goto out_free;
3402 else
3404 dest_states_word[i] = dest_states[i];
3405 dest_states_nl[i] = dest_states[i];
3407 bitset_merge (acceptable, dests_ch[i]);
3410 if (!BE (state->word_trtable, 0))
3412 /* We don't care about whether the following character is a word
3413 character, or we are in a single-byte character set so we can
3414 discern by looking at the character code: allocate a
3415 256-entry transition table. */
3416 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3417 if (BE (trtable == NULL, 0))
3418 goto out_free;
3420 /* For all characters ch...: */
3421 for (i = 0; i < BITSET_UINTS; ++i)
3422 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3423 elem;
3424 mask <<= 1, elem >>= 1, ++ch)
3425 if (BE (elem & 1, 0))
3427 /* There must be exactly one destination which accepts
3428 character ch. See group_nodes_into_DFAstates. */
3429 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3432 /* j-th destination accepts the word character ch. */
3433 if (dfa->word_char[i] & mask)
3434 trtable[ch] = dest_states_word[j];
3435 else
3436 trtable[ch] = dest_states[j];
3439 else
3441 /* We care about whether the following character is a word
3442 character, and we are in a multi-byte character set: discern
3443 by looking at the character code: build two 256-entry
3444 transition tables, one starting at trtable[0] and one
3445 starting at trtable[SBC_MAX]. */
3446 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3447 2 * SBC_MAX);
3448 if (BE (trtable == NULL, 0))
3449 goto out_free;
3451 /* For all characters ch...: */
3452 for (i = 0; i < BITSET_UINTS; ++i)
3453 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3454 elem;
3455 mask <<= 1, elem >>= 1, ++ch)
3456 if (BE (elem & 1, 0))
3458 /* There must be exactly one destination which accepts
3459 character ch. See group_nodes_into_DFAstates. */
3460 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3463 /* j-th destination accepts the word character ch. */
3464 trtable[ch] = dest_states[j];
3465 trtable[ch + SBC_MAX] = dest_states_word[j];
3469 /* new line */
3470 if (bitset_contain (acceptable, NEWLINE_CHAR))
3472 /* The current state accepts newline character. */
3473 for (j = 0; j < ndests; ++j)
3474 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3476 /* k-th destination accepts newline character. */
3477 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3478 if (state->word_trtable)
3479 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3480 /* There must be only one destination which accepts
3481 newline. See group_nodes_into_DFAstates. */
3482 break;
3486 if (dest_states_malloced)
3487 free (dest_states);
3489 re_node_set_free (&follows);
3490 for (i = 0; i < ndests; ++i)
3491 re_node_set_free (dests_node + i);
3493 if (dests_node_malloced)
3494 free (dests_node);
3496 state->trtable = trtable;
3497 return trtable;
3500 /* Group all nodes belonging to STATE into several destinations.
3501 Then for all destinations, set the nodes belonging to the destination
3502 to DESTS_NODE[i] and set the characters accepted by the destination
3503 to DEST_CH[i]. This function return the number of destinations. */
3505 static int
3506 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3507 re_dfa_t *dfa;
3508 const re_dfastate_t *state;
3509 re_node_set *dests_node;
3510 bitset *dests_ch;
3512 reg_errcode_t err;
3513 int result;
3514 int i, j, k;
3515 int ndests; /* Number of the destinations from `state'. */
3516 bitset accepts; /* Characters a node can accept. */
3517 const re_node_set *cur_nodes = &state->nodes;
3518 bitset_empty (accepts);
3519 ndests = 0;
3521 /* For all the nodes belonging to `state', */
3522 for (i = 0; i < cur_nodes->nelem; ++i)
3524 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3525 re_token_type_t type = node->type;
3526 unsigned int constraint = node->constraint;
3528 /* Enumerate all single byte character this node can accept. */
3529 if (type == CHARACTER)
3530 bitset_set (accepts, node->opr.c);
3531 else if (type == SIMPLE_BRACKET)
3533 bitset_merge (accepts, node->opr.sbcset);
3535 else if (type == OP_PERIOD)
3537 #ifdef RE_ENABLE_I18N
3538 if (dfa->mb_cur_max > 1)
3539 bitset_merge (accepts, dfa->sb_char);
3540 else
3541 #endif
3542 bitset_set_all (accepts);
3543 if (!(dfa->syntax & RE_DOT_NEWLINE))
3544 bitset_clear (accepts, '\n');
3545 if (dfa->syntax & RE_DOT_NOT_NULL)
3546 bitset_clear (accepts, '\0');
3548 #ifdef RE_ENABLE_I18N
3549 else if (type == OP_UTF8_PERIOD)
3551 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3552 if (!(dfa->syntax & RE_DOT_NEWLINE))
3553 bitset_clear (accepts, '\n');
3554 if (dfa->syntax & RE_DOT_NOT_NULL)
3555 bitset_clear (accepts, '\0');
3557 #endif
3558 else
3559 continue;
3561 /* Check the `accepts' and sift the characters which are not
3562 match it the context. */
3563 if (constraint)
3565 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3567 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3568 bitset_empty (accepts);
3569 if (accepts_newline)
3570 bitset_set (accepts, NEWLINE_CHAR);
3571 else
3572 continue;
3574 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3576 bitset_empty (accepts);
3577 continue;
3580 if (constraint & NEXT_WORD_CONSTRAINT)
3582 unsigned int any_set = 0;
3583 if (type == CHARACTER && !node->word_char)
3585 bitset_empty (accepts);
3586 continue;
3588 #ifdef RE_ENABLE_I18N
3589 if (dfa->mb_cur_max > 1)
3590 for (j = 0; j < BITSET_UINTS; ++j)
3591 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3592 else
3593 #endif
3594 for (j = 0; j < BITSET_UINTS; ++j)
3595 any_set |= (accepts[j] &= dfa->word_char[j]);
3596 if (!any_set)
3597 continue;
3599 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3601 unsigned int any_set = 0;
3602 if (type == CHARACTER && node->word_char)
3604 bitset_empty (accepts);
3605 continue;
3607 #ifdef RE_ENABLE_I18N
3608 if (dfa->mb_cur_max > 1)
3609 for (j = 0; j < BITSET_UINTS; ++j)
3610 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3611 else
3612 #endif
3613 for (j = 0; j < BITSET_UINTS; ++j)
3614 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3615 if (!any_set)
3616 continue;
3620 /* Then divide `accepts' into DFA states, or create a new
3621 state. Above, we make sure that accepts is not empty. */
3622 for (j = 0; j < ndests; ++j)
3624 bitset intersec; /* Intersection sets, see below. */
3625 bitset remains;
3626 /* Flags, see below. */
3627 int has_intersec, not_subset, not_consumed;
3629 /* Optimization, skip if this state doesn't accept the character. */
3630 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3631 continue;
3633 /* Enumerate the intersection set of this state and `accepts'. */
3634 has_intersec = 0;
3635 for (k = 0; k < BITSET_UINTS; ++k)
3636 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3637 /* And skip if the intersection set is empty. */
3638 if (!has_intersec)
3639 continue;
3641 /* Then check if this state is a subset of `accepts'. */
3642 not_subset = not_consumed = 0;
3643 for (k = 0; k < BITSET_UINTS; ++k)
3645 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3646 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3649 /* If this state isn't a subset of `accepts', create a
3650 new group state, which has the `remains'. */
3651 if (not_subset)
3653 bitset_copy (dests_ch[ndests], remains);
3654 bitset_copy (dests_ch[j], intersec);
3655 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3656 if (BE (err != REG_NOERROR, 0))
3657 goto error_return;
3658 ++ndests;
3661 /* Put the position in the current group. */
3662 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3663 if (BE (result < 0, 0))
3664 goto error_return;
3666 /* If all characters are consumed, go to next node. */
3667 if (!not_consumed)
3668 break;
3670 /* Some characters remain, create a new group. */
3671 if (j == ndests)
3673 bitset_copy (dests_ch[ndests], accepts);
3674 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3675 if (BE (err != REG_NOERROR, 0))
3676 goto error_return;
3677 ++ndests;
3678 bitset_empty (accepts);
3681 return ndests;
3682 error_return:
3683 for (j = 0; j < ndests; ++j)
3684 re_node_set_free (dests_node + j);
3685 return -1;
3688 #ifdef RE_ENABLE_I18N
3689 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3690 Return the number of the bytes the node accepts.
3691 STR_IDX is the current index of the input string.
3693 This function handles the nodes which can accept one character, or
3694 one collating element like '.', '[a-z]', opposite to the other nodes
3695 can only accept one byte. */
3697 static int
3698 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3699 re_dfa_t *dfa;
3700 int node_idx, str_idx;
3701 const re_string_t *input;
3703 const re_token_t *node = dfa->nodes + node_idx;
3704 int char_len, elem_len;
3705 int i;
3707 if (BE (node->type == OP_UTF8_PERIOD, 0))
3709 unsigned char c = re_string_byte_at (input, str_idx), d;
3710 if (BE (c < 0xc2, 1))
3711 return 0;
3713 if (str_idx + 2 > input->len)
3714 return 0;
3716 d = re_string_byte_at (input, str_idx + 1);
3717 if (c < 0xe0)
3718 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3719 else if (c < 0xf0)
3721 char_len = 3;
3722 if (c == 0xe0 && d < 0xa0)
3723 return 0;
3725 else if (c < 0xf8)
3727 char_len = 4;
3728 if (c == 0xf0 && d < 0x90)
3729 return 0;
3731 else if (c < 0xfc)
3733 char_len = 5;
3734 if (c == 0xf8 && d < 0x88)
3735 return 0;
3737 else if (c < 0xfe)
3739 char_len = 6;
3740 if (c == 0xfc && d < 0x84)
3741 return 0;
3743 else
3744 return 0;
3746 if (str_idx + char_len > input->len)
3747 return 0;
3749 for (i = 1; i < char_len; ++i)
3751 d = re_string_byte_at (input, str_idx + i);
3752 if (d < 0x80 || d > 0xbf)
3753 return 0;
3755 return char_len;
3758 char_len = re_string_char_size_at (input, str_idx);
3759 if (node->type == OP_PERIOD)
3761 if (char_len <= 1)
3762 return 0;
3763 /* FIXME: I don't think this if is needed, as both '\n'
3764 and '\0' are char_len == 1. */
3765 /* '.' accepts any one character except the following two cases. */
3766 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3767 re_string_byte_at (input, str_idx) == '\n') ||
3768 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3769 re_string_byte_at (input, str_idx) == '\0'))
3770 return 0;
3771 return char_len;
3774 elem_len = re_string_elem_size_at (input, str_idx);
3775 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3776 return 0;
3778 if (node->type == COMPLEX_BRACKET)
3780 const re_charset_t *cset = node->opr.mbcset;
3781 # ifdef _LIBC
3782 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3783 + str_idx);
3784 int j;
3785 uint32_t nrules;
3786 # endif /* _LIBC */
3787 int match_len = 0;
3788 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3789 ? re_string_wchar_at (input, str_idx) : 0);
3791 /* match with multibyte character? */
3792 for (i = 0; i < cset->nmbchars; ++i)
3793 if (wc == cset->mbchars[i])
3795 match_len = char_len;
3796 goto check_node_accept_bytes_match;
3798 /* match with character_class? */
3799 for (i = 0; i < cset->nchar_classes; ++i)
3801 wctype_t wt = cset->char_classes[i];
3802 if (__iswctype (wc, wt))
3804 match_len = char_len;
3805 goto check_node_accept_bytes_match;
3809 # ifdef _LIBC
3810 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3811 if (nrules != 0)
3813 unsigned int in_collseq = 0;
3814 const int32_t *table, *indirect;
3815 const unsigned char *weights, *extra;
3816 const char *collseqwc;
3817 int32_t idx;
3818 /* This #include defines a local function! */
3819 # include <locale/weight.h>
3821 /* match with collating_symbol? */
3822 if (cset->ncoll_syms)
3823 extra = (const unsigned char *)
3824 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3825 for (i = 0; i < cset->ncoll_syms; ++i)
3827 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3828 /* Compare the length of input collating element and
3829 the length of current collating element. */
3830 if (*coll_sym != elem_len)
3831 continue;
3832 /* Compare each bytes. */
3833 for (j = 0; j < *coll_sym; j++)
3834 if (pin[j] != coll_sym[1 + j])
3835 break;
3836 if (j == *coll_sym)
3838 /* Match if every bytes is equal. */
3839 match_len = j;
3840 goto check_node_accept_bytes_match;
3844 if (cset->nranges)
3846 if (elem_len <= char_len)
3848 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3849 in_collseq = __collseq_table_lookup (collseqwc, wc);
3851 else
3852 in_collseq = find_collation_sequence_value (pin, elem_len);
3854 /* match with range expression? */
3855 for (i = 0; i < cset->nranges; ++i)
3856 if (cset->range_starts[i] <= in_collseq
3857 && in_collseq <= cset->range_ends[i])
3859 match_len = elem_len;
3860 goto check_node_accept_bytes_match;
3863 /* match with equivalence_class? */
3864 if (cset->nequiv_classes)
3866 const unsigned char *cp = pin;
3867 table = (const int32_t *)
3868 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3869 weights = (const unsigned char *)
3870 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3871 extra = (const unsigned char *)
3872 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3873 indirect = (const int32_t *)
3874 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3875 idx = findidx (&cp);
3876 if (idx > 0)
3877 for (i = 0; i < cset->nequiv_classes; ++i)
3879 int32_t equiv_class_idx = cset->equiv_classes[i];
3880 size_t weight_len = weights[idx];
3881 if (weight_len == weights[equiv_class_idx])
3883 int cnt = 0;
3884 while (cnt <= weight_len
3885 && (weights[equiv_class_idx + 1 + cnt]
3886 == weights[idx + 1 + cnt]))
3887 ++cnt;
3888 if (cnt > weight_len)
3890 match_len = elem_len;
3891 goto check_node_accept_bytes_match;
3897 else
3898 # endif /* _LIBC */
3900 /* match with range expression? */
3901 #if __GNUC__ >= 2
3902 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3903 #else
3904 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3905 cmp_buf[2] = wc;
3906 #endif
3907 for (i = 0; i < cset->nranges; ++i)
3909 cmp_buf[0] = cset->range_starts[i];
3910 cmp_buf[4] = cset->range_ends[i];
3911 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3912 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3914 match_len = char_len;
3915 goto check_node_accept_bytes_match;
3919 check_node_accept_bytes_match:
3920 if (!cset->non_match)
3921 return match_len;
3922 else
3924 if (match_len > 0)
3925 return 0;
3926 else
3927 return (elem_len > char_len) ? elem_len : char_len;
3930 return 0;
3933 # ifdef _LIBC
3934 static unsigned int
3935 find_collation_sequence_value (mbs, mbs_len)
3936 const unsigned char *mbs;
3937 size_t mbs_len;
3939 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3940 if (nrules == 0)
3942 if (mbs_len == 1)
3944 /* No valid character. Match it as a single byte character. */
3945 const unsigned char *collseq = (const unsigned char *)
3946 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3947 return collseq[mbs[0]];
3949 return UINT_MAX;
3951 else
3953 int32_t idx;
3954 const unsigned char *extra = (const unsigned char *)
3955 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3956 int32_t extrasize = (const unsigned char *)
3957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3959 for (idx = 0; idx < extrasize;)
3961 int mbs_cnt, found = 0;
3962 int32_t elem_mbs_len;
3963 /* Skip the name of collating element name. */
3964 idx = idx + extra[idx] + 1;
3965 elem_mbs_len = extra[idx++];
3966 if (mbs_len == elem_mbs_len)
3968 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3969 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3970 break;
3971 if (mbs_cnt == elem_mbs_len)
3972 /* Found the entry. */
3973 found = 1;
3975 /* Skip the byte sequence of the collating element. */
3976 idx += elem_mbs_len;
3977 /* Adjust for the alignment. */
3978 idx = (idx + 3) & ~3;
3979 /* Skip the collation sequence value. */
3980 idx += sizeof (uint32_t);
3981 /* Skip the wide char sequence of the collating element. */
3982 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3983 /* If we found the entry, return the sequence value. */
3984 if (found)
3985 return *(uint32_t *) (extra + idx);
3986 /* Skip the collation sequence value. */
3987 idx += sizeof (uint32_t);
3989 return UINT_MAX;
3992 # endif /* _LIBC */
3993 #endif /* RE_ENABLE_I18N */
3995 /* Check whether the node accepts the byte which is IDX-th
3996 byte of the INPUT. */
3998 static int
3999 check_node_accept (mctx, node, idx)
4000 const re_match_context_t *mctx;
4001 const re_token_t *node;
4002 int idx;
4004 unsigned char ch;
4005 ch = re_string_byte_at (&mctx->input, idx);
4006 switch (node->type)
4008 case CHARACTER:
4009 if (node->opr.c != ch)
4010 return 0;
4011 break;
4013 case SIMPLE_BRACKET:
4014 if (!bitset_contain (node->opr.sbcset, ch))
4015 return 0;
4016 break;
4018 #ifdef RE_ENABLE_I18N
4019 case OP_UTF8_PERIOD:
4020 if (ch >= 0x80)
4021 return 0;
4022 /* FALLTHROUGH */
4023 #endif
4024 case OP_PERIOD:
4025 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4026 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4027 return 0;
4028 break;
4030 default:
4031 return 0;
4034 if (node->constraint)
4036 /* The node has constraints. Check whether the current context
4037 satisfies the constraints. */
4038 unsigned int context = re_string_context_at (&mctx->input, idx,
4039 mctx->eflags);
4040 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4041 return 0;
4044 return 1;
4047 /* Extend the buffers, if the buffers have run out. */
4049 static reg_errcode_t
4050 extend_buffers (mctx)
4051 re_match_context_t *mctx;
4053 reg_errcode_t ret;
4054 re_string_t *pstr = &mctx->input;
4056 /* Double the lengthes of the buffers. */
4057 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4058 if (BE (ret != REG_NOERROR, 0))
4059 return ret;
4061 if (mctx->state_log != NULL)
4063 /* And double the length of state_log. */
4064 /* XXX We have no indication of the size of this buffer. If this
4065 allocation fail we have no indication that the state_log array
4066 does not have the right size. */
4067 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4068 pstr->bufs_len + 1);
4069 if (BE (new_array == NULL, 0))
4070 return REG_ESPACE;
4071 mctx->state_log = new_array;
4074 /* Then reconstruct the buffers. */
4075 if (pstr->icase)
4077 #ifdef RE_ENABLE_I18N
4078 if (pstr->mb_cur_max > 1)
4080 ret = build_wcs_upper_buffer (pstr);
4081 if (BE (ret != REG_NOERROR, 0))
4082 return ret;
4084 else
4085 #endif /* RE_ENABLE_I18N */
4086 build_upper_buffer (pstr);
4088 else
4090 #ifdef RE_ENABLE_I18N
4091 if (pstr->mb_cur_max > 1)
4092 build_wcs_buffer (pstr);
4093 else
4094 #endif /* RE_ENABLE_I18N */
4096 if (pstr->trans != NULL)
4097 re_string_translate_buffer (pstr);
4100 return REG_NOERROR;
4104 /* Functions for matching context. */
4106 /* Initialize MCTX. */
4108 static reg_errcode_t
4109 match_ctx_init (mctx, eflags, n)
4110 re_match_context_t *mctx;
4111 int eflags, n;
4113 mctx->eflags = eflags;
4114 mctx->match_last = -1;
4115 if (n > 0)
4117 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4118 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4119 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4120 return REG_ESPACE;
4122 /* Already zero-ed by the caller.
4123 else
4124 mctx->bkref_ents = NULL;
4125 mctx->nbkref_ents = 0;
4126 mctx->nsub_tops = 0; */
4127 mctx->abkref_ents = n;
4128 mctx->max_mb_elem_len = 1;
4129 mctx->asub_tops = n;
4130 return REG_NOERROR;
4133 /* Clean the entries which depend on the current input in MCTX.
4134 This function must be invoked when the matcher changes the start index
4135 of the input, or changes the input string. */
4137 static void
4138 match_ctx_clean (mctx)
4139 re_match_context_t *mctx;
4141 int st_idx;
4142 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4144 int sl_idx;
4145 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4146 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4148 re_sub_match_last_t *last = top->lasts[sl_idx];
4149 re_free (last->path.array);
4150 re_free (last);
4152 re_free (top->lasts);
4153 if (top->path)
4155 re_free (top->path->array);
4156 re_free (top->path);
4158 free (top);
4161 mctx->nsub_tops = 0;
4162 mctx->nbkref_ents = 0;
4165 /* Free all the memory associated with MCTX. */
4167 static void
4168 match_ctx_free (mctx)
4169 re_match_context_t *mctx;
4171 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4172 match_ctx_clean (mctx);
4173 re_free (mctx->sub_tops);
4174 re_free (mctx->bkref_ents);
4177 /* Add a new backreference entry to MCTX.
4178 Note that we assume that caller never call this function with duplicate
4179 entry, and call with STR_IDX which isn't smaller than any existing entry.
4182 static reg_errcode_t
4183 match_ctx_add_entry (mctx, node, str_idx, from, to)
4184 re_match_context_t *mctx;
4185 int node, str_idx, from, to;
4187 if (mctx->nbkref_ents >= mctx->abkref_ents)
4189 struct re_backref_cache_entry* new_entry;
4190 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4191 mctx->abkref_ents * 2);
4192 if (BE (new_entry == NULL, 0))
4194 re_free (mctx->bkref_ents);
4195 return REG_ESPACE;
4197 mctx->bkref_ents = new_entry;
4198 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4199 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4200 mctx->abkref_ents *= 2;
4202 if (mctx->nbkref_ents > 0
4203 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4204 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4206 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4207 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4208 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4209 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4211 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4212 If bit N is clear, means that this entry won't epsilon-transition to
4213 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4214 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4215 such node.
4217 A backreference does not epsilon-transition unless it is empty, so set
4218 to all zeros if FROM != TO. */
4219 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4220 = (from == to ? ~0 : 0);
4222 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4223 if (mctx->max_mb_elem_len < to - from)
4224 mctx->max_mb_elem_len = to - from;
4225 return REG_NOERROR;
4228 /* Search for the first entry which has the same str_idx, or -1 if none is
4229 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4231 static int
4232 search_cur_bkref_entry (mctx, str_idx)
4233 re_match_context_t *mctx;
4234 int str_idx;
4236 int left, right, mid, last;
4237 last = right = mctx->nbkref_ents;
4238 for (left = 0; left < right;)
4240 mid = (left + right) / 2;
4241 if (mctx->bkref_ents[mid].str_idx < str_idx)
4242 left = mid + 1;
4243 else
4244 right = mid;
4246 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4247 return left;
4248 else
4249 return -1;
4252 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4253 at STR_IDX. */
4255 static reg_errcode_t
4256 match_ctx_add_subtop (mctx, node, str_idx)
4257 re_match_context_t *mctx;
4258 int node, str_idx;
4260 #ifdef DEBUG
4261 assert (mctx->sub_tops != NULL);
4262 assert (mctx->asub_tops > 0);
4263 #endif
4264 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4266 int new_asub_tops = mctx->asub_tops * 2;
4267 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4268 re_sub_match_top_t *,
4269 new_asub_tops);
4270 if (BE (new_array == NULL, 0))
4271 return REG_ESPACE;
4272 mctx->sub_tops = new_array;
4273 mctx->asub_tops = new_asub_tops;
4275 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4276 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4277 return REG_ESPACE;
4278 mctx->sub_tops[mctx->nsub_tops]->node = node;
4279 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4280 return REG_NOERROR;
4283 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4284 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4286 static re_sub_match_last_t *
4287 match_ctx_add_sublast (subtop, node, str_idx)
4288 re_sub_match_top_t *subtop;
4289 int node, str_idx;
4291 re_sub_match_last_t *new_entry;
4292 if (BE (subtop->nlasts == subtop->alasts, 0))
4294 int new_alasts = 2 * subtop->alasts + 1;
4295 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4296 re_sub_match_last_t *,
4297 new_alasts);
4298 if (BE (new_array == NULL, 0))
4299 return NULL;
4300 subtop->lasts = new_array;
4301 subtop->alasts = new_alasts;
4303 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4304 if (BE (new_entry != NULL, 1))
4306 subtop->lasts[subtop->nlasts] = new_entry;
4307 new_entry->node = node;
4308 new_entry->str_idx = str_idx;
4309 ++subtop->nlasts;
4311 return new_entry;
4314 static void
4315 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4316 re_sift_context_t *sctx;
4317 re_dfastate_t **sifted_sts, **limited_sts;
4318 int last_node, last_str_idx;
4320 sctx->sifted_states = sifted_sts;
4321 sctx->limited_states = limited_sts;
4322 sctx->last_node = last_node;
4323 sctx->last_str_idx = last_str_idx;
4324 re_node_set_init_empty (&sctx->limits);