* pthread_create.c (pthread_cancel): Add PTHREAD_STATIC_FN_REQUIRE.
[glibc.git] / posix / regexec.c
blob5877adeb557352e00d564f423fa81a7bbef89401
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65 static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71 static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int *dests, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81 static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
87 #ifdef RE_ENABLE_I18N
88 static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91 #endif /* RE_ENABLE_I18N */
92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141 #if 0
142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145 #endif
146 #ifdef RE_ENABLE_I18N
147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149 #endif /* RE_ENABLE_I18N */
150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180 #ifdef RE_ENABLE_I18N
181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183 # ifdef _LIBC
184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186 # endif /* _LIBC */
187 #endif /* RE_ENABLE_I18N */
188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192 static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
196 /* Entry point for POSIX code. */
198 /* regexec searches for a given pattern, specified by PREG, in the
199 string STRING.
201 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
202 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
203 least NMATCH elements, and we set them to the offsets of the
204 corresponding matched substrings.
206 EFLAGS specifies `execution flags' which affect matching: if
207 REG_NOTBOL is set, then ^ does not match at the beginning of the
208 string; if REG_NOTEOL is set, then $ does not match at the end.
210 We return 0 if we find a match and REG_NOMATCH if not. */
213 regexec (preg, string, nmatch, pmatch, eflags)
214 const regex_t *__restrict preg;
215 const char *__restrict string;
216 size_t nmatch;
217 regmatch_t pmatch[];
218 int eflags;
220 reg_errcode_t err;
221 int start, length;
223 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
224 return REG_BADPAT;
226 if (eflags & REG_STARTEND)
228 start = pmatch[0].rm_so;
229 length = pmatch[0].rm_eo;
231 else
233 start = 0;
234 length = strlen (string);
236 if (preg->no_sub)
237 err = re_search_internal (preg, string, length, start, length - start,
238 length, 0, NULL, eflags);
239 else
240 err = re_search_internal (preg, string, length, start, length - start,
241 length, nmatch, pmatch, eflags);
242 return err != REG_NOERROR;
245 #ifdef _LIBC
246 # include <shlib-compat.h>
247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
250 __typeof__ (__regexec) __compat_regexec;
253 attribute_compat_text_section
254 __compat_regexec (const regex_t *__restrict preg,
255 const char *__restrict string, size_t nmatch,
256 regmatch_t pmatch[], int eflags)
258 return regexec (preg, string, nmatch, pmatch,
259 eflags & (REG_NOTBOL | REG_NOTEOL));
261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
262 # endif
263 #endif
265 /* Entry points for GNU code. */
267 /* re_match, re_search, re_match_2, re_search_2
269 The former two functions operate on STRING with length LENGTH,
270 while the later two operate on concatenation of STRING1 and STRING2
271 with lengths LENGTH1 and LENGTH2, respectively.
273 re_match() matches the compiled pattern in BUFP against the string,
274 starting at index START.
276 re_search() first tries matching at index START, then it tries to match
277 starting from index START + 1, and so on. The last start position tried
278 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
279 way as re_match().)
281 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
282 the first STOP characters of the concatenation of the strings should be
283 concerned.
285 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
286 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
287 computed relative to the concatenation, not relative to the individual
288 strings.)
290 On success, re_match* functions return the length of the match, re_search*
291 return the position of the start of the match. Return value -1 means no
292 match was found and -2 indicates an internal error. */
295 re_match (bufp, string, length, start, regs)
296 struct re_pattern_buffer *bufp;
297 const char *string;
298 int length, start;
299 struct re_registers *regs;
301 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
303 #ifdef _LIBC
304 weak_alias (__re_match, re_match)
305 #endif
308 re_search (bufp, string, length, start, range, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start, range;
312 struct re_registers *regs;
314 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
316 #ifdef _LIBC
317 weak_alias (__re_search, re_search)
318 #endif
321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
322 struct re_pattern_buffer *bufp;
323 const char *string1, *string2;
324 int length1, length2, start, stop;
325 struct re_registers *regs;
327 return re_search_2_stub (bufp, string1, length1, string2, length2,
328 start, 0, regs, stop, 1);
330 #ifdef _LIBC
331 weak_alias (__re_match_2, re_match_2)
332 #endif
335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
336 struct re_pattern_buffer *bufp;
337 const char *string1, *string2;
338 int length1, length2, start, range, stop;
339 struct re_registers *regs;
341 return re_search_2_stub (bufp, string1, length1, string2, length2,
342 start, range, regs, stop, 0);
344 #ifdef _LIBC
345 weak_alias (__re_search_2, re_search_2)
346 #endif
348 static int
349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
350 stop, ret_len)
351 struct re_pattern_buffer *bufp;
352 const char *string1, *string2;
353 int length1, length2, start, range, stop, ret_len;
354 struct re_registers *regs;
356 const char *str;
357 int rval;
358 int len = length1 + length2;
359 int free_str = 0;
361 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
362 return -2;
364 /* Concatenate the strings. */
365 if (length2 > 0)
366 if (length1 > 0)
368 char *s = re_malloc (char, len);
370 if (BE (s == NULL, 0))
371 return -2;
372 memcpy (s, string1, length1);
373 memcpy (s + length1, string2, length2);
374 str = s;
375 free_str = 1;
377 else
378 str = string2;
379 else
380 str = string1;
382 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
383 ret_len);
384 if (free_str)
385 re_free ((char *) str);
386 return rval;
389 /* The parameters have the same meaning as those of re_search.
390 Additional parameters:
391 If RET_LEN is nonzero the length of the match is returned (re_match style);
392 otherwise the position of the match is returned. */
394 static int
395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
396 struct re_pattern_buffer *bufp;
397 const char *string;
398 int length, start, range, stop, ret_len;
399 struct re_registers *regs;
401 reg_errcode_t result;
402 regmatch_t *pmatch;
403 int nregs, rval;
404 int eflags = 0;
406 /* Check for out-of-range. */
407 if (BE (start < 0 || start > length, 0))
408 return -1;
409 if (BE (start + range > length, 0))
410 range = length - start;
411 else if (BE (start + range < 0, 0))
412 range = -start;
414 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
415 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417 /* Compile fastmap if we haven't yet. */
418 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
419 re_compile_fastmap (bufp);
421 if (BE (bufp->no_sub, 0))
422 regs = NULL;
424 /* We need at least 1 register. */
425 if (regs == NULL)
426 nregs = 1;
427 else if (BE (bufp->regs_allocated == REGS_FIXED &&
428 regs->num_regs < bufp->re_nsub + 1, 0))
430 nregs = regs->num_regs;
431 if (BE (nregs < 1, 0))
433 /* Nothing can be copied to regs. */
434 regs = NULL;
435 nregs = 1;
438 else
439 nregs = bufp->re_nsub + 1;
440 pmatch = re_malloc (regmatch_t, nregs);
441 if (BE (pmatch == NULL, 0))
442 return -2;
444 result = re_search_internal (bufp, string, length, start, range, stop,
445 nregs, pmatch, eflags);
447 rval = 0;
449 /* I hope we needn't fill ther regs with -1's when no match was found. */
450 if (result != REG_NOERROR)
451 rval = -1;
452 else if (regs != NULL)
454 /* If caller wants register contents data back, copy them. */
455 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
456 bufp->regs_allocated);
457 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
458 rval = -2;
461 if (BE (rval == 0, 1))
463 if (ret_len)
465 assert (pmatch[0].rm_so == start);
466 rval = pmatch[0].rm_eo - start;
468 else
469 rval = pmatch[0].rm_so;
471 re_free (pmatch);
472 return rval;
475 static unsigned
476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
477 struct re_registers *regs;
478 regmatch_t *pmatch;
479 int nregs, regs_allocated;
481 int rval = REGS_REALLOCATE;
482 int i;
483 int need_regs = nregs + 1;
484 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
485 uses. */
487 /* Have the register data arrays been allocated? */
488 if (regs_allocated == REGS_UNALLOCATED)
489 { /* No. So allocate them with malloc. */
490 regs->start = re_malloc (regoff_t, need_regs);
491 regs->end = re_malloc (regoff_t, need_regs);
492 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
493 return REGS_UNALLOCATED;
494 regs->num_regs = need_regs;
496 else if (regs_allocated == REGS_REALLOCATE)
497 { /* Yes. If we need more elements than were already
498 allocated, reallocate them. If we need fewer, just
499 leave it alone. */
500 if (BE (need_regs > regs->num_regs, 0))
502 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
503 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
504 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
505 return REGS_UNALLOCATED;
506 regs->start = new_start;
507 regs->end = new_end;
508 regs->num_regs = need_regs;
511 else
513 assert (regs_allocated == REGS_FIXED);
514 /* This function may not be called with REGS_FIXED and nregs too big. */
515 assert (regs->num_regs >= nregs);
516 rval = REGS_FIXED;
519 /* Copy the regs. */
520 for (i = 0; i < nregs; ++i)
522 regs->start[i] = pmatch[i].rm_so;
523 regs->end[i] = pmatch[i].rm_eo;
525 for ( ; i < regs->num_regs; ++i)
526 regs->start[i] = regs->end[i] = -1;
528 return rval;
531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
532 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
533 this memory for recording register information. STARTS and ENDS
534 must be allocated using the malloc library routine, and must each
535 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537 If NUM_REGS == 0, then subsequent matches should allocate their own
538 register data.
540 Unless this function is called, the first search or match using
541 PATTERN_BUFFER will allocate its own register data, without
542 freeing the old data. */
544 void
545 re_set_registers (bufp, regs, num_regs, starts, ends)
546 struct re_pattern_buffer *bufp;
547 struct re_registers *regs;
548 unsigned num_regs;
549 regoff_t *starts, *ends;
551 if (num_regs)
553 bufp->regs_allocated = REGS_REALLOCATE;
554 regs->num_regs = num_regs;
555 regs->start = starts;
556 regs->end = ends;
558 else
560 bufp->regs_allocated = REGS_UNALLOCATED;
561 regs->num_regs = 0;
562 regs->start = regs->end = (regoff_t *) 0;
565 #ifdef _LIBC
566 weak_alias (__re_set_registers, re_set_registers)
567 #endif
569 /* Entry points compatible with 4.2 BSD regex library. We don't define
570 them unless specifically requested. */
572 #if defined _REGEX_RE_COMP || defined _LIBC
574 # ifdef _LIBC
575 weak_function
576 # endif
577 re_exec (s)
578 const char *s;
580 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
582 #endif /* _REGEX_RE_COMP */
584 /* Internal entry point. */
586 /* Searches for a compiled pattern PREG in the string STRING, whose
587 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
588 mingings with regexec. START, and RANGE have the same meanings
589 with re_search.
590 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
591 otherwise return the error code.
592 Note: We assume front end functions already check ranges.
593 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
595 static reg_errcode_t
596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
597 eflags)
598 const regex_t *preg;
599 const char *string;
600 int length, start, range, stop, eflags;
601 size_t nmatch;
602 regmatch_t pmatch[];
604 reg_errcode_t err;
605 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
606 int left_lim, right_lim, incr;
607 int fl_longest_match, match_first, match_kind, match_last = -1;
608 int sb, ch;
609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
610 re_match_context_t mctx = { .dfa = dfa };
611 #else
612 re_match_context_t mctx;
613 #endif
614 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
615 && range && !preg->can_be_null) ? preg->fastmap : NULL;
616 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
618 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
619 memset (&mctx, '\0', sizeof (re_match_context_t));
620 mctx.dfa = dfa;
621 #endif
623 /* Check if the DFA haven't been compiled. */
624 if (BE (preg->used == 0 || dfa->init_state == NULL
625 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
626 || dfa->init_state_begbuf == NULL, 0))
627 return REG_NOMATCH;
629 #ifdef DEBUG
630 /* We assume front-end functions already check them. */
631 assert (start + range >= 0 && start + range <= length);
632 #endif
634 /* If initial states with non-begbuf contexts have no elements,
635 the regex must be anchored. If preg->newline_anchor is set,
636 we'll never use init_state_nl, so do not check it. */
637 if (dfa->init_state->nodes.nelem == 0
638 && dfa->init_state_word->nodes.nelem == 0
639 && (dfa->init_state_nl->nodes.nelem == 0
640 || !preg->newline_anchor))
642 if (start != 0 && start + range != 0)
643 return REG_NOMATCH;
644 start = range = 0;
647 /* We must check the longest matching, if nmatch > 0. */
648 fl_longest_match = (nmatch != 0 || dfa->nbackref);
650 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
651 preg->translate, preg->syntax & RE_ICASE, dfa);
652 if (BE (err != REG_NOERROR, 0))
653 goto free_return;
654 mctx.input.stop = stop;
655 mctx.input.raw_stop = stop;
656 mctx.input.newline_anchor = preg->newline_anchor;
658 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
659 if (BE (err != REG_NOERROR, 0))
660 goto free_return;
662 /* We will log all the DFA states through which the dfa pass,
663 if nmatch > 1, or this dfa has "multibyte node", which is a
664 back-reference or a node which can accept multibyte character or
665 multi character collating element. */
666 if (nmatch > 1 || dfa->has_mb_node)
668 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
669 if (BE (mctx.state_log == NULL, 0))
671 err = REG_ESPACE;
672 goto free_return;
675 else
676 mctx.state_log = NULL;
678 match_first = start;
679 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
680 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
682 /* Check incrementally whether of not the input string match. */
683 incr = (range < 0) ? -1 : 1;
684 left_lim = (range < 0) ? start + range : start;
685 right_lim = (range < 0) ? start : start + range;
686 sb = dfa->mb_cur_max == 1;
687 match_kind =
688 (fastmap
689 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
690 | (range >= 0 ? 2 : 0)
691 | (t != NULL ? 1 : 0))
692 : 8);
694 for (;; match_first += incr)
696 err = REG_NOMATCH;
697 if (match_first < left_lim || right_lim < match_first)
698 goto free_return;
700 /* Advance as rapidly as possible through the string, until we
701 find a plausible place to start matching. This may be done
702 with varying efficiency, so there are various possibilities:
703 only the most common of them are specialized, in order to
704 save on code size. We use a switch statement for speed. */
705 switch (match_kind)
707 case 8:
708 /* No fastmap. */
709 break;
711 case 7:
712 /* Fastmap with single-byte translation, match forward. */
713 while (BE (match_first < right_lim, 1)
714 && !fastmap[t[(unsigned char) string[match_first]]])
715 ++match_first;
716 goto forward_match_found_start_or_reached_end;
718 case 6:
719 /* Fastmap without translation, match forward. */
720 while (BE (match_first < right_lim, 1)
721 && !fastmap[(unsigned char) string[match_first]])
722 ++match_first;
724 forward_match_found_start_or_reached_end:
725 if (BE (match_first == right_lim, 0))
727 ch = match_first >= length
728 ? 0 : (unsigned char) string[match_first];
729 if (!fastmap[t ? t[ch] : ch])
730 goto free_return;
732 break;
734 case 4:
735 case 5:
736 /* Fastmap without multi-byte translation, match backwards. */
737 while (match_first >= left_lim)
739 ch = match_first >= length
740 ? 0 : (unsigned char) string[match_first];
741 if (fastmap[t ? t[ch] : ch])
742 break;
743 --match_first;
745 if (match_first < left_lim)
746 goto free_return;
747 break;
749 default:
750 /* In this case, we can't determine easily the current byte,
751 since it might be a component byte of a multibyte
752 character. Then we use the constructed buffer instead. */
753 for (;;)
755 /* If MATCH_FIRST is out of the valid range, reconstruct the
756 buffers. */
757 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
758 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
760 err = re_string_reconstruct (&mctx.input, match_first,
761 eflags);
762 if (BE (err != REG_NOERROR, 0))
763 goto free_return;
765 offset = match_first - mctx.input.raw_mbs_idx;
767 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
768 Note that MATCH_FIRST must not be smaller than 0. */
769 ch = (match_first >= length
770 ? 0 : re_string_byte_at (&mctx.input, offset));
771 if (fastmap[ch])
772 break;
773 match_first += incr;
774 if (match_first < left_lim || match_first > right_lim)
776 err = REG_NOMATCH;
777 goto free_return;
780 break;
783 /* Reconstruct the buffers so that the matcher can assume that
784 the matching starts from the beginning of the buffer. */
785 err = re_string_reconstruct (&mctx.input, match_first, eflags);
786 if (BE (err != REG_NOERROR, 0))
787 goto free_return;
789 #ifdef RE_ENABLE_I18N
790 /* Don't consider this char as a possible match start if it part,
791 yet isn't the head, of a multibyte character. */
792 if (!sb && !re_string_first_byte (&mctx.input, 0))
793 continue;
794 #endif
796 /* It seems to be appropriate one, then use the matcher. */
797 /* We assume that the matching starts from 0. */
798 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
799 match_last = check_matching (&mctx, fl_longest_match,
800 range >= 0 ? &match_first : NULL);
801 if (match_last != -1)
803 if (BE (match_last == -2, 0))
805 err = REG_ESPACE;
806 goto free_return;
808 else
810 mctx.match_last = match_last;
811 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
813 re_dfastate_t *pstate = mctx.state_log[match_last];
814 mctx.last_node = check_halt_state_context (&mctx, pstate,
815 match_last);
817 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
818 || dfa->nbackref)
820 err = prune_impossible_nodes (&mctx);
821 if (err == REG_NOERROR)
822 break;
823 if (BE (err != REG_NOMATCH, 0))
824 goto free_return;
825 match_last = -1;
827 else
828 break; /* We found a match. */
832 match_ctx_clean (&mctx);
835 #ifdef DEBUG
836 assert (match_last != -1);
837 assert (err == REG_NOERROR);
838 #endif
840 /* Set pmatch[] if we need. */
841 if (nmatch > 0)
843 int reg_idx;
845 /* Initialize registers. */
846 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
847 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
849 /* Set the points where matching start/end. */
850 pmatch[0].rm_so = 0;
851 pmatch[0].rm_eo = mctx.match_last;
853 if (!preg->no_sub && nmatch > 1)
855 err = set_regs (preg, &mctx, nmatch, pmatch,
856 dfa->has_plural_match && dfa->nbackref > 0);
857 if (BE (err != REG_NOERROR, 0))
858 goto free_return;
861 /* At last, add the offset to the each registers, since we slided
862 the buffers so that we could assume that the matching starts
863 from 0. */
864 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
865 if (pmatch[reg_idx].rm_so != -1)
867 #ifdef RE_ENABLE_I18N
868 if (BE (mctx.input.offsets_needed != 0, 0))
870 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
871 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
872 else
873 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
874 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
875 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
876 else
877 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
879 #else
880 assert (mctx.input.offsets_needed == 0);
881 #endif
882 pmatch[reg_idx].rm_so += match_first;
883 pmatch[reg_idx].rm_eo += match_first;
886 if (dfa->subexp_map)
887 for (reg_idx = 0;
888 reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
889 reg_idx++)
890 if (dfa->subexp_map[reg_idx] != reg_idx)
892 pmatch[reg_idx + 1].rm_so
893 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
894 pmatch[reg_idx + 1].rm_eo
895 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
899 free_return:
900 re_free (mctx.state_log);
901 if (dfa->nbackref)
902 match_ctx_free (&mctx);
903 re_string_destruct (&mctx.input);
904 return err;
907 static reg_errcode_t
908 prune_impossible_nodes (mctx)
909 re_match_context_t *mctx;
911 re_dfa_t *const dfa = mctx->dfa;
912 int halt_node, match_last;
913 reg_errcode_t ret;
914 re_dfastate_t **sifted_states;
915 re_dfastate_t **lim_states = NULL;
916 re_sift_context_t sctx;
917 #ifdef DEBUG
918 assert (mctx->state_log != NULL);
919 #endif
920 match_last = mctx->match_last;
921 halt_node = mctx->last_node;
922 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
923 if (BE (sifted_states == NULL, 0))
925 ret = REG_ESPACE;
926 goto free_return;
928 if (dfa->nbackref)
930 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
931 if (BE (lim_states == NULL, 0))
933 ret = REG_ESPACE;
934 goto free_return;
936 while (1)
938 memset (lim_states, '\0',
939 sizeof (re_dfastate_t *) * (match_last + 1));
940 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
941 match_last);
942 ret = sift_states_backward (mctx, &sctx);
943 re_node_set_free (&sctx.limits);
944 if (BE (ret != REG_NOERROR, 0))
945 goto free_return;
946 if (sifted_states[0] != NULL || lim_states[0] != NULL)
947 break;
950 --match_last;
951 if (match_last < 0)
953 ret = REG_NOMATCH;
954 goto free_return;
956 } while (mctx->state_log[match_last] == NULL
957 || !mctx->state_log[match_last]->halt);
958 halt_node = check_halt_state_context (mctx,
959 mctx->state_log[match_last],
960 match_last);
962 ret = merge_state_array (dfa, sifted_states, lim_states,
963 match_last + 1);
964 re_free (lim_states);
965 lim_states = NULL;
966 if (BE (ret != REG_NOERROR, 0))
967 goto free_return;
969 else
971 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
972 ret = sift_states_backward (mctx, &sctx);
973 re_node_set_free (&sctx.limits);
974 if (BE (ret != REG_NOERROR, 0))
975 goto free_return;
977 re_free (mctx->state_log);
978 mctx->state_log = sifted_states;
979 sifted_states = NULL;
980 mctx->last_node = halt_node;
981 mctx->match_last = match_last;
982 ret = REG_NOERROR;
983 free_return:
984 re_free (sifted_states);
985 re_free (lim_states);
986 return ret;
989 /* Acquire an initial state and return it.
990 We must select appropriate initial state depending on the context,
991 since initial states may have constraints like "\<", "^", etc.. */
993 static inline re_dfastate_t *
994 acquire_init_state_context (err, mctx, idx)
995 reg_errcode_t *err;
996 const re_match_context_t *mctx;
997 int idx;
999 re_dfa_t *const dfa = mctx->dfa;
1000 if (dfa->init_state->has_constraint)
1002 unsigned int context;
1003 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1004 if (IS_WORD_CONTEXT (context))
1005 return dfa->init_state_word;
1006 else if (IS_ORDINARY_CONTEXT (context))
1007 return dfa->init_state;
1008 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1009 return dfa->init_state_begbuf;
1010 else if (IS_NEWLINE_CONTEXT (context))
1011 return dfa->init_state_nl;
1012 else if (IS_BEGBUF_CONTEXT (context))
1014 /* It is relatively rare case, then calculate on demand. */
1015 return re_acquire_state_context (err, dfa,
1016 dfa->init_state->entrance_nodes,
1017 context);
1019 else
1020 /* Must not happen? */
1021 return dfa->init_state;
1023 else
1024 return dfa->init_state;
1027 /* Check whether the regular expression match input string INPUT or not,
1028 and return the index where the matching end, return -1 if not match,
1029 or return -2 in case of an error.
1030 FL_LONGEST_MATCH means we want the POSIX longest matching.
1031 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1032 next place where we may want to try matching.
1033 Note that the matcher assume that the maching starts from the current
1034 index of the buffer. */
1036 static int
1037 check_matching (mctx, fl_longest_match, p_match_first)
1038 re_match_context_t *mctx;
1039 int fl_longest_match;
1040 int *p_match_first;
1042 re_dfa_t *const dfa = mctx->dfa;
1043 reg_errcode_t err;
1044 int match = 0;
1045 int match_last = -1;
1046 int cur_str_idx = re_string_cur_idx (&mctx->input);
1047 re_dfastate_t *cur_state;
1048 int at_init_state = p_match_first != NULL;
1049 int next_start_idx = cur_str_idx;
1051 err = REG_NOERROR;
1052 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1053 /* An initial state must not be NULL (invalid). */
1054 if (BE (cur_state == NULL, 0))
1056 assert (err == REG_ESPACE);
1057 return -2;
1060 if (mctx->state_log != NULL)
1062 mctx->state_log[cur_str_idx] = cur_state;
1064 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1065 later. E.g. Processing back references. */
1066 if (BE (dfa->nbackref, 0))
1068 at_init_state = 0;
1069 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1070 if (BE (err != REG_NOERROR, 0))
1071 return err;
1073 if (cur_state->has_backref)
1075 err = transit_state_bkref (mctx, &cur_state->nodes);
1076 if (BE (err != REG_NOERROR, 0))
1077 return err;
1082 /* If the RE accepts NULL string. */
1083 if (BE (cur_state->halt, 0))
1085 if (!cur_state->has_constraint
1086 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1088 if (!fl_longest_match)
1089 return cur_str_idx;
1090 else
1092 match_last = cur_str_idx;
1093 match = 1;
1098 while (!re_string_eoi (&mctx->input))
1100 re_dfastate_t *old_state = cur_state;
1101 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1103 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1104 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1105 && mctx->input.valid_len < mctx->input.len))
1107 err = extend_buffers (mctx);
1108 if (BE (err != REG_NOERROR, 0))
1110 assert (err == REG_ESPACE);
1111 return -2;
1115 cur_state = transit_state (&err, mctx, cur_state);
1116 if (mctx->state_log != NULL)
1117 cur_state = merge_state_with_log (&err, mctx, cur_state);
1119 if (cur_state == NULL)
1121 /* Reached the invalid state or an error. Try to recover a valid
1122 state using the state log, if available and if we have not
1123 already found a valid (even if not the longest) match. */
1124 if (BE (err != REG_NOERROR, 0))
1125 return -2;
1127 if (mctx->state_log == NULL
1128 || (match && !fl_longest_match)
1129 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1130 break;
1133 if (BE (at_init_state, 0))
1135 if (old_state == cur_state)
1136 next_start_idx = next_char_idx;
1137 else
1138 at_init_state = 0;
1141 if (cur_state->halt)
1143 /* Reached a halt state.
1144 Check the halt state can satisfy the current context. */
1145 if (!cur_state->has_constraint
1146 || check_halt_state_context (mctx, cur_state,
1147 re_string_cur_idx (&mctx->input)))
1149 /* We found an appropriate halt state. */
1150 match_last = re_string_cur_idx (&mctx->input);
1151 match = 1;
1153 /* We found a match, do not modify match_first below. */
1154 p_match_first = NULL;
1155 if (!fl_longest_match)
1156 break;
1161 if (p_match_first)
1162 *p_match_first += next_start_idx;
1164 return match_last;
1167 /* Check NODE match the current context. */
1169 static int check_halt_node_context (dfa, node, context)
1170 const re_dfa_t *dfa;
1171 int node;
1172 unsigned int context;
1174 re_token_type_t type = dfa->nodes[node].type;
1175 unsigned int constraint = dfa->nodes[node].constraint;
1176 if (type != END_OF_RE)
1177 return 0;
1178 if (!constraint)
1179 return 1;
1180 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1181 return 0;
1182 return 1;
1185 /* Check the halt state STATE match the current context.
1186 Return 0 if not match, if the node, STATE has, is a halt node and
1187 match the context, return the node. */
1189 static int
1190 check_halt_state_context (mctx, state, idx)
1191 const re_match_context_t *mctx;
1192 const re_dfastate_t *state;
1193 int idx;
1195 int i;
1196 unsigned int context;
1197 #ifdef DEBUG
1198 assert (state->halt);
1199 #endif
1200 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201 for (i = 0; i < state->nodes.nelem; ++i)
1202 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203 return state->nodes.elems[i];
1204 return 0;
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208 corresponding to the DFA).
1209 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1210 of errors. */
1212 static int
1213 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1214 const re_match_context_t *mctx;
1215 regmatch_t *regs;
1216 int nregs, *pidx, node;
1217 re_node_set *eps_via_nodes;
1218 struct re_fail_stack_t *fs;
1220 re_dfa_t *const dfa = mctx->dfa;
1221 int i, err, dest_node;
1222 dest_node = -1;
1223 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1225 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1226 int ndest, dest_nodes[2];
1227 err = re_node_set_insert (eps_via_nodes, node);
1228 if (BE (err < 0, 0))
1229 return -2;
1230 /* Pick up valid destinations. */
1231 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1233 int candidate = dfa->edests[node].elems[i];
1234 if (!re_node_set_contains (cur_nodes, candidate))
1235 continue;
1236 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1237 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1238 ++ndest;
1240 if (ndest <= 1)
1241 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1242 /* In order to avoid infinite loop like "(a*)*". */
1243 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1244 return dest_nodes[1];
1245 if (fs != NULL
1246 && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1247 eps_via_nodes))
1248 return -2;
1249 return dest_nodes[0];
1251 else
1253 int naccepted = 0;
1254 re_token_type_t type = dfa->nodes[node].type;
1256 #ifdef RE_ENABLE_I18N
1257 if (ACCEPT_MB_NODE (type))
1258 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1259 else
1260 #endif /* RE_ENABLE_I18N */
1261 if (type == OP_BACK_REF)
1263 int subexp_idx = dfa->nodes[node].opr.idx;
1264 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1265 if (fs != NULL)
1267 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1268 return -1;
1269 else if (naccepted)
1271 char *buf = (char *) re_string_get_buffer (&mctx->input);
1272 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1273 naccepted) != 0)
1274 return -1;
1278 if (naccepted == 0)
1280 err = re_node_set_insert (eps_via_nodes, node);
1281 if (BE (err < 0, 0))
1282 return -2;
1283 dest_node = dfa->edests[node].elems[0];
1284 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1285 dest_node))
1286 return dest_node;
1290 if (naccepted != 0
1291 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1293 dest_node = dfa->nexts[node];
1294 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1295 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1296 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1297 dest_node)))
1298 return -1;
1299 re_node_set_empty (eps_via_nodes);
1300 return dest_node;
1303 return -1;
1306 static reg_errcode_t
1307 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1308 struct re_fail_stack_t *fs;
1309 int str_idx, *dests, nregs;
1310 regmatch_t *regs;
1311 re_node_set *eps_via_nodes;
1313 reg_errcode_t err;
1314 int num = fs->num++;
1315 if (fs->num == fs->alloc)
1317 struct re_fail_stack_ent_t *new_array;
1318 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1319 * fs->alloc * 2));
1320 if (new_array == NULL)
1321 return REG_ESPACE;
1322 fs->alloc *= 2;
1323 fs->stack = new_array;
1325 fs->stack[num].idx = str_idx;
1326 fs->stack[num].node = dests[1];
1327 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1328 if (fs->stack[num].regs == NULL)
1329 return REG_ESPACE;
1330 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1331 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1332 return err;
1335 static int
1336 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1337 struct re_fail_stack_t *fs;
1338 int *pidx, nregs;
1339 regmatch_t *regs;
1340 re_node_set *eps_via_nodes;
1342 int num = --fs->num;
1343 assert (num >= 0);
1344 *pidx = fs->stack[num].idx;
1345 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1346 re_node_set_free (eps_via_nodes);
1347 re_free (fs->stack[num].regs);
1348 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1349 return fs->stack[num].node;
1352 /* Set the positions where the subexpressions are starts/ends to registers
1353 PMATCH.
1354 Note: We assume that pmatch[0] is already set, and
1355 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1357 static reg_errcode_t
1358 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1359 const regex_t *preg;
1360 const re_match_context_t *mctx;
1361 size_t nmatch;
1362 regmatch_t *pmatch;
1363 int fl_backtrack;
1365 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1366 int idx, cur_node, real_nmatch;
1367 re_node_set eps_via_nodes;
1368 struct re_fail_stack_t *fs;
1369 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1370 regmatch_t *prev_idx_match;
1372 #ifdef DEBUG
1373 assert (nmatch > 1);
1374 assert (mctx->state_log != NULL);
1375 #endif
1376 if (fl_backtrack)
1378 fs = &fs_body;
1379 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1380 if (fs->stack == NULL)
1381 return REG_ESPACE;
1383 else
1384 fs = NULL;
1386 cur_node = dfa->init_node;
1387 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1388 re_node_set_init_empty (&eps_via_nodes);
1390 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1391 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1393 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1395 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1397 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1399 int reg_idx;
1400 if (fs)
1402 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1403 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1404 break;
1405 if (reg_idx == nmatch)
1407 re_node_set_free (&eps_via_nodes);
1408 return free_fail_stack_return (fs);
1410 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1411 &eps_via_nodes);
1413 else
1415 re_node_set_free (&eps_via_nodes);
1416 return REG_NOERROR;
1420 /* Proceed to next node. */
1421 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1422 &eps_via_nodes, fs);
1424 if (BE (cur_node < 0, 0))
1426 if (BE (cur_node == -2, 0))
1428 re_node_set_free (&eps_via_nodes);
1429 free_fail_stack_return (fs);
1430 return REG_ESPACE;
1432 if (fs)
1433 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1434 &eps_via_nodes);
1435 else
1437 re_node_set_free (&eps_via_nodes);
1438 return REG_NOMATCH;
1442 re_node_set_free (&eps_via_nodes);
1443 return free_fail_stack_return (fs);
1446 static reg_errcode_t
1447 free_fail_stack_return (fs)
1448 struct re_fail_stack_t *fs;
1450 if (fs)
1452 int fs_idx;
1453 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1455 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1456 re_free (fs->stack[fs_idx].regs);
1458 re_free (fs->stack);
1460 return REG_NOERROR;
1463 static void
1464 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1465 re_dfa_t *dfa;
1466 regmatch_t *pmatch, *prev_idx_match;
1467 int cur_node, cur_idx, nmatch;
1469 int type = dfa->nodes[cur_node].type;
1470 if (type == OP_OPEN_SUBEXP)
1472 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1474 /* We are at the first node of this sub expression. */
1475 if (reg_num < nmatch)
1477 pmatch[reg_num].rm_so = cur_idx;
1478 pmatch[reg_num].rm_eo = -1;
1481 else if (type == OP_CLOSE_SUBEXP)
1483 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1484 if (reg_num < nmatch)
1486 /* We are at the last node of this sub expression. */
1487 if (pmatch[reg_num].rm_so < cur_idx)
1489 pmatch[reg_num].rm_eo = cur_idx;
1490 /* This is a non-empty match or we are not inside an optional
1491 subexpression. Accept this right away. */
1492 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1494 else
1496 if (dfa->nodes[cur_node].opt_subexp
1497 && prev_idx_match[reg_num].rm_so != -1)
1498 /* We transited through an empty match for an optional
1499 subexpression, like (a?)*, and this is not the subexp's
1500 first match. Copy back the old content of the registers
1501 so that matches of an inner subexpression are undone as
1502 well, like in ((a?))*. */
1503 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1504 else
1505 /* We completed a subexpression, but it may be part of
1506 an optional one, so do not update PREV_IDX_MATCH. */
1507 pmatch[reg_num].rm_eo = cur_idx;
1513 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1514 and sift the nodes in each states according to the following rules.
1515 Updated state_log will be wrote to STATE_LOG.
1517 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1518 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1519 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1520 the LAST_NODE, we throw away the node `a'.
1521 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1522 string `s' and transit to `b':
1523 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1524 away the node `a'.
1525 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1526 thrown away, we throw away the node `a'.
1527 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1528 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1529 node `a'.
1530 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1531 we throw away the node `a'. */
1533 #define STATE_NODE_CONTAINS(state,node) \
1534 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1536 static reg_errcode_t
1537 sift_states_backward (mctx, sctx)
1538 re_match_context_t *mctx;
1539 re_sift_context_t *sctx;
1541 reg_errcode_t err;
1542 int null_cnt = 0;
1543 int str_idx = sctx->last_str_idx;
1544 re_node_set cur_dest;
1546 #ifdef DEBUG
1547 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1548 #endif
1550 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1551 transit to the last_node and the last_node itself. */
1552 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1553 if (BE (err != REG_NOERROR, 0))
1554 return err;
1555 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1556 if (BE (err != REG_NOERROR, 0))
1557 goto free_return;
1559 /* Then check each states in the state_log. */
1560 while (str_idx > 0)
1562 /* Update counters. */
1563 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1564 if (null_cnt > mctx->max_mb_elem_len)
1566 memset (sctx->sifted_states, '\0',
1567 sizeof (re_dfastate_t *) * str_idx);
1568 re_node_set_free (&cur_dest);
1569 return REG_NOERROR;
1571 re_node_set_empty (&cur_dest);
1572 --str_idx;
1574 if (mctx->state_log[str_idx])
1576 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1577 if (BE (err != REG_NOERROR, 0))
1578 goto free_return;
1581 /* Add all the nodes which satisfy the following conditions:
1582 - It can epsilon transit to a node in CUR_DEST.
1583 - It is in CUR_SRC.
1584 And update state_log. */
1585 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1586 if (BE (err != REG_NOERROR, 0))
1587 goto free_return;
1589 err = REG_NOERROR;
1590 free_return:
1591 re_node_set_free (&cur_dest);
1592 return err;
1595 static reg_errcode_t
1596 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1597 re_match_context_t *mctx;
1598 re_sift_context_t *sctx;
1599 int str_idx;
1600 re_node_set *cur_dest;
1602 re_dfa_t *const dfa = mctx->dfa;
1603 re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
1604 int i;
1606 /* Then build the next sifted state.
1607 We build the next sifted state on `cur_dest', and update
1608 `sifted_states[str_idx]' with `cur_dest'.
1609 Note:
1610 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1611 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1612 for (i = 0; i < cur_src->nelem; i++)
1614 int prev_node = cur_src->elems[i];
1615 int naccepted = 0;
1616 re_token_type_t type = dfa->nodes[prev_node].type;
1617 int ret;
1619 if (IS_EPSILON_NODE (type))
1620 continue;
1621 #ifdef RE_ENABLE_I18N
1622 /* If the node may accept `multi byte'. */
1623 if (ACCEPT_MB_NODE (type))
1624 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1625 str_idx, sctx->last_str_idx);
1626 #endif /* RE_ENABLE_I18N */
1628 /* We don't check backreferences here.
1629 See update_cur_sifted_state(). */
1630 if (!naccepted
1631 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1632 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1633 dfa->nexts[prev_node]))
1634 naccepted = 1;
1636 if (naccepted == 0)
1637 continue;
1639 if (sctx->limits.nelem)
1641 int to_idx = str_idx + naccepted;
1642 if (check_dst_limits (mctx, &sctx->limits,
1643 dfa->nexts[prev_node], to_idx,
1644 prev_node, str_idx))
1645 continue;
1647 ret = re_node_set_insert (cur_dest, prev_node);
1648 if (BE (ret == -1, 0))
1649 return REG_ESPACE;
1652 return REG_NOERROR;
1655 /* Helper functions. */
1657 static reg_errcode_t
1658 clean_state_log_if_needed (mctx, next_state_log_idx)
1659 re_match_context_t *mctx;
1660 int next_state_log_idx;
1662 int top = mctx->state_log_top;
1664 if (next_state_log_idx >= mctx->input.bufs_len
1665 || (next_state_log_idx >= mctx->input.valid_len
1666 && mctx->input.valid_len < mctx->input.len))
1668 reg_errcode_t err;
1669 err = extend_buffers (mctx);
1670 if (BE (err != REG_NOERROR, 0))
1671 return err;
1674 if (top < next_state_log_idx)
1676 memset (mctx->state_log + top + 1, '\0',
1677 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1678 mctx->state_log_top = next_state_log_idx;
1680 return REG_NOERROR;
1683 static reg_errcode_t
1684 merge_state_array (dfa, dst, src, num)
1685 re_dfa_t *dfa;
1686 re_dfastate_t **dst;
1687 re_dfastate_t **src;
1688 int num;
1690 int st_idx;
1691 reg_errcode_t err;
1692 for (st_idx = 0; st_idx < num; ++st_idx)
1694 if (dst[st_idx] == NULL)
1695 dst[st_idx] = src[st_idx];
1696 else if (src[st_idx] != NULL)
1698 re_node_set merged_set;
1699 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1700 &src[st_idx]->nodes);
1701 if (BE (err != REG_NOERROR, 0))
1702 return err;
1703 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1704 re_node_set_free (&merged_set);
1705 if (BE (err != REG_NOERROR, 0))
1706 return err;
1709 return REG_NOERROR;
1712 static reg_errcode_t
1713 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1714 re_match_context_t *mctx;
1715 re_sift_context_t *sctx;
1716 int str_idx;
1717 re_node_set *dest_nodes;
1719 re_dfa_t *const dfa = mctx->dfa;
1720 reg_errcode_t err;
1721 const re_node_set *candidates;
1722 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1723 : &mctx->state_log[str_idx]->nodes);
1725 if (dest_nodes->nelem == 0)
1726 sctx->sifted_states[str_idx] = NULL;
1727 else
1729 if (candidates)
1731 /* At first, add the nodes which can epsilon transit to a node in
1732 DEST_NODE. */
1733 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1734 if (BE (err != REG_NOERROR, 0))
1735 return err;
1737 /* Then, check the limitations in the current sift_context. */
1738 if (sctx->limits.nelem)
1740 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1741 mctx->bkref_ents, str_idx);
1742 if (BE (err != REG_NOERROR, 0))
1743 return err;
1747 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1748 if (BE (err != REG_NOERROR, 0))
1749 return err;
1752 if (candidates && mctx->state_log[str_idx]->has_backref)
1754 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1755 if (BE (err != REG_NOERROR, 0))
1756 return err;
1758 return REG_NOERROR;
1761 static reg_errcode_t
1762 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1763 re_dfa_t *dfa;
1764 re_node_set *dest_nodes;
1765 const re_node_set *candidates;
1767 reg_errcode_t err;
1768 int src_idx;
1769 re_node_set src_copy;
1771 err = re_node_set_init_copy (&src_copy, dest_nodes);
1772 if (BE (err != REG_NOERROR, 0))
1773 return err;
1774 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1776 err = re_node_set_add_intersect (dest_nodes, candidates,
1777 dfa->inveclosures
1778 + src_copy.elems[src_idx]);
1779 if (BE (err != REG_NOERROR, 0))
1781 re_node_set_free (&src_copy);
1782 return err;
1785 re_node_set_free (&src_copy);
1786 return REG_NOERROR;
1789 static reg_errcode_t
1790 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1791 re_dfa_t *dfa;
1792 int node;
1793 re_node_set *dest_nodes;
1794 const re_node_set *candidates;
1796 int ecl_idx;
1797 reg_errcode_t err;
1798 re_node_set *inv_eclosure = dfa->inveclosures + node;
1799 re_node_set except_nodes;
1800 re_node_set_init_empty (&except_nodes);
1801 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1803 int cur_node = inv_eclosure->elems[ecl_idx];
1804 if (cur_node == node)
1805 continue;
1806 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1808 int edst1 = dfa->edests[cur_node].elems[0];
1809 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1810 ? dfa->edests[cur_node].elems[1] : -1);
1811 if ((!re_node_set_contains (inv_eclosure, edst1)
1812 && re_node_set_contains (dest_nodes, edst1))
1813 || (edst2 > 0
1814 && !re_node_set_contains (inv_eclosure, edst2)
1815 && re_node_set_contains (dest_nodes, edst2)))
1817 err = re_node_set_add_intersect (&except_nodes, candidates,
1818 dfa->inveclosures + cur_node);
1819 if (BE (err != REG_NOERROR, 0))
1821 re_node_set_free (&except_nodes);
1822 return err;
1827 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1829 int cur_node = inv_eclosure->elems[ecl_idx];
1830 if (!re_node_set_contains (&except_nodes, cur_node))
1832 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1833 re_node_set_remove_at (dest_nodes, idx);
1836 re_node_set_free (&except_nodes);
1837 return REG_NOERROR;
1840 static int
1841 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1842 re_match_context_t *mctx;
1843 re_node_set *limits;
1844 int dst_node, dst_idx, src_node, src_idx;
1846 re_dfa_t *const dfa = mctx->dfa;
1847 int lim_idx, src_pos, dst_pos;
1849 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1850 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1851 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1853 int subexp_idx;
1854 struct re_backref_cache_entry *ent;
1855 ent = mctx->bkref_ents + limits->elems[lim_idx];
1856 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1858 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1859 subexp_idx, dst_node, dst_idx,
1860 dst_bkref_idx);
1861 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862 subexp_idx, src_node, src_idx,
1863 src_bkref_idx);
1865 /* In case of:
1866 <src> <dst> ( <subexp> )
1867 ( <subexp> ) <src> <dst>
1868 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1869 if (src_pos == dst_pos)
1870 continue; /* This is unrelated limitation. */
1871 else
1872 return 1;
1874 return 0;
1877 static int
1878 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1879 re_match_context_t *mctx;
1880 int boundaries, subexp_idx, from_node, bkref_idx;
1882 re_dfa_t *const dfa = mctx->dfa;
1883 re_node_set *eclosures = dfa->eclosures + from_node;
1884 int node_idx;
1886 /* Else, we are on the boundary: examine the nodes on the epsilon
1887 closure. */
1888 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1890 int node = eclosures->elems[node_idx];
1891 switch (dfa->nodes[node].type)
1893 case OP_BACK_REF:
1895 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1898 int dst, cpos;
1900 if (ent->node != node)
1901 continue;
1903 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1904 && (ent->eps_reachable_subexps_map
1905 & (1 << (subexp_idx - 1))) == 0)
1906 continue;
1908 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1909 OP_CLOSE_SUBEXP cases below. But, if the
1910 destination node is the same node as the source
1911 node, don't recurse because it would cause an
1912 infinite loop: a regex that exhibits this behavior
1913 is ()\1*\1* */
1914 dst = dfa->edests[node].elems[0];
1915 if (dst == from_node)
1917 if (boundaries & 1)
1918 return -1;
1919 else /* if (boundaries & 2) */
1920 return 0;
1923 cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
1924 subexp_idx, dst, bkref_idx);
1926 if (cpos == -1 /* && (boundaries & 1) */)
1927 return -1;
1929 if (cpos == 0 && (boundaries & 2))
1930 return 0;
1932 ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
1934 while (ent++->more);
1935 break;
1938 case OP_OPEN_SUBEXP:
1939 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1940 return -1;
1941 break;
1943 case OP_CLOSE_SUBEXP:
1944 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1945 return 0;
1946 break;
1948 default:
1949 break;
1953 return (boundaries & 2) ? 1 : 0;
1956 static int
1957 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1958 re_match_context_t *mctx;
1959 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1961 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1962 int boundaries;
1964 /* If we are outside the range of the subexpression, return -1 or 1. */
1965 if (str_idx < lim->subexp_from)
1966 return -1;
1968 if (lim->subexp_to < str_idx)
1969 return 1;
1971 /* If we are within the subexpression, return 0. */
1972 boundaries = (str_idx == lim->subexp_from);
1973 boundaries |= (str_idx == lim->subexp_to) << 1;
1974 if (boundaries == 0)
1975 return 0;
1977 /* Else, examine epsilon closure. */
1978 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1979 from_node, bkref_idx);
1982 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1983 which are against limitations from DEST_NODES. */
1985 static reg_errcode_t
1986 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1987 re_dfa_t *dfa;
1988 re_node_set *dest_nodes;
1989 const re_node_set *candidates;
1990 re_node_set *limits;
1991 struct re_backref_cache_entry *bkref_ents;
1992 int str_idx;
1994 reg_errcode_t err;
1995 int node_idx, lim_idx;
1997 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1999 int subexp_idx;
2000 struct re_backref_cache_entry *ent;
2001 ent = bkref_ents + limits->elems[lim_idx];
2003 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2004 continue; /* This is unrelated limitation. */
2006 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
2007 if (ent->subexp_to == str_idx)
2009 int ops_node = -1;
2010 int cls_node = -1;
2011 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2013 int node = dest_nodes->elems[node_idx];
2014 re_token_type_t type = dfa->nodes[node].type;
2015 if (type == OP_OPEN_SUBEXP
2016 && subexp_idx == dfa->nodes[node].opr.idx)
2017 ops_node = node;
2018 else if (type == OP_CLOSE_SUBEXP
2019 && subexp_idx == dfa->nodes[node].opr.idx)
2020 cls_node = node;
2023 /* Check the limitation of the open subexpression. */
2024 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2025 if (ops_node >= 0)
2027 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2028 candidates);
2029 if (BE (err != REG_NOERROR, 0))
2030 return err;
2033 /* Check the limitation of the close subexpression. */
2034 if (cls_node >= 0)
2035 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037 int node = dest_nodes->elems[node_idx];
2038 if (!re_node_set_contains (dfa->inveclosures + node,
2039 cls_node)
2040 && !re_node_set_contains (dfa->eclosures + node,
2041 cls_node))
2043 /* It is against this limitation.
2044 Remove it form the current sifted state. */
2045 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2046 candidates);
2047 if (BE (err != REG_NOERROR, 0))
2048 return err;
2049 --node_idx;
2053 else /* (ent->subexp_to != str_idx) */
2055 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057 int node = dest_nodes->elems[node_idx];
2058 re_token_type_t type = dfa->nodes[node].type;
2059 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2061 if (subexp_idx != dfa->nodes[node].opr.idx)
2062 continue;
2063 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
2064 || (type == OP_OPEN_SUBEXP))
2066 /* It is against this limitation.
2067 Remove it form the current sifted state. */
2068 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2069 candidates);
2070 if (BE (err != REG_NOERROR, 0))
2071 return err;
2077 return REG_NOERROR;
2080 static reg_errcode_t
2081 sift_states_bkref (mctx, sctx, str_idx, candidates)
2082 re_match_context_t *mctx;
2083 re_sift_context_t *sctx;
2084 int str_idx;
2085 const re_node_set *candidates;
2087 re_dfa_t *const dfa = mctx->dfa;
2088 reg_errcode_t err;
2089 int node_idx, node;
2090 re_sift_context_t local_sctx;
2091 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2093 if (first_idx == -1)
2094 return REG_NOERROR;
2096 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2098 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2100 int enabled_idx;
2101 re_token_type_t type;
2102 struct re_backref_cache_entry *entry;
2103 node = candidates->elems[node_idx];
2104 type = dfa->nodes[node].type;
2105 /* Avoid infinite loop for the REs like "()\1+". */
2106 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2107 continue;
2108 if (type != OP_BACK_REF)
2109 continue;
2111 entry = mctx->bkref_ents + first_idx;
2112 enabled_idx = first_idx;
2115 int subexp_len, to_idx, dst_node;
2116 re_dfastate_t *cur_state;
2118 if (entry->node != node)
2119 continue;
2120 subexp_len = entry->subexp_to - entry->subexp_from;
2121 to_idx = str_idx + subexp_len;
2122 dst_node = (subexp_len ? dfa->nexts[node]
2123 : dfa->edests[node].elems[0]);
2125 if (to_idx > sctx->last_str_idx
2126 || sctx->sifted_states[to_idx] == NULL
2127 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2128 || check_dst_limits (mctx, &sctx->limits, node,
2129 str_idx, dst_node, to_idx))
2130 continue;
2132 if (local_sctx.sifted_states == NULL)
2134 local_sctx = *sctx;
2135 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2136 if (BE (err != REG_NOERROR, 0))
2137 goto free_return;
2139 local_sctx.last_node = node;
2140 local_sctx.last_str_idx = str_idx;
2141 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2142 if (BE (err < 0, 0))
2144 err = REG_ESPACE;
2145 goto free_return;
2147 cur_state = local_sctx.sifted_states[str_idx];
2148 err = sift_states_backward (mctx, &local_sctx);
2149 if (BE (err != REG_NOERROR, 0))
2150 goto free_return;
2151 if (sctx->limited_states != NULL)
2153 err = merge_state_array (dfa, sctx->limited_states,
2154 local_sctx.sifted_states,
2155 str_idx + 1);
2156 if (BE (err != REG_NOERROR, 0))
2157 goto free_return;
2159 local_sctx.sifted_states[str_idx] = cur_state;
2160 re_node_set_remove (&local_sctx.limits, enabled_idx);
2162 /* mctx->bkref_ents may have changed, reload the pointer. */
2163 entry = mctx->bkref_ents + enabled_idx;
2165 while (enabled_idx++, entry++->more);
2167 err = REG_NOERROR;
2168 free_return:
2169 if (local_sctx.sifted_states != NULL)
2171 re_node_set_free (&local_sctx.limits);
2174 return err;
2178 #ifdef RE_ENABLE_I18N
2179 static int
2180 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2181 const re_match_context_t *mctx;
2182 re_sift_context_t *sctx;
2183 int node_idx, str_idx, max_str_idx;
2185 re_dfa_t *const dfa = mctx->dfa;
2186 int naccepted;
2187 /* Check the node can accept `multi byte'. */
2188 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2189 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2190 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2191 dfa->nexts[node_idx]))
2192 /* The node can't accept the `multi byte', or the
2193 destination was already thrown away, then the node
2194 could't accept the current input `multi byte'. */
2195 naccepted = 0;
2196 /* Otherwise, it is sure that the node could accept
2197 `naccepted' bytes input. */
2198 return naccepted;
2200 #endif /* RE_ENABLE_I18N */
2203 /* Functions for state transition. */
2205 /* Return the next state to which the current state STATE will transit by
2206 accepting the current input byte, and update STATE_LOG if necessary.
2207 If STATE can accept a multibyte char/collating element/back reference
2208 update the destination of STATE_LOG. */
2210 static re_dfastate_t *
2211 transit_state (err, mctx, state)
2212 reg_errcode_t *err;
2213 re_match_context_t *mctx;
2214 re_dfastate_t *state;
2216 re_dfa_t *const dfa = mctx->dfa;
2217 re_dfastate_t **trtable;
2218 unsigned char ch;
2220 #ifdef RE_ENABLE_I18N
2221 /* If the current state can accept multibyte. */
2222 if (BE (state->accept_mb, 0))
2224 *err = transit_state_mb (mctx, state);
2225 if (BE (*err != REG_NOERROR, 0))
2226 return NULL;
2228 #endif /* RE_ENABLE_I18N */
2230 /* Then decide the next state with the single byte. */
2231 if (1)
2233 /* Use transition table */
2234 ch = re_string_fetch_byte (&mctx->input);
2235 trtable = state->trtable;
2236 if (trtable == NULL)
2238 trtable = build_trtable (dfa, state);
2239 if (trtable == NULL)
2241 *err = REG_ESPACE;
2242 return NULL;
2245 if (BE (state->word_trtable, 0))
2247 unsigned int context;
2248 context
2249 = re_string_context_at (&mctx->input,
2250 re_string_cur_idx (&mctx->input) - 1,
2251 mctx->eflags);
2252 if (IS_WORD_CONTEXT (context))
2253 return trtable[ch + SBC_MAX];
2254 else
2255 return trtable[ch];
2257 else
2258 return trtable[ch];
2260 #if 0
2261 else
2262 /* don't use transition table */
2263 return transit_state_sb (err, mctx, state);
2264 #endif
2267 /* Update the state_log if we need */
2268 re_dfastate_t *
2269 merge_state_with_log (err, mctx, next_state)
2270 reg_errcode_t *err;
2271 re_match_context_t *mctx;
2272 re_dfastate_t *next_state;
2274 re_dfa_t *const dfa = mctx->dfa;
2275 int cur_idx = re_string_cur_idx (&mctx->input);
2277 if (cur_idx > mctx->state_log_top)
2279 mctx->state_log[cur_idx] = next_state;
2280 mctx->state_log_top = cur_idx;
2282 else if (mctx->state_log[cur_idx] == 0)
2284 mctx->state_log[cur_idx] = next_state;
2286 else
2288 re_dfastate_t *pstate;
2289 unsigned int context;
2290 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2291 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2292 the destination of a multibyte char/collating element/
2293 back reference. Then the next state is the union set of
2294 these destinations and the results of the transition table. */
2295 pstate = mctx->state_log[cur_idx];
2296 log_nodes = pstate->entrance_nodes;
2297 if (next_state != NULL)
2299 table_nodes = next_state->entrance_nodes;
2300 *err = re_node_set_init_union (&next_nodes, table_nodes,
2301 log_nodes);
2302 if (BE (*err != REG_NOERROR, 0))
2303 return NULL;
2305 else
2306 next_nodes = *log_nodes;
2307 /* Note: We already add the nodes of the initial state,
2308 then we don't need to add them here. */
2310 context = re_string_context_at (&mctx->input,
2311 re_string_cur_idx (&mctx->input) - 1,
2312 mctx->eflags);
2313 next_state = mctx->state_log[cur_idx]
2314 = re_acquire_state_context (err, dfa, &next_nodes, context);
2315 /* We don't need to check errors here, since the return value of
2316 this function is next_state and ERR is already set. */
2318 if (table_nodes != NULL)
2319 re_node_set_free (&next_nodes);
2322 if (BE (dfa->nbackref, 0) && next_state != NULL)
2324 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2325 later. We must check them here, since the back references in the
2326 next state might use them. */
2327 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2328 cur_idx);
2329 if (BE (*err != REG_NOERROR, 0))
2330 return NULL;
2332 /* If the next state has back references. */
2333 if (next_state->has_backref)
2335 *err = transit_state_bkref (mctx, &next_state->nodes);
2336 if (BE (*err != REG_NOERROR, 0))
2337 return NULL;
2338 next_state = mctx->state_log[cur_idx];
2342 return next_state;
2345 /* Skip bytes in the input that correspond to part of a
2346 multi-byte match, then look in the log for a state
2347 from which to restart matching. */
2348 re_dfastate_t *
2349 find_recover_state (err, mctx)
2350 reg_errcode_t *err;
2351 re_match_context_t *mctx;
2353 re_dfastate_t *cur_state = NULL;
2356 int max = mctx->state_log_top;
2357 int cur_str_idx = re_string_cur_idx (&mctx->input);
2361 if (++cur_str_idx > max)
2362 return NULL;
2363 re_string_skip_bytes (&mctx->input, 1);
2365 while (mctx->state_log[cur_str_idx] == NULL);
2367 cur_state = merge_state_with_log (err, mctx, NULL);
2369 while (err == REG_NOERROR && cur_state == NULL);
2370 return cur_state;
2373 /* Helper functions for transit_state. */
2375 /* From the node set CUR_NODES, pick up the nodes whose types are
2376 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2377 expression. And register them to use them later for evaluating the
2378 correspoding back references. */
2380 static reg_errcode_t
2381 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2382 re_match_context_t *mctx;
2383 re_node_set *cur_nodes;
2384 int str_idx;
2386 re_dfa_t *const dfa = mctx->dfa;
2387 int node_idx;
2388 reg_errcode_t err;
2390 /* TODO: This isn't efficient.
2391 Because there might be more than one nodes whose types are
2392 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2393 nodes.
2394 E.g. RE: (a){2} */
2395 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2397 int node = cur_nodes->elems[node_idx];
2398 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2399 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2400 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2402 err = match_ctx_add_subtop (mctx, node, str_idx);
2403 if (BE (err != REG_NOERROR, 0))
2404 return err;
2407 return REG_NOERROR;
2410 #if 0
2411 /* Return the next state to which the current state STATE will transit by
2412 accepting the current input byte. */
2414 static re_dfastate_t *
2415 transit_state_sb (err, mctx, state)
2416 reg_errcode_t *err;
2417 re_match_context_t *mctx;
2418 re_dfastate_t *state;
2420 re_dfa_t *const dfa = mctx->dfa;
2421 re_node_set next_nodes;
2422 re_dfastate_t *next_state;
2423 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2424 unsigned int context;
2426 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2427 if (BE (*err != REG_NOERROR, 0))
2428 return NULL;
2429 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2431 int cur_node = state->nodes.elems[node_cnt];
2432 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2434 *err = re_node_set_merge (&next_nodes,
2435 dfa->eclosures + dfa->nexts[cur_node]);
2436 if (BE (*err != REG_NOERROR, 0))
2438 re_node_set_free (&next_nodes);
2439 return NULL;
2443 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2444 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2445 /* We don't need to check errors here, since the return value of
2446 this function is next_state and ERR is already set. */
2448 re_node_set_free (&next_nodes);
2449 re_string_skip_bytes (&mctx->input, 1);
2450 return next_state;
2452 #endif
2454 #ifdef RE_ENABLE_I18N
2455 static reg_errcode_t
2456 transit_state_mb (mctx, pstate)
2457 re_match_context_t *mctx;
2458 re_dfastate_t *pstate;
2460 re_dfa_t *const dfa = mctx->dfa;
2461 reg_errcode_t err;
2462 int i;
2464 for (i = 0; i < pstate->nodes.nelem; ++i)
2466 re_node_set dest_nodes, *new_nodes;
2467 int cur_node_idx = pstate->nodes.elems[i];
2468 int naccepted = 0, dest_idx;
2469 unsigned int context;
2470 re_dfastate_t *dest_state;
2472 if (dfa->nodes[cur_node_idx].constraint)
2474 context = re_string_context_at (&mctx->input,
2475 re_string_cur_idx (&mctx->input),
2476 mctx->eflags);
2477 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2478 context))
2479 continue;
2482 /* How many bytes the node can accept? */
2483 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2484 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2485 re_string_cur_idx (&mctx->input));
2486 if (naccepted == 0)
2487 continue;
2489 /* The node can accepts `naccepted' bytes. */
2490 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2491 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2492 : mctx->max_mb_elem_len);
2493 err = clean_state_log_if_needed (mctx, dest_idx);
2494 if (BE (err != REG_NOERROR, 0))
2495 return err;
2496 #ifdef DEBUG
2497 assert (dfa->nexts[cur_node_idx] != -1);
2498 #endif
2499 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2500 then we use pstate->nodes.elems[i] instead. */
2501 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2503 dest_state = mctx->state_log[dest_idx];
2504 if (dest_state == NULL)
2505 dest_nodes = *new_nodes;
2506 else
2508 err = re_node_set_init_union (&dest_nodes,
2509 dest_state->entrance_nodes, new_nodes);
2510 if (BE (err != REG_NOERROR, 0))
2511 return err;
2513 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2514 mctx->state_log[dest_idx]
2515 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2516 if (dest_state != NULL)
2517 re_node_set_free (&dest_nodes);
2518 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2519 return err;
2521 return REG_NOERROR;
2523 #endif /* RE_ENABLE_I18N */
2525 static reg_errcode_t
2526 transit_state_bkref (mctx, nodes)
2527 re_match_context_t *mctx;
2528 const re_node_set *nodes;
2530 re_dfa_t *const dfa = mctx->dfa;
2531 reg_errcode_t err;
2532 int i;
2533 int cur_str_idx = re_string_cur_idx (&mctx->input);
2535 for (i = 0; i < nodes->nelem; ++i)
2537 int dest_str_idx, prev_nelem, bkc_idx;
2538 int node_idx = nodes->elems[i];
2539 unsigned int context;
2540 const re_token_t *node = dfa->nodes + node_idx;
2541 re_node_set *new_dest_nodes;
2543 /* Check whether `node' is a backreference or not. */
2544 if (node->type != OP_BACK_REF)
2545 continue;
2547 if (node->constraint)
2549 context = re_string_context_at (&mctx->input, cur_str_idx,
2550 mctx->eflags);
2551 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2552 continue;
2555 /* `node' is a backreference.
2556 Check the substring which the substring matched. */
2557 bkc_idx = mctx->nbkref_ents;
2558 err = get_subexp (mctx, node_idx, cur_str_idx);
2559 if (BE (err != REG_NOERROR, 0))
2560 goto free_return;
2562 /* And add the epsilon closures (which is `new_dest_nodes') of
2563 the backreference to appropriate state_log. */
2564 #ifdef DEBUG
2565 assert (dfa->nexts[node_idx] != -1);
2566 #endif
2567 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2569 int subexp_len;
2570 re_dfastate_t *dest_state;
2571 struct re_backref_cache_entry *bkref_ent;
2572 bkref_ent = mctx->bkref_ents + bkc_idx;
2573 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2574 continue;
2575 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2576 new_dest_nodes = (subexp_len == 0
2577 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2578 : dfa->eclosures + dfa->nexts[node_idx]);
2579 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2580 - bkref_ent->subexp_from);
2581 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2582 mctx->eflags);
2583 dest_state = mctx->state_log[dest_str_idx];
2584 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2585 : mctx->state_log[cur_str_idx]->nodes.nelem);
2586 /* Add `new_dest_node' to state_log. */
2587 if (dest_state == NULL)
2589 mctx->state_log[dest_str_idx]
2590 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2591 context);
2592 if (BE (mctx->state_log[dest_str_idx] == NULL
2593 && err != REG_NOERROR, 0))
2594 goto free_return;
2596 else
2598 re_node_set dest_nodes;
2599 err = re_node_set_init_union (&dest_nodes,
2600 dest_state->entrance_nodes,
2601 new_dest_nodes);
2602 if (BE (err != REG_NOERROR, 0))
2604 re_node_set_free (&dest_nodes);
2605 goto free_return;
2607 mctx->state_log[dest_str_idx]
2608 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2609 re_node_set_free (&dest_nodes);
2610 if (BE (mctx->state_log[dest_str_idx] == NULL
2611 && err != REG_NOERROR, 0))
2612 goto free_return;
2614 /* We need to check recursively if the backreference can epsilon
2615 transit. */
2616 if (subexp_len == 0
2617 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2619 err = check_subexp_matching_top (mctx, new_dest_nodes,
2620 cur_str_idx);
2621 if (BE (err != REG_NOERROR, 0))
2622 goto free_return;
2623 err = transit_state_bkref (mctx, new_dest_nodes);
2624 if (BE (err != REG_NOERROR, 0))
2625 goto free_return;
2629 err = REG_NOERROR;
2630 free_return:
2631 return err;
2634 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2635 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2636 Note that we might collect inappropriate candidates here.
2637 However, the cost of checking them strictly here is too high, then we
2638 delay these checking for prune_impossible_nodes(). */
2640 static reg_errcode_t
2641 get_subexp (mctx, bkref_node, bkref_str_idx)
2642 re_match_context_t *mctx;
2643 int bkref_node, bkref_str_idx;
2645 re_dfa_t *const dfa = mctx->dfa;
2646 int subexp_num, sub_top_idx;
2647 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2648 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2649 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2650 if (cache_idx != -1)
2652 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2654 if (entry->node == bkref_node)
2655 return REG_NOERROR; /* We already checked it. */
2656 while (entry++->more);
2659 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2661 /* For each sub expression */
2662 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2664 reg_errcode_t err;
2665 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2666 re_sub_match_last_t *sub_last;
2667 int sub_last_idx, sl_str, bkref_str_off;
2669 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2670 continue; /* It isn't related. */
2672 sl_str = sub_top->str_idx;
2673 bkref_str_off = bkref_str_idx;
2674 /* At first, check the last node of sub expressions we already
2675 evaluated. */
2676 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2678 int sl_str_diff;
2679 sub_last = sub_top->lasts[sub_last_idx];
2680 sl_str_diff = sub_last->str_idx - sl_str;
2681 /* The matched string by the sub expression match with the substring
2682 at the back reference? */
2683 if (sl_str_diff > 0)
2685 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2687 /* Not enough chars for a successful match. */
2688 if (bkref_str_off + sl_str_diff > mctx->input.len)
2689 break;
2691 err = clean_state_log_if_needed (mctx,
2692 bkref_str_off
2693 + sl_str_diff);
2694 if (BE (err != REG_NOERROR, 0))
2695 return err;
2696 buf = (const char *) re_string_get_buffer (&mctx->input);
2698 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2699 break; /* We don't need to search this sub expression any more. */
2701 bkref_str_off += sl_str_diff;
2702 sl_str += sl_str_diff;
2703 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2704 bkref_str_idx);
2706 /* Reload buf, since the preceding call might have reallocated
2707 the buffer. */
2708 buf = (const char *) re_string_get_buffer (&mctx->input);
2710 if (err == REG_NOMATCH)
2711 continue;
2712 if (BE (err != REG_NOERROR, 0))
2713 return err;
2716 if (sub_last_idx < sub_top->nlasts)
2717 continue;
2718 if (sub_last_idx > 0)
2719 ++sl_str;
2720 /* Then, search for the other last nodes of the sub expression. */
2721 for (; sl_str <= bkref_str_idx; ++sl_str)
2723 int cls_node, sl_str_off;
2724 const re_node_set *nodes;
2725 sl_str_off = sl_str - sub_top->str_idx;
2726 /* The matched string by the sub expression match with the substring
2727 at the back reference? */
2728 if (sl_str_off > 0)
2730 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2732 /* If we are at the end of the input, we cannot match. */
2733 if (bkref_str_off >= mctx->input.len)
2734 break;
2736 err = extend_buffers (mctx);
2737 if (BE (err != REG_NOERROR, 0))
2738 return err;
2740 buf = (const char *) re_string_get_buffer (&mctx->input);
2742 if (buf [bkref_str_off++] != buf[sl_str - 1])
2743 break; /* We don't need to search this sub expression
2744 any more. */
2746 if (mctx->state_log[sl_str] == NULL)
2747 continue;
2748 /* Does this state have a ')' of the sub expression? */
2749 nodes = &mctx->state_log[sl_str]->nodes;
2750 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2751 if (cls_node == -1)
2752 continue; /* No. */
2753 if (sub_top->path == NULL)
2755 sub_top->path = calloc (sizeof (state_array_t),
2756 sl_str - sub_top->str_idx + 1);
2757 if (sub_top->path == NULL)
2758 return REG_ESPACE;
2760 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2761 in the current context? */
2762 err = check_arrival (mctx, sub_top->path, sub_top->node,
2763 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2764 if (err == REG_NOMATCH)
2765 continue;
2766 if (BE (err != REG_NOERROR, 0))
2767 return err;
2768 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2769 if (BE (sub_last == NULL, 0))
2770 return REG_ESPACE;
2771 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2772 bkref_str_idx);
2773 if (err == REG_NOMATCH)
2774 continue;
2777 return REG_NOERROR;
2780 /* Helper functions for get_subexp(). */
2782 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2783 If it can arrive, register the sub expression expressed with SUB_TOP
2784 and SUB_LAST. */
2786 static reg_errcode_t
2787 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2788 re_match_context_t *mctx;
2789 const re_sub_match_top_t *sub_top;
2790 re_sub_match_last_t *sub_last;
2791 int bkref_node, bkref_str;
2793 reg_errcode_t err;
2794 int to_idx;
2795 /* Can the subexpression arrive the back reference? */
2796 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2797 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2798 if (err != REG_NOERROR)
2799 return err;
2800 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2801 sub_last->str_idx);
2802 if (BE (err != REG_NOERROR, 0))
2803 return err;
2804 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2805 return clean_state_log_if_needed (mctx, to_idx);
2808 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2809 Search '(' if FL_OPEN, or search ')' otherwise.
2810 TODO: This function isn't efficient...
2811 Because there might be more than one nodes whose types are
2812 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2813 nodes.
2814 E.g. RE: (a){2} */
2816 static int
2817 find_subexp_node (dfa, nodes, subexp_idx, type)
2818 const re_dfa_t *dfa;
2819 const re_node_set *nodes;
2820 int subexp_idx, type;
2822 int cls_idx;
2823 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2825 int cls_node = nodes->elems[cls_idx];
2826 const re_token_t *node = dfa->nodes + cls_node;
2827 if (node->type == type
2828 && node->opr.idx == subexp_idx)
2829 return cls_node;
2831 return -1;
2834 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2835 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2836 heavily reused.
2837 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2839 static reg_errcode_t
2840 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2841 type)
2842 re_match_context_t *mctx;
2843 state_array_t *path;
2844 int top_node, top_str, last_node, last_str, type;
2846 re_dfa_t *const dfa = mctx->dfa;
2847 reg_errcode_t err;
2848 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2849 re_dfastate_t *cur_state = NULL;
2850 re_node_set *cur_nodes, next_nodes;
2851 re_dfastate_t **backup_state_log;
2852 unsigned int context;
2854 subexp_num = dfa->nodes[top_node].opr.idx;
2855 /* Extend the buffer if we need. */
2856 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858 re_dfastate_t **new_array;
2859 int old_alloc = path->alloc;
2860 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2861 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2862 if (new_array == NULL)
2864 path->alloc = old_alloc;
2865 return REG_ESPACE;
2867 path->array = new_array;
2868 memset (new_array + old_alloc, '\0',
2869 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2872 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2874 /* Temporary modify MCTX. */
2875 backup_state_log = mctx->state_log;
2876 backup_cur_idx = mctx->input.cur_idx;
2877 mctx->state_log = path->array;
2878 mctx->input.cur_idx = str_idx;
2880 /* Setup initial node set. */
2881 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2882 if (str_idx == top_str)
2884 err = re_node_set_init_1 (&next_nodes, top_node);
2885 if (BE (err != REG_NOERROR, 0))
2886 return err;
2887 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2888 if (BE (err != REG_NOERROR, 0))
2890 re_node_set_free (&next_nodes);
2891 return err;
2894 else
2896 cur_state = mctx->state_log[str_idx];
2897 if (cur_state && cur_state->has_backref)
2899 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2900 if (BE ( err != REG_NOERROR, 0))
2901 return err;
2903 else
2904 re_node_set_init_empty (&next_nodes);
2906 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2908 if (next_nodes.nelem)
2910 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2911 subexp_num, type);
2912 if (BE ( err != REG_NOERROR, 0))
2914 re_node_set_free (&next_nodes);
2915 return err;
2918 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2919 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2921 re_node_set_free (&next_nodes);
2922 return err;
2924 mctx->state_log[str_idx] = cur_state;
2927 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2929 re_node_set_empty (&next_nodes);
2930 if (mctx->state_log[str_idx + 1])
2932 err = re_node_set_merge (&next_nodes,
2933 &mctx->state_log[str_idx + 1]->nodes);
2934 if (BE (err != REG_NOERROR, 0))
2936 re_node_set_free (&next_nodes);
2937 return err;
2940 if (cur_state)
2942 err = check_arrival_add_next_nodes (mctx, str_idx,
2943 &cur_state->nodes, &next_nodes);
2944 if (BE (err != REG_NOERROR, 0))
2946 re_node_set_free (&next_nodes);
2947 return err;
2950 ++str_idx;
2951 if (next_nodes.nelem)
2953 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954 if (BE (err != REG_NOERROR, 0))
2956 re_node_set_free (&next_nodes);
2957 return err;
2959 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2960 subexp_num, type);
2961 if (BE ( err != REG_NOERROR, 0))
2963 re_node_set_free (&next_nodes);
2964 return err;
2967 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2968 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2969 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2971 re_node_set_free (&next_nodes);
2972 return err;
2974 mctx->state_log[str_idx] = cur_state;
2975 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2977 re_node_set_free (&next_nodes);
2978 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2979 : &mctx->state_log[last_str]->nodes);
2980 path->next_idx = str_idx;
2982 /* Fix MCTX. */
2983 mctx->state_log = backup_state_log;
2984 mctx->input.cur_idx = backup_cur_idx;
2986 /* Then check the current node set has the node LAST_NODE. */
2987 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2988 return REG_NOERROR;
2990 return REG_NOMATCH;
2993 /* Helper functions for check_arrival. */
2995 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2996 to NEXT_NODES.
2997 TODO: This function is similar to the functions transit_state*(),
2998 however this function has many additional works.
2999 Can't we unify them? */
3001 static reg_errcode_t
3002 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3003 re_match_context_t *mctx;
3004 int str_idx;
3005 re_node_set *cur_nodes, *next_nodes;
3007 re_dfa_t *const dfa = mctx->dfa;
3008 int result;
3009 int cur_idx;
3010 reg_errcode_t err;
3011 re_node_set union_set;
3012 re_node_set_init_empty (&union_set);
3013 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3015 int naccepted = 0;
3016 int cur_node = cur_nodes->elems[cur_idx];
3017 re_token_type_t type = dfa->nodes[cur_node].type;
3018 if (IS_EPSILON_NODE (type))
3019 continue;
3020 #ifdef RE_ENABLE_I18N
3021 /* If the node may accept `multi byte'. */
3022 if (ACCEPT_MB_NODE (type))
3024 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3025 str_idx);
3026 if (naccepted > 1)
3028 re_dfastate_t *dest_state;
3029 int next_node = dfa->nexts[cur_node];
3030 int next_idx = str_idx + naccepted;
3031 dest_state = mctx->state_log[next_idx];
3032 re_node_set_empty (&union_set);
3033 if (dest_state)
3035 err = re_node_set_merge (&union_set, &dest_state->nodes);
3036 if (BE (err != REG_NOERROR, 0))
3038 re_node_set_free (&union_set);
3039 return err;
3042 result = re_node_set_insert (&union_set, next_node);
3043 if (BE (result < 0, 0))
3045 re_node_set_free (&union_set);
3046 return REG_ESPACE;
3048 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3049 &union_set);
3050 if (BE (mctx->state_log[next_idx] == NULL
3051 && err != REG_NOERROR, 0))
3053 re_node_set_free (&union_set);
3054 return err;
3058 #endif /* RE_ENABLE_I18N */
3059 if (naccepted
3060 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3062 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3063 if (BE (result < 0, 0))
3065 re_node_set_free (&union_set);
3066 return REG_ESPACE;
3070 re_node_set_free (&union_set);
3071 return REG_NOERROR;
3074 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3075 CUR_NODES, however exclude the nodes which are:
3076 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3077 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3080 static reg_errcode_t
3081 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3082 re_dfa_t *dfa;
3083 re_node_set *cur_nodes;
3084 int ex_subexp, type;
3086 reg_errcode_t err;
3087 int idx, outside_node;
3088 re_node_set new_nodes;
3089 #ifdef DEBUG
3090 assert (cur_nodes->nelem);
3091 #endif
3092 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3093 if (BE (err != REG_NOERROR, 0))
3094 return err;
3095 /* Create a new node set NEW_NODES with the nodes which are epsilon
3096 closures of the node in CUR_NODES. */
3098 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3100 int cur_node = cur_nodes->elems[idx];
3101 re_node_set *eclosure = dfa->eclosures + cur_node;
3102 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3103 if (outside_node == -1)
3105 /* There are no problematic nodes, just merge them. */
3106 err = re_node_set_merge (&new_nodes, eclosure);
3107 if (BE (err != REG_NOERROR, 0))
3109 re_node_set_free (&new_nodes);
3110 return err;
3113 else
3115 /* There are problematic nodes, re-calculate incrementally. */
3116 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3117 ex_subexp, type);
3118 if (BE (err != REG_NOERROR, 0))
3120 re_node_set_free (&new_nodes);
3121 return err;
3125 re_node_set_free (cur_nodes);
3126 *cur_nodes = new_nodes;
3127 return REG_NOERROR;
3130 /* Helper function for check_arrival_expand_ecl.
3131 Check incrementally the epsilon closure of TARGET, and if it isn't
3132 problematic append it to DST_NODES. */
3134 static reg_errcode_t
3135 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3136 re_dfa_t *dfa;
3137 int target, ex_subexp, type;
3138 re_node_set *dst_nodes;
3140 int cur_node;
3141 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3143 int err;
3145 if (dfa->nodes[cur_node].type == type
3146 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3148 if (type == OP_CLOSE_SUBEXP)
3150 err = re_node_set_insert (dst_nodes, cur_node);
3151 if (BE (err == -1, 0))
3152 return REG_ESPACE;
3154 break;
3156 err = re_node_set_insert (dst_nodes, cur_node);
3157 if (BE (err == -1, 0))
3158 return REG_ESPACE;
3159 if (dfa->edests[cur_node].nelem == 0)
3160 break;
3161 if (dfa->edests[cur_node].nelem == 2)
3163 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3164 dfa->edests[cur_node].elems[1],
3165 ex_subexp, type);
3166 if (BE (err != REG_NOERROR, 0))
3167 return err;
3169 cur_node = dfa->edests[cur_node].elems[0];
3171 return REG_NOERROR;
3175 /* For all the back references in the current state, calculate the
3176 destination of the back references by the appropriate entry
3177 in MCTX->BKREF_ENTS. */
3179 static reg_errcode_t
3180 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3181 type)
3182 re_match_context_t *mctx;
3183 int cur_str, subexp_num, type;
3184 re_node_set *cur_nodes;
3186 re_dfa_t *const dfa = mctx->dfa;
3187 reg_errcode_t err;
3188 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3189 struct re_backref_cache_entry *ent;
3191 if (cache_idx_start == -1)
3192 return REG_NOERROR;
3194 restart:
3195 ent = mctx->bkref_ents + cache_idx_start;
3198 int to_idx, next_node;
3200 /* Is this entry ENT is appropriate? */
3201 if (!re_node_set_contains (cur_nodes, ent->node))
3202 continue; /* No. */
3204 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3205 /* Calculate the destination of the back reference, and append it
3206 to MCTX->STATE_LOG. */
3207 if (to_idx == cur_str)
3209 /* The backreference did epsilon transit, we must re-check all the
3210 node in the current state. */
3211 re_node_set new_dests;
3212 reg_errcode_t err2, err3;
3213 next_node = dfa->edests[ent->node].elems[0];
3214 if (re_node_set_contains (cur_nodes, next_node))
3215 continue;
3216 err = re_node_set_init_1 (&new_dests, next_node);
3217 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3218 err3 = re_node_set_merge (cur_nodes, &new_dests);
3219 re_node_set_free (&new_dests);
3220 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3221 || err3 != REG_NOERROR, 0))
3223 err = (err != REG_NOERROR ? err
3224 : (err2 != REG_NOERROR ? err2 : err3));
3225 return err;
3227 /* TODO: It is still inefficient... */
3228 goto restart;
3230 else
3232 re_node_set union_set;
3233 next_node = dfa->nexts[ent->node];
3234 if (mctx->state_log[to_idx])
3236 int ret;
3237 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3238 next_node))
3239 continue;
3240 err = re_node_set_init_copy (&union_set,
3241 &mctx->state_log[to_idx]->nodes);
3242 ret = re_node_set_insert (&union_set, next_node);
3243 if (BE (err != REG_NOERROR || ret < 0, 0))
3245 re_node_set_free (&union_set);
3246 err = err != REG_NOERROR ? err : REG_ESPACE;
3247 return err;
3250 else
3252 err = re_node_set_init_1 (&union_set, next_node);
3253 if (BE (err != REG_NOERROR, 0))
3254 return err;
3256 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3257 re_node_set_free (&union_set);
3258 if (BE (mctx->state_log[to_idx] == NULL
3259 && err != REG_NOERROR, 0))
3260 return err;
3263 while (ent++->more);
3264 return REG_NOERROR;
3267 /* Build transition table for the state.
3268 Return the new table if succeeded, otherwise return NULL. */
3270 static re_dfastate_t **
3271 build_trtable (dfa, state)
3272 re_dfa_t *dfa;
3273 re_dfastate_t *state;
3275 reg_errcode_t err;
3276 int i, j, ch;
3277 unsigned int elem, mask;
3278 int dests_node_malloced = 0, dest_states_malloced = 0;
3279 int ndests; /* Number of the destination states from `state'. */
3280 re_dfastate_t **trtable;
3281 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3282 re_node_set follows, *dests_node;
3283 bitset *dests_ch;
3284 bitset acceptable;
3286 /* We build DFA states which corresponds to the destination nodes
3287 from `state'. `dests_node[i]' represents the nodes which i-th
3288 destination state contains, and `dests_ch[i]' represents the
3289 characters which i-th destination state accepts. */
3290 #ifdef _LIBC
3291 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3292 dests_node = (re_node_set *)
3293 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3294 else
3295 #endif
3297 dests_node = (re_node_set *)
3298 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3299 if (BE (dests_node == NULL, 0))
3300 return NULL;
3301 dests_node_malloced = 1;
3303 dests_ch = (bitset *) (dests_node + SBC_MAX);
3305 /* Initialize transiton table. */
3306 state->word_trtable = 0;
3308 /* At first, group all nodes belonging to `state' into several
3309 destinations. */
3310 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3311 if (BE (ndests <= 0, 0))
3313 if (dests_node_malloced)
3314 free (dests_node);
3315 /* Return NULL in case of an error, trtable otherwise. */
3316 if (ndests == 0)
3318 state->trtable = (re_dfastate_t **)
3319 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3320 return state->trtable;
3322 return NULL;
3325 err = re_node_set_alloc (&follows, ndests + 1);
3326 if (BE (err != REG_NOERROR, 0))
3327 goto out_free;
3329 #ifdef _LIBC
3330 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3331 + ndests * 3 * sizeof (re_dfastate_t *)))
3332 dest_states = (re_dfastate_t **)
3333 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3334 else
3335 #endif
3337 dest_states = (re_dfastate_t **)
3338 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3339 if (BE (dest_states == NULL, 0))
3341 out_free:
3342 if (dest_states_malloced)
3343 free (dest_states);
3344 re_node_set_free (&follows);
3345 for (i = 0; i < ndests; ++i)
3346 re_node_set_free (dests_node + i);
3347 if (dests_node_malloced)
3348 free (dests_node);
3349 return NULL;
3351 dest_states_malloced = 1;
3353 dest_states_word = dest_states + ndests;
3354 dest_states_nl = dest_states_word + ndests;
3355 bitset_empty (acceptable);
3357 /* Then build the states for all destinations. */
3358 for (i = 0; i < ndests; ++i)
3360 int next_node;
3361 re_node_set_empty (&follows);
3362 /* Merge the follows of this destination states. */
3363 for (j = 0; j < dests_node[i].nelem; ++j)
3365 next_node = dfa->nexts[dests_node[i].elems[j]];
3366 if (next_node != -1)
3368 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3369 if (BE (err != REG_NOERROR, 0))
3370 goto out_free;
3373 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3374 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3375 goto out_free;
3376 /* If the new state has context constraint,
3377 build appropriate states for these contexts. */
3378 if (dest_states[i]->has_constraint)
3380 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3381 CONTEXT_WORD);
3382 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3383 goto out_free;
3385 if (dest_states[i] != dest_states_word[i]
3386 && dfa->mb_cur_max > 1)
3387 state->word_trtable = 1;
3389 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3390 CONTEXT_NEWLINE);
3391 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3392 goto out_free;
3394 else
3396 dest_states_word[i] = dest_states[i];
3397 dest_states_nl[i] = dest_states[i];
3399 bitset_merge (acceptable, dests_ch[i]);
3402 if (!BE (state->word_trtable, 0))
3404 /* We don't care about whether the following character is a word
3405 character, or we are in a single-byte character set so we can
3406 discern by looking at the character code: allocate a
3407 256-entry transition table. */
3408 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3409 if (BE (trtable == NULL, 0))
3410 goto out_free;
3412 /* For all characters ch...: */
3413 for (i = 0; i < BITSET_UINTS; ++i)
3414 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3415 elem;
3416 mask <<= 1, elem >>= 1, ++ch)
3417 if (BE (elem & 1, 0))
3419 /* There must be exactly one destination which accepts
3420 character ch. See group_nodes_into_DFAstates. */
3421 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3424 /* j-th destination accepts the word character ch. */
3425 if (dfa->word_char[i] & mask)
3426 trtable[ch] = dest_states_word[j];
3427 else
3428 trtable[ch] = dest_states[j];
3431 else
3433 /* We care about whether the following character is a word
3434 character, and we are in a multi-byte character set: discern
3435 by looking at the character code: build two 256-entry
3436 transition tables, one starting at trtable[0] and one
3437 starting at trtable[SBC_MAX]. */
3438 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3439 2 * SBC_MAX);
3440 if (BE (trtable == NULL, 0))
3441 goto out_free;
3443 /* For all characters ch...: */
3444 for (i = 0; i < BITSET_UINTS; ++i)
3445 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3446 elem;
3447 mask <<= 1, elem >>= 1, ++ch)
3448 if (BE (elem & 1, 0))
3450 /* There must be exactly one destination which accepts
3451 character ch. See group_nodes_into_DFAstates. */
3452 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3455 /* j-th destination accepts the word character ch. */
3456 trtable[ch] = dest_states[j];
3457 trtable[ch + SBC_MAX] = dest_states_word[j];
3461 /* new line */
3462 if (bitset_contain (acceptable, NEWLINE_CHAR))
3464 /* The current state accepts newline character. */
3465 for (j = 0; j < ndests; ++j)
3466 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3468 /* k-th destination accepts newline character. */
3469 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3470 if (state->word_trtable)
3471 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3472 /* There must be only one destination which accepts
3473 newline. See group_nodes_into_DFAstates. */
3474 break;
3478 if (dest_states_malloced)
3479 free (dest_states);
3481 re_node_set_free (&follows);
3482 for (i = 0; i < ndests; ++i)
3483 re_node_set_free (dests_node + i);
3485 if (dests_node_malloced)
3486 free (dests_node);
3488 state->trtable = trtable;
3489 return trtable;
3492 /* Group all nodes belonging to STATE into several destinations.
3493 Then for all destinations, set the nodes belonging to the destination
3494 to DESTS_NODE[i] and set the characters accepted by the destination
3495 to DEST_CH[i]. This function return the number of destinations. */
3497 static int
3498 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3499 re_dfa_t *dfa;
3500 const re_dfastate_t *state;
3501 re_node_set *dests_node;
3502 bitset *dests_ch;
3504 reg_errcode_t err;
3505 int result;
3506 int i, j, k;
3507 int ndests; /* Number of the destinations from `state'. */
3508 bitset accepts; /* Characters a node can accept. */
3509 const re_node_set *cur_nodes = &state->nodes;
3510 bitset_empty (accepts);
3511 ndests = 0;
3513 /* For all the nodes belonging to `state', */
3514 for (i = 0; i < cur_nodes->nelem; ++i)
3516 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3517 re_token_type_t type = node->type;
3518 unsigned int constraint = node->constraint;
3520 /* Enumerate all single byte character this node can accept. */
3521 if (type == CHARACTER)
3522 bitset_set (accepts, node->opr.c);
3523 else if (type == SIMPLE_BRACKET)
3525 bitset_merge (accepts, node->opr.sbcset);
3527 else if (type == OP_PERIOD)
3529 #ifdef RE_ENABLE_I18N
3530 if (dfa->mb_cur_max > 1)
3531 bitset_merge (accepts, dfa->sb_char);
3532 else
3533 #endif
3534 bitset_set_all (accepts);
3535 if (!(dfa->syntax & RE_DOT_NEWLINE))
3536 bitset_clear (accepts, '\n');
3537 if (dfa->syntax & RE_DOT_NOT_NULL)
3538 bitset_clear (accepts, '\0');
3540 #ifdef RE_ENABLE_I18N
3541 else if (type == OP_UTF8_PERIOD)
3543 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3544 if (!(dfa->syntax & RE_DOT_NEWLINE))
3545 bitset_clear (accepts, '\n');
3546 if (dfa->syntax & RE_DOT_NOT_NULL)
3547 bitset_clear (accepts, '\0');
3549 #endif
3550 else
3551 continue;
3553 /* Check the `accepts' and sift the characters which are not
3554 match it the context. */
3555 if (constraint)
3557 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3559 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3560 bitset_empty (accepts);
3561 if (accepts_newline)
3562 bitset_set (accepts, NEWLINE_CHAR);
3563 else
3564 continue;
3566 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3568 bitset_empty (accepts);
3569 continue;
3572 if (constraint & NEXT_WORD_CONSTRAINT)
3574 unsigned int any_set = 0;
3575 if (type == CHARACTER && !node->word_char)
3577 bitset_empty (accepts);
3578 continue;
3580 #ifdef RE_ENABLE_I18N
3581 if (dfa->mb_cur_max > 1)
3582 for (j = 0; j < BITSET_UINTS; ++j)
3583 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3584 else
3585 #endif
3586 for (j = 0; j < BITSET_UINTS; ++j)
3587 any_set |= (accepts[j] &= dfa->word_char[j]);
3588 if (!any_set)
3589 continue;
3591 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3593 unsigned int any_set = 0;
3594 if (type == CHARACTER && node->word_char)
3596 bitset_empty (accepts);
3597 continue;
3599 #ifdef RE_ENABLE_I18N
3600 if (dfa->mb_cur_max > 1)
3601 for (j = 0; j < BITSET_UINTS; ++j)
3602 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3603 else
3604 #endif
3605 for (j = 0; j < BITSET_UINTS; ++j)
3606 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3607 if (!any_set)
3608 continue;
3612 /* Then divide `accepts' into DFA states, or create a new
3613 state. Above, we make sure that accepts is not empty. */
3614 for (j = 0; j < ndests; ++j)
3616 bitset intersec; /* Intersection sets, see below. */
3617 bitset remains;
3618 /* Flags, see below. */
3619 int has_intersec, not_subset, not_consumed;
3621 /* Optimization, skip if this state doesn't accept the character. */
3622 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3623 continue;
3625 /* Enumerate the intersection set of this state and `accepts'. */
3626 has_intersec = 0;
3627 for (k = 0; k < BITSET_UINTS; ++k)
3628 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3629 /* And skip if the intersection set is empty. */
3630 if (!has_intersec)
3631 continue;
3633 /* Then check if this state is a subset of `accepts'. */
3634 not_subset = not_consumed = 0;
3635 for (k = 0; k < BITSET_UINTS; ++k)
3637 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3638 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3641 /* If this state isn't a subset of `accepts', create a
3642 new group state, which has the `remains'. */
3643 if (not_subset)
3645 bitset_copy (dests_ch[ndests], remains);
3646 bitset_copy (dests_ch[j], intersec);
3647 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3648 if (BE (err != REG_NOERROR, 0))
3649 goto error_return;
3650 ++ndests;
3653 /* Put the position in the current group. */
3654 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3655 if (BE (result < 0, 0))
3656 goto error_return;
3658 /* If all characters are consumed, go to next node. */
3659 if (!not_consumed)
3660 break;
3662 /* Some characters remain, create a new group. */
3663 if (j == ndests)
3665 bitset_copy (dests_ch[ndests], accepts);
3666 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3667 if (BE (err != REG_NOERROR, 0))
3668 goto error_return;
3669 ++ndests;
3670 bitset_empty (accepts);
3673 return ndests;
3674 error_return:
3675 for (j = 0; j < ndests; ++j)
3676 re_node_set_free (dests_node + j);
3677 return -1;
3680 #ifdef RE_ENABLE_I18N
3681 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3682 Return the number of the bytes the node accepts.
3683 STR_IDX is the current index of the input string.
3685 This function handles the nodes which can accept one character, or
3686 one collating element like '.', '[a-z]', opposite to the other nodes
3687 can only accept one byte. */
3689 static int
3690 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3691 re_dfa_t *dfa;
3692 int node_idx, str_idx;
3693 const re_string_t *input;
3695 const re_token_t *node = dfa->nodes + node_idx;
3696 int char_len, elem_len;
3697 int i;
3699 if (BE (node->type == OP_UTF8_PERIOD, 0))
3701 unsigned char c = re_string_byte_at (input, str_idx), d;
3702 if (BE (c < 0xc2, 1))
3703 return 0;
3705 if (str_idx + 2 > input->len)
3706 return 0;
3708 d = re_string_byte_at (input, str_idx + 1);
3709 if (c < 0xe0)
3710 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3711 else if (c < 0xf0)
3713 char_len = 3;
3714 if (c == 0xe0 && d < 0xa0)
3715 return 0;
3717 else if (c < 0xf8)
3719 char_len = 4;
3720 if (c == 0xf0 && d < 0x90)
3721 return 0;
3723 else if (c < 0xfc)
3725 char_len = 5;
3726 if (c == 0xf8 && d < 0x88)
3727 return 0;
3729 else if (c < 0xfe)
3731 char_len = 6;
3732 if (c == 0xfc && d < 0x84)
3733 return 0;
3735 else
3736 return 0;
3738 if (str_idx + char_len > input->len)
3739 return 0;
3741 for (i = 1; i < char_len; ++i)
3743 d = re_string_byte_at (input, str_idx + i);
3744 if (d < 0x80 || d > 0xbf)
3745 return 0;
3747 return char_len;
3750 char_len = re_string_char_size_at (input, str_idx);
3751 if (node->type == OP_PERIOD)
3753 if (char_len <= 1)
3754 return 0;
3755 /* FIXME: I don't think this if is needed, as both '\n'
3756 and '\0' are char_len == 1. */
3757 /* '.' accepts any one character except the following two cases. */
3758 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3759 re_string_byte_at (input, str_idx) == '\n') ||
3760 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3761 re_string_byte_at (input, str_idx) == '\0'))
3762 return 0;
3763 return char_len;
3766 elem_len = re_string_elem_size_at (input, str_idx);
3767 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3768 return 0;
3770 if (node->type == COMPLEX_BRACKET)
3772 const re_charset_t *cset = node->opr.mbcset;
3773 # ifdef _LIBC
3774 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3775 + str_idx);
3776 int j;
3777 uint32_t nrules;
3778 # endif /* _LIBC */
3779 int match_len = 0;
3780 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3781 ? re_string_wchar_at (input, str_idx) : 0);
3783 /* match with multibyte character? */
3784 for (i = 0; i < cset->nmbchars; ++i)
3785 if (wc == cset->mbchars[i])
3787 match_len = char_len;
3788 goto check_node_accept_bytes_match;
3790 /* match with character_class? */
3791 for (i = 0; i < cset->nchar_classes; ++i)
3793 wctype_t wt = cset->char_classes[i];
3794 if (__iswctype (wc, wt))
3796 match_len = char_len;
3797 goto check_node_accept_bytes_match;
3801 # ifdef _LIBC
3802 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3803 if (nrules != 0)
3805 unsigned int in_collseq = 0;
3806 const int32_t *table, *indirect;
3807 const unsigned char *weights, *extra;
3808 const char *collseqwc;
3809 int32_t idx;
3810 /* This #include defines a local function! */
3811 # include <locale/weight.h>
3813 /* match with collating_symbol? */
3814 if (cset->ncoll_syms)
3815 extra = (const unsigned char *)
3816 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3817 for (i = 0; i < cset->ncoll_syms; ++i)
3819 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3820 /* Compare the length of input collating element and
3821 the length of current collating element. */
3822 if (*coll_sym != elem_len)
3823 continue;
3824 /* Compare each bytes. */
3825 for (j = 0; j < *coll_sym; j++)
3826 if (pin[j] != coll_sym[1 + j])
3827 break;
3828 if (j == *coll_sym)
3830 /* Match if every bytes is equal. */
3831 match_len = j;
3832 goto check_node_accept_bytes_match;
3836 if (cset->nranges)
3838 if (elem_len <= char_len)
3840 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3841 in_collseq = __collseq_table_lookup (collseqwc, wc);
3843 else
3844 in_collseq = find_collation_sequence_value (pin, elem_len);
3846 /* match with range expression? */
3847 for (i = 0; i < cset->nranges; ++i)
3848 if (cset->range_starts[i] <= in_collseq
3849 && in_collseq <= cset->range_ends[i])
3851 match_len = elem_len;
3852 goto check_node_accept_bytes_match;
3855 /* match with equivalence_class? */
3856 if (cset->nequiv_classes)
3858 const unsigned char *cp = pin;
3859 table = (const int32_t *)
3860 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3861 weights = (const unsigned char *)
3862 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3863 extra = (const unsigned char *)
3864 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3865 indirect = (const int32_t *)
3866 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3867 idx = findidx (&cp);
3868 if (idx > 0)
3869 for (i = 0; i < cset->nequiv_classes; ++i)
3871 int32_t equiv_class_idx = cset->equiv_classes[i];
3872 size_t weight_len = weights[idx];
3873 if (weight_len == weights[equiv_class_idx])
3875 int cnt = 0;
3876 while (cnt <= weight_len
3877 && (weights[equiv_class_idx + 1 + cnt]
3878 == weights[idx + 1 + cnt]))
3879 ++cnt;
3880 if (cnt > weight_len)
3882 match_len = elem_len;
3883 goto check_node_accept_bytes_match;
3889 else
3890 # endif /* _LIBC */
3892 /* match with range expression? */
3893 #if __GNUC__ >= 2
3894 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3895 #else
3896 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3897 cmp_buf[2] = wc;
3898 #endif
3899 for (i = 0; i < cset->nranges; ++i)
3901 cmp_buf[0] = cset->range_starts[i];
3902 cmp_buf[4] = cset->range_ends[i];
3903 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3904 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3906 match_len = char_len;
3907 goto check_node_accept_bytes_match;
3911 check_node_accept_bytes_match:
3912 if (!cset->non_match)
3913 return match_len;
3914 else
3916 if (match_len > 0)
3917 return 0;
3918 else
3919 return (elem_len > char_len) ? elem_len : char_len;
3922 return 0;
3925 # ifdef _LIBC
3926 static unsigned int
3927 find_collation_sequence_value (mbs, mbs_len)
3928 const unsigned char *mbs;
3929 size_t mbs_len;
3931 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3932 if (nrules == 0)
3934 if (mbs_len == 1)
3936 /* No valid character. Match it as a single byte character. */
3937 const unsigned char *collseq = (const unsigned char *)
3938 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3939 return collseq[mbs[0]];
3941 return UINT_MAX;
3943 else
3945 int32_t idx;
3946 const unsigned char *extra = (const unsigned char *)
3947 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3948 int32_t extrasize = (const unsigned char *)
3949 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3951 for (idx = 0; idx < extrasize;)
3953 int mbs_cnt, found = 0;
3954 int32_t elem_mbs_len;
3955 /* Skip the name of collating element name. */
3956 idx = idx + extra[idx] + 1;
3957 elem_mbs_len = extra[idx++];
3958 if (mbs_len == elem_mbs_len)
3960 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3961 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3962 break;
3963 if (mbs_cnt == elem_mbs_len)
3964 /* Found the entry. */
3965 found = 1;
3967 /* Skip the byte sequence of the collating element. */
3968 idx += elem_mbs_len;
3969 /* Adjust for the alignment. */
3970 idx = (idx + 3) & ~3;
3971 /* Skip the collation sequence value. */
3972 idx += sizeof (uint32_t);
3973 /* Skip the wide char sequence of the collating element. */
3974 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3975 /* If we found the entry, return the sequence value. */
3976 if (found)
3977 return *(uint32_t *) (extra + idx);
3978 /* Skip the collation sequence value. */
3979 idx += sizeof (uint32_t);
3981 return UINT_MAX;
3984 # endif /* _LIBC */
3985 #endif /* RE_ENABLE_I18N */
3987 /* Check whether the node accepts the byte which is IDX-th
3988 byte of the INPUT. */
3990 static int
3991 check_node_accept (mctx, node, idx)
3992 const re_match_context_t *mctx;
3993 const re_token_t *node;
3994 int idx;
3996 re_dfa_t *const dfa = mctx->dfa;
3997 unsigned char ch;
3998 if (node->constraint)
4000 /* The node has constraints. Check whether the current context
4001 satisfies the constraints. */
4002 unsigned int context = re_string_context_at (&mctx->input, idx,
4003 mctx->eflags);
4004 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4005 return 0;
4007 ch = re_string_byte_at (&mctx->input, idx);
4008 switch (node->type)
4010 case CHARACTER:
4011 return node->opr.c == ch;
4012 case SIMPLE_BRACKET:
4013 return bitset_contain (node->opr.sbcset, ch);
4014 #ifdef RE_ENABLE_I18N
4015 case OP_UTF8_PERIOD:
4016 if (ch >= 0x80)
4017 return 0;
4018 /* FALLTHROUGH */
4019 #endif
4020 case OP_PERIOD:
4021 return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
4022 || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
4023 default:
4024 return 0;
4028 /* Extend the buffers, if the buffers have run out. */
4030 static reg_errcode_t
4031 extend_buffers (mctx)
4032 re_match_context_t *mctx;
4034 reg_errcode_t ret;
4035 re_string_t *pstr = &mctx->input;
4037 /* Double the lengthes of the buffers. */
4038 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4039 if (BE (ret != REG_NOERROR, 0))
4040 return ret;
4042 if (mctx->state_log != NULL)
4044 /* And double the length of state_log. */
4045 /* XXX We have no indication of the size of this buffer. If this
4046 allocation fail we have no indication that the state_log array
4047 does not have the right size. */
4048 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4049 pstr->bufs_len + 1);
4050 if (BE (new_array == NULL, 0))
4051 return REG_ESPACE;
4052 mctx->state_log = new_array;
4055 /* Then reconstruct the buffers. */
4056 if (pstr->icase)
4058 #ifdef RE_ENABLE_I18N
4059 if (pstr->mb_cur_max > 1)
4061 ret = build_wcs_upper_buffer (pstr);
4062 if (BE (ret != REG_NOERROR, 0))
4063 return ret;
4065 else
4066 #endif /* RE_ENABLE_I18N */
4067 build_upper_buffer (pstr);
4069 else
4071 #ifdef RE_ENABLE_I18N
4072 if (pstr->mb_cur_max > 1)
4073 build_wcs_buffer (pstr);
4074 else
4075 #endif /* RE_ENABLE_I18N */
4077 if (pstr->trans != NULL)
4078 re_string_translate_buffer (pstr);
4081 return REG_NOERROR;
4085 /* Functions for matching context. */
4087 /* Initialize MCTX. */
4089 static reg_errcode_t
4090 match_ctx_init (mctx, eflags, n)
4091 re_match_context_t *mctx;
4092 int eflags, n;
4094 mctx->eflags = eflags;
4095 mctx->match_last = -1;
4096 if (n > 0)
4098 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4099 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4100 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4101 return REG_ESPACE;
4103 /* Already zero-ed by the caller.
4104 else
4105 mctx->bkref_ents = NULL;
4106 mctx->nbkref_ents = 0;
4107 mctx->nsub_tops = 0; */
4108 mctx->abkref_ents = n;
4109 mctx->max_mb_elem_len = 1;
4110 mctx->asub_tops = n;
4111 return REG_NOERROR;
4114 /* Clean the entries which depend on the current input in MCTX.
4115 This function must be invoked when the matcher changes the start index
4116 of the input, or changes the input string. */
4118 static void
4119 match_ctx_clean (mctx)
4120 re_match_context_t *mctx;
4122 int st_idx;
4123 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4125 int sl_idx;
4126 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4127 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4129 re_sub_match_last_t *last = top->lasts[sl_idx];
4130 re_free (last->path.array);
4131 re_free (last);
4133 re_free (top->lasts);
4134 if (top->path)
4136 re_free (top->path->array);
4137 re_free (top->path);
4139 free (top);
4142 mctx->nsub_tops = 0;
4143 mctx->nbkref_ents = 0;
4146 /* Free all the memory associated with MCTX. */
4148 static void
4149 match_ctx_free (mctx)
4150 re_match_context_t *mctx;
4152 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4153 match_ctx_clean (mctx);
4154 re_free (mctx->sub_tops);
4155 re_free (mctx->bkref_ents);
4158 /* Add a new backreference entry to MCTX.
4159 Note that we assume that caller never call this function with duplicate
4160 entry, and call with STR_IDX which isn't smaller than any existing entry.
4163 static reg_errcode_t
4164 match_ctx_add_entry (mctx, node, str_idx, from, to)
4165 re_match_context_t *mctx;
4166 int node, str_idx, from, to;
4168 if (mctx->nbkref_ents >= mctx->abkref_ents)
4170 struct re_backref_cache_entry* new_entry;
4171 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4172 mctx->abkref_ents * 2);
4173 if (BE (new_entry == NULL, 0))
4175 re_free (mctx->bkref_ents);
4176 return REG_ESPACE;
4178 mctx->bkref_ents = new_entry;
4179 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4180 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4181 mctx->abkref_ents *= 2;
4183 if (mctx->nbkref_ents > 0
4184 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4185 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4187 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4188 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4189 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4190 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4192 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4193 If bit N is clear, means that this entry won't epsilon-transition to
4194 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4195 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4196 such node.
4198 A backreference does not epsilon-transition unless it is empty, so set
4199 to all zeros if FROM != TO. */
4200 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4201 = (from == to ? ~0 : 0);
4203 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4204 if (mctx->max_mb_elem_len < to - from)
4205 mctx->max_mb_elem_len = to - from;
4206 return REG_NOERROR;
4209 /* Search for the first entry which has the same str_idx, or -1 if none is
4210 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4212 static int
4213 search_cur_bkref_entry (mctx, str_idx)
4214 re_match_context_t *mctx;
4215 int str_idx;
4217 int left, right, mid, last;
4218 last = right = mctx->nbkref_ents;
4219 for (left = 0; left < right;)
4221 mid = (left + right) / 2;
4222 if (mctx->bkref_ents[mid].str_idx < str_idx)
4223 left = mid + 1;
4224 else
4225 right = mid;
4227 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4228 return left;
4229 else
4230 return -1;
4233 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4234 at STR_IDX. */
4236 static reg_errcode_t
4237 match_ctx_add_subtop (mctx, node, str_idx)
4238 re_match_context_t *mctx;
4239 int node, str_idx;
4241 #ifdef DEBUG
4242 assert (mctx->sub_tops != NULL);
4243 assert (mctx->asub_tops > 0);
4244 #endif
4245 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4247 int new_asub_tops = mctx->asub_tops * 2;
4248 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4249 re_sub_match_top_t *,
4250 new_asub_tops);
4251 if (BE (new_array == NULL, 0))
4252 return REG_ESPACE;
4253 mctx->sub_tops = new_array;
4254 mctx->asub_tops = new_asub_tops;
4256 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4257 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4258 return REG_ESPACE;
4259 mctx->sub_tops[mctx->nsub_tops]->node = node;
4260 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4261 return REG_NOERROR;
4264 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4265 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4267 static re_sub_match_last_t *
4268 match_ctx_add_sublast (subtop, node, str_idx)
4269 re_sub_match_top_t *subtop;
4270 int node, str_idx;
4272 re_sub_match_last_t *new_entry;
4273 if (BE (subtop->nlasts == subtop->alasts, 0))
4275 int new_alasts = 2 * subtop->alasts + 1;
4276 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4277 re_sub_match_last_t *,
4278 new_alasts);
4279 if (BE (new_array == NULL, 0))
4280 return NULL;
4281 subtop->lasts = new_array;
4282 subtop->alasts = new_alasts;
4284 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4285 if (BE (new_entry != NULL, 1))
4287 subtop->lasts[subtop->nlasts] = new_entry;
4288 new_entry->node = node;
4289 new_entry->str_idx = str_idx;
4290 ++subtop->nlasts;
4292 return new_entry;
4295 static void
4296 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4297 re_sift_context_t *sctx;
4298 re_dfastate_t **sifted_sts, **limited_sts;
4299 int last_node, last_str_idx;
4301 sctx->sifted_states = sifted_sts;
4302 sctx->limited_states = limited_sts;
4303 sctx->last_node = last_node;
4304 sctx->last_str_idx = last_str_idx;
4305 re_node_set_init_empty (&sctx->limits);