Updated to fedora-glibc-20041110T0839
[glibc.git] / posix / regexec.c
blob72ae70b9164bd457c9c8fc12b7afe708b69ca5f9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static void match_ctx_free_subtops (re_match_context_t *mctx)
26 internal_function;
27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
28 int str_idx, int from, int to)
29 internal_function;
30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
31 internal_function;
32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
33 int str_idx) internal_function;
34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
35 int node, int str_idx)
36 internal_function;
37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
38 re_dfastate_t **limited_sts, int last_node,
39 int last_str_idx)
40 internal_function;
41 static reg_errcode_t re_search_internal (const regex_t *preg,
42 const char *string, int length,
43 int start, int range, int stop,
44 size_t nmatch, regmatch_t pmatch[],
45 int eflags) internal_function;
46 static int re_search_2_stub (struct re_pattern_buffer *bufp,
47 const char *string1, int length1,
48 const char *string2, int length2,
49 int start, int range, struct re_registers *regs,
50 int stop, int ret_len) internal_function;
51 static int re_search_stub (struct re_pattern_buffer *bufp,
52 const char *string, int length, int start,
53 int range, int stop, struct re_registers *regs,
54 int ret_len) internal_function;
55 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
56 int nregs, int regs_allocated) internal_function;
57 static inline re_dfastate_t *acquire_init_state_context
58 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
59 __attribute ((always_inline)) internal_function;
60 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
61 internal_function;
62 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
63 int *p_match_first)
64 internal_function;
65 static int check_halt_node_context (const re_dfa_t *dfa, int node,
66 unsigned int context) internal_function;
67 static int check_halt_state_context (const re_match_context_t *mctx,
68 const re_dfastate_t *state, int idx)
69 internal_function;
70 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
71 regmatch_t *prev_idx_match, int cur_node,
72 int cur_idx, int nmatch) internal_function;
73 static int proceed_next_node (const re_match_context_t *mctx,
74 int nregs, regmatch_t *regs,
75 int *pidx, int node, re_node_set *eps_via_nodes,
76 struct re_fail_stack_t *fs) internal_function;
77 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
78 int str_idx, int *dests, int nregs,
79 regmatch_t *regs,
80 re_node_set *eps_via_nodes) internal_function;
81 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
82 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
83 static reg_errcode_t set_regs (const regex_t *preg,
84 const re_match_context_t *mctx,
85 size_t nmatch, regmatch_t *pmatch,
86 int fl_backtrack) internal_function;
87 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
89 #ifdef RE_ENABLE_I18N
90 static int sift_states_iter_mb (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int node_idx, int str_idx, int max_str_idx) internal_function;
93 #endif /* RE_ENABLE_I18N */
94 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
95 re_sift_context_t *sctx) internal_function;
96 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
97 re_sift_context_t *sctx, int str_idx,
98 re_node_set *cur_dest) internal_function;
99 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
100 re_sift_context_t *sctx,
101 int str_idx,
102 re_node_set *dest_nodes) internal_function;
103 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
104 re_node_set *dest_nodes,
105 const re_node_set *candidates) internal_function;
106 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
107 re_node_set *dest_nodes,
108 const re_node_set *and_nodes) internal_function;
109 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
110 int dst_node, int dst_idx, int src_node,
111 int src_idx) internal_function;
112 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
113 int boundaries, int subexp_idx,
114 int from_node, int bkref_idx) internal_function;
115 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
116 int limit, int subexp_idx,
117 int node, int str_idx,
118 int bkref_idx) internal_function;
119 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
120 re_node_set *dest_nodes,
121 const re_node_set *candidates,
122 re_node_set *limits,
123 struct re_backref_cache_entry *bkref_ents,
124 int str_idx) internal_function;
125 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
126 re_sift_context_t *sctx,
127 int str_idx, const re_node_set *candidates) internal_function;
128 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
129 int next_state_log_idx) internal_function;
130 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
131 re_dfastate_t **src, int num) internal_function;
132 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
133 re_match_context_t *mctx) internal_function;
134 static re_dfastate_t *transit_state (reg_errcode_t *err,
135 re_match_context_t *mctx,
136 re_dfastate_t *state) internal_function;
137 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
138 re_match_context_t *mctx,
139 re_dfastate_t *next_state) internal_function;
140 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
141 re_node_set *cur_nodes,
142 int str_idx) internal_function;
143 #if 0
144 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
145 re_match_context_t *mctx,
146 re_dfastate_t *pstate) internal_function;
147 #endif
148 #ifdef RE_ENABLE_I18N
149 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
150 re_dfastate_t *pstate) internal_function;
151 #endif /* RE_ENABLE_I18N */
152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
153 const re_node_set *nodes) internal_function;
154 static reg_errcode_t get_subexp (re_match_context_t *mctx,
155 int bkref_node, int bkref_str_idx) internal_function;
156 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
157 const re_sub_match_top_t *sub_top,
158 re_sub_match_last_t *sub_last,
159 int bkref_node, int bkref_str) internal_function;
160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 int subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, int top_node,
164 int top_str, int last_node, int last_str,
165 int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 int str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes) internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type) internal_function;
173 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
174 re_node_set *dst_nodes,
175 int target, int ex_subexp,
176 int type) internal_function;
177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
178 re_node_set *cur_nodes, int cur_str,
179 int subexp_num, int type) internal_function;
180 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
181 re_dfastate_t *state) internal_function;
182 #ifdef RE_ENABLE_I18N
183 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
184 const re_string_t *input, int idx) internal_function;
185 # ifdef _LIBC
186 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
187 size_t name_len) internal_function;
188 # endif /* _LIBC */
189 #endif /* RE_ENABLE_I18N */
190 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
191 const re_dfastate_t *state,
192 re_node_set *states_node,
193 bitset *states_ch) internal_function;
194 static int check_node_accept (const re_match_context_t *mctx,
195 const re_token_t *node, int idx) internal_function;
196 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
198 /* Entry point for POSIX code. */
200 /* regexec searches for a given pattern, specified by PREG, in the
201 string STRING.
203 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
204 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
205 least NMATCH elements, and we set them to the offsets of the
206 corresponding matched substrings.
208 EFLAGS specifies `execution flags' which affect matching: if
209 REG_NOTBOL is set, then ^ does not match at the beginning of the
210 string; if REG_NOTEOL is set, then $ does not match at the end.
212 We return 0 if we find a match and REG_NOMATCH if not. */
215 regexec (preg, string, nmatch, pmatch, eflags)
216 const regex_t *__restrict preg;
217 const char *__restrict string;
218 size_t nmatch;
219 regmatch_t pmatch[];
220 int eflags;
222 reg_errcode_t err;
223 int start, length;
225 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
226 return REG_BADPAT;
228 if (eflags & REG_STARTEND)
230 start = pmatch[0].rm_so;
231 length = pmatch[0].rm_eo;
233 else
235 start = 0;
236 length = strlen (string);
238 if (preg->no_sub)
239 err = re_search_internal (preg, string, length, start, length - start,
240 length, 0, NULL, eflags);
241 else
242 err = re_search_internal (preg, string, length, start, length - start,
243 length, nmatch, pmatch, eflags);
244 return err != REG_NOERROR;
247 #ifdef _LIBC
248 # include <shlib-compat.h>
249 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
251 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
252 __typeof__ (__regexec) __compat_regexec;
255 attribute_compat_text_section
256 __compat_regexec (const regex_t *__restrict preg,
257 const char *__restrict string, size_t nmatch,
258 regmatch_t pmatch[], int eflags)
260 return regexec (preg, string, nmatch, pmatch,
261 eflags & (REG_NOTBOL | REG_NOTEOL));
263 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
264 # endif
265 #endif
267 /* Entry points for GNU code. */
269 /* re_match, re_search, re_match_2, re_search_2
271 The former two functions operate on STRING with length LENGTH,
272 while the later two operate on concatenation of STRING1 and STRING2
273 with lengths LENGTH1 and LENGTH2, respectively.
275 re_match() matches the compiled pattern in BUFP against the string,
276 starting at index START.
278 re_search() first tries matching at index START, then it tries to match
279 starting from index START + 1, and so on. The last start position tried
280 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
281 way as re_match().)
283 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
284 the first STOP characters of the concatenation of the strings should be
285 concerned.
287 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
288 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
289 computed relative to the concatenation, not relative to the individual
290 strings.)
292 On success, re_match* functions return the length of the match, re_search*
293 return the position of the start of the match. Return value -1 means no
294 match was found and -2 indicates an internal error. */
297 re_match (bufp, string, length, start, regs)
298 struct re_pattern_buffer *bufp;
299 const char *string;
300 int length, start;
301 struct re_registers *regs;
303 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
305 #ifdef _LIBC
306 weak_alias (__re_match, re_match)
307 #endif
310 re_search (bufp, string, length, start, range, regs)
311 struct re_pattern_buffer *bufp;
312 const char *string;
313 int length, start, range;
314 struct re_registers *regs;
316 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
318 #ifdef _LIBC
319 weak_alias (__re_search, re_search)
320 #endif
323 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
324 struct re_pattern_buffer *bufp;
325 const char *string1, *string2;
326 int length1, length2, start, stop;
327 struct re_registers *regs;
329 return re_search_2_stub (bufp, string1, length1, string2, length2,
330 start, 0, regs, stop, 1);
332 #ifdef _LIBC
333 weak_alias (__re_match_2, re_match_2)
334 #endif
337 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
338 struct re_pattern_buffer *bufp;
339 const char *string1, *string2;
340 int length1, length2, start, range, stop;
341 struct re_registers *regs;
343 return re_search_2_stub (bufp, string1, length1, string2, length2,
344 start, range, regs, stop, 0);
346 #ifdef _LIBC
347 weak_alias (__re_search_2, re_search_2)
348 #endif
350 static int
351 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
352 stop, ret_len)
353 struct re_pattern_buffer *bufp;
354 const char *string1, *string2;
355 int length1, length2, start, range, stop, ret_len;
356 struct re_registers *regs;
358 const char *str;
359 int rval;
360 int len = length1 + length2;
361 int free_str = 0;
363 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
364 return -2;
366 /* Concatenate the strings. */
367 if (length2 > 0)
368 if (length1 > 0)
370 char *s = re_malloc (char, len);
372 if (BE (s == NULL, 0))
373 return -2;
374 memcpy (s, string1, length1);
375 memcpy (s + length1, string2, length2);
376 str = s;
377 free_str = 1;
379 else
380 str = string2;
381 else
382 str = string1;
384 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
385 ret_len);
386 if (free_str)
387 re_free ((char *) str);
388 return rval;
391 /* The parameters have the same meaning as those of re_search.
392 Additional parameters:
393 If RET_LEN is nonzero the length of the match is returned (re_match style);
394 otherwise the position of the match is returned. */
396 static int
397 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
398 struct re_pattern_buffer *bufp;
399 const char *string;
400 int length, start, range, stop, ret_len;
401 struct re_registers *regs;
403 reg_errcode_t result;
404 regmatch_t *pmatch;
405 int nregs, rval;
406 int eflags = 0;
408 /* Check for out-of-range. */
409 if (BE (start < 0 || start > length, 0))
410 return -1;
411 if (BE (start + range > length, 0))
412 range = length - start;
413 else if (BE (start + range < 0, 0))
414 range = -start;
416 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
417 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
419 /* Compile fastmap if we haven't yet. */
420 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
421 re_compile_fastmap (bufp);
423 if (BE (bufp->no_sub, 0))
424 regs = NULL;
426 /* We need at least 1 register. */
427 if (regs == NULL)
428 nregs = 1;
429 else if (BE (bufp->regs_allocated == REGS_FIXED &&
430 regs->num_regs < bufp->re_nsub + 1, 0))
432 nregs = regs->num_regs;
433 if (BE (nregs < 1, 0))
435 /* Nothing can be copied to regs. */
436 regs = NULL;
437 nregs = 1;
440 else
441 nregs = bufp->re_nsub + 1;
442 pmatch = re_malloc (regmatch_t, nregs);
443 if (BE (pmatch == NULL, 0))
444 return -2;
446 result = re_search_internal (bufp, string, length, start, range, stop,
447 nregs, pmatch, eflags);
449 rval = 0;
451 /* I hope we needn't fill ther regs with -1's when no match was found. */
452 if (result != REG_NOERROR)
453 rval = -1;
454 else if (regs != NULL)
456 /* If caller wants register contents data back, copy them. */
457 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
458 bufp->regs_allocated);
459 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
460 rval = -2;
463 if (BE (rval == 0, 1))
465 if (ret_len)
467 assert (pmatch[0].rm_so == start);
468 rval = pmatch[0].rm_eo - start;
470 else
471 rval = pmatch[0].rm_so;
473 re_free (pmatch);
474 return rval;
477 static unsigned
478 re_copy_regs (regs, pmatch, nregs, regs_allocated)
479 struct re_registers *regs;
480 regmatch_t *pmatch;
481 int nregs, regs_allocated;
483 int rval = REGS_REALLOCATE;
484 int i;
485 int need_regs = nregs + 1;
486 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
487 uses. */
489 /* Have the register data arrays been allocated? */
490 if (regs_allocated == REGS_UNALLOCATED)
491 { /* No. So allocate them with malloc. */
492 regs->start = re_malloc (regoff_t, need_regs);
493 regs->end = re_malloc (regoff_t, need_regs);
494 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
495 return REGS_UNALLOCATED;
496 regs->num_regs = need_regs;
498 else if (regs_allocated == REGS_REALLOCATE)
499 { /* Yes. If we need more elements than were already
500 allocated, reallocate them. If we need fewer, just
501 leave it alone. */
502 if (BE (need_regs > regs->num_regs, 0))
504 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
505 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
506 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
507 return REGS_UNALLOCATED;
508 regs->start = new_start;
509 regs->end = new_end;
510 regs->num_regs = need_regs;
513 else
515 assert (regs_allocated == REGS_FIXED);
516 /* This function may not be called with REGS_FIXED and nregs too big. */
517 assert (regs->num_regs >= nregs);
518 rval = REGS_FIXED;
521 /* Copy the regs. */
522 for (i = 0; i < nregs; ++i)
524 regs->start[i] = pmatch[i].rm_so;
525 regs->end[i] = pmatch[i].rm_eo;
527 for ( ; i < regs->num_regs; ++i)
528 regs->start[i] = regs->end[i] = -1;
530 return rval;
533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
534 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
535 this memory for recording register information. STARTS and ENDS
536 must be allocated using the malloc library routine, and must each
537 be at least NUM_REGS * sizeof (regoff_t) bytes long.
539 If NUM_REGS == 0, then subsequent matches should allocate their own
540 register data.
542 Unless this function is called, the first search or match using
543 PATTERN_BUFFER will allocate its own register data, without
544 freeing the old data. */
546 void
547 re_set_registers (bufp, regs, num_regs, starts, ends)
548 struct re_pattern_buffer *bufp;
549 struct re_registers *regs;
550 unsigned num_regs;
551 regoff_t *starts, *ends;
553 if (num_regs)
555 bufp->regs_allocated = REGS_REALLOCATE;
556 regs->num_regs = num_regs;
557 regs->start = starts;
558 regs->end = ends;
560 else
562 bufp->regs_allocated = REGS_UNALLOCATED;
563 regs->num_regs = 0;
564 regs->start = regs->end = (regoff_t *) 0;
567 #ifdef _LIBC
568 weak_alias (__re_set_registers, re_set_registers)
569 #endif
571 /* Entry points compatible with 4.2 BSD regex library. We don't define
572 them unless specifically requested. */
574 #if defined _REGEX_RE_COMP || defined _LIBC
576 # ifdef _LIBC
577 weak_function
578 # endif
579 re_exec (s)
580 const char *s;
582 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
584 #endif /* _REGEX_RE_COMP */
586 /* Internal entry point. */
588 /* Searches for a compiled pattern PREG in the string STRING, whose
589 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
590 mingings with regexec. START, and RANGE have the same meanings
591 with re_search.
592 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
593 otherwise return the error code.
594 Note: We assume front end functions already check ranges.
595 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
597 static reg_errcode_t
598 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
599 eflags)
600 const regex_t *preg;
601 const char *string;
602 int length, start, range, stop, eflags;
603 size_t nmatch;
604 regmatch_t pmatch[];
606 reg_errcode_t err;
607 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
608 int left_lim, right_lim, incr;
609 int fl_longest_match, match_first, match_last = -1;
610 int fast_translate, sb;
611 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
612 re_match_context_t mctx = { .dfa = dfa };
613 #else
614 re_match_context_t mctx;
615 #endif
616 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
617 && range && !preg->can_be_null) ? preg->fastmap : NULL);
619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
620 memset (&mctx, '\0', sizeof (re_match_context_t));
621 mctx.dfa = dfa;
622 #endif
624 /* Check if the DFA haven't been compiled. */
625 if (BE (preg->used == 0 || dfa->init_state == NULL
626 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
627 || dfa->init_state_begbuf == NULL, 0))
628 return REG_NOMATCH;
630 #ifdef DEBUG
631 /* We assume front-end functions already check them. */
632 assert (start + range >= 0 && start + range <= length);
633 #endif
635 /* If initial states with non-begbuf contexts have no elements,
636 the regex must be anchored. If preg->newline_anchor is set,
637 we'll never use init_state_nl, so do not check it. */
638 if (dfa->init_state->nodes.nelem == 0
639 && dfa->init_state_word->nodes.nelem == 0
640 && (dfa->init_state_nl->nodes.nelem == 0
641 || !preg->newline_anchor))
643 if (start != 0 && start + range != 0)
644 return REG_NOMATCH;
645 start = range = 0;
648 /* We must check the longest matching, if nmatch > 0. */
649 fl_longest_match = (nmatch != 0 || dfa->nbackref);
651 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
652 preg->translate, preg->syntax & RE_ICASE, dfa);
653 if (BE (err != REG_NOERROR, 0))
654 goto free_return;
655 mctx.input.stop = stop;
656 mctx.input.raw_stop = stop;
657 mctx.input.newline_anchor = preg->newline_anchor;
659 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
660 if (BE (err != REG_NOERROR, 0))
661 goto free_return;
663 /* We will log all the DFA states through which the dfa pass,
664 if nmatch > 1, or this dfa has "multibyte node", which is a
665 back-reference or a node which can accept multibyte character or
666 multi character collating element. */
667 if (nmatch > 1 || dfa->has_mb_node)
669 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
670 if (BE (mctx.state_log == NULL, 0))
672 err = REG_ESPACE;
673 goto free_return;
676 else
677 mctx.state_log = NULL;
679 match_first = start;
680 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
681 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
683 /* Check incrementally whether of not the input string match. */
684 incr = (range < 0) ? -1 : 1;
685 left_lim = (range < 0) ? start + range : start;
686 right_lim = (range < 0) ? start : start + range;
687 sb = dfa->mb_cur_max == 1;
688 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
690 for (;;)
692 /* At first get the current byte from input string. */
693 if (fastmap)
695 if (BE (fast_translate, 1))
697 unsigned RE_TRANSLATE_TYPE t
698 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
699 if (BE (range >= 0, 1))
701 if (BE (t != NULL, 0))
703 while (BE (match_first < right_lim, 1)
704 && !fastmap[t[(unsigned char) string[match_first]]])
705 ++match_first;
707 else
709 while (BE (match_first < right_lim, 1)
710 && !fastmap[(unsigned char) string[match_first]])
711 ++match_first;
713 if (BE (match_first == right_lim, 0))
715 int ch = match_first >= length
716 ? 0 : (unsigned char) string[match_first];
717 if (!fastmap[t ? t[ch] : ch])
718 break;
721 else
723 while (match_first >= left_lim)
725 int ch = match_first >= length
726 ? 0 : (unsigned char) string[match_first];
727 if (fastmap[t ? t[ch] : ch])
728 break;
729 --match_first;
731 if (match_first < left_lim)
732 break;
735 else
737 int ch;
741 /* In this case, we can't determine easily the current byte,
742 since it might be a component byte of a multibyte
743 character. Then we use the constructed buffer
744 instead. */
745 /* If MATCH_FIRST is out of the valid range, reconstruct the
746 buffers. */
747 if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
748 <= match_first
749 || match_first < mctx.input.raw_mbs_idx)
751 err = re_string_reconstruct (&mctx.input, match_first,
752 eflags);
753 if (BE (err != REG_NOERROR, 0))
754 goto free_return;
756 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
757 Note that MATCH_FIRST must not be smaller than 0. */
758 ch = ((match_first >= length) ? 0
759 : re_string_byte_at (&mctx.input,
760 match_first
761 - mctx.input.raw_mbs_idx));
762 if (fastmap[ch])
763 break;
764 match_first += incr;
766 while (match_first >= left_lim && match_first <= right_lim);
767 if (! fastmap[ch])
768 break;
772 /* Reconstruct the buffers so that the matcher can assume that
773 the matching starts from the beginning of the buffer. */
774 err = re_string_reconstruct (&mctx.input, match_first, eflags);
775 if (BE (err != REG_NOERROR, 0))
776 goto free_return;
777 #ifdef RE_ENABLE_I18N
778 /* Eliminate it when it is a component of a multibyte character
779 and isn't the head of a multibyte character. */
780 if (sb || re_string_first_byte (&mctx.input, 0))
781 #endif
783 /* It seems to be appropriate one, then use the matcher. */
784 /* We assume that the matching starts from 0. */
785 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
786 match_last = check_matching (&mctx, fl_longest_match,
787 range >= 0 ? &match_first : NULL);
788 if (match_last != -1)
790 if (BE (match_last == -2, 0))
792 err = REG_ESPACE;
793 goto free_return;
795 else
797 mctx.match_last = match_last;
798 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
800 re_dfastate_t *pstate = mctx.state_log[match_last];
801 mctx.last_node = check_halt_state_context (&mctx, pstate,
802 match_last);
804 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
805 || dfa->nbackref)
807 err = prune_impossible_nodes (&mctx);
808 if (err == REG_NOERROR)
809 break;
810 if (BE (err != REG_NOMATCH, 0))
811 goto free_return;
812 match_last = -1;
814 else
815 break; /* We found a match. */
818 match_ctx_clean (&mctx);
820 /* Update counter. */
821 match_first += incr;
822 if (match_first < left_lim || right_lim < match_first)
823 break;
826 /* Set pmatch[] if we need. */
827 if (match_last != -1 && nmatch > 0)
829 int reg_idx;
831 /* Initialize registers. */
832 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
833 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
835 /* Set the points where matching start/end. */
836 pmatch[0].rm_so = 0;
837 pmatch[0].rm_eo = mctx.match_last;
839 if (!preg->no_sub && nmatch > 1)
841 err = set_regs (preg, &mctx, nmatch, pmatch,
842 dfa->has_plural_match && dfa->nbackref > 0);
843 if (BE (err != REG_NOERROR, 0))
844 goto free_return;
847 /* At last, add the offset to the each registers, since we slided
848 the buffers so that we could assume that the matching starts
849 from 0. */
850 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
851 if (pmatch[reg_idx].rm_so != -1)
853 #ifdef RE_ENABLE_I18N
854 if (BE (mctx.input.offsets_needed != 0, 0))
856 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
857 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
858 else
859 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
860 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
861 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
862 else
863 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
865 #else
866 assert (mctx.input.offsets_needed == 0);
867 #endif
868 pmatch[reg_idx].rm_so += match_first;
869 pmatch[reg_idx].rm_eo += match_first;
872 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
873 free_return:
874 re_free (mctx.state_log);
875 if (dfa->nbackref)
876 match_ctx_free (&mctx);
877 re_string_destruct (&mctx.input);
878 return err;
881 static reg_errcode_t
882 prune_impossible_nodes (mctx)
883 re_match_context_t *mctx;
885 re_dfa_t *const dfa = mctx->dfa;
886 int halt_node, match_last;
887 reg_errcode_t ret;
888 re_dfastate_t **sifted_states;
889 re_dfastate_t **lim_states = NULL;
890 re_sift_context_t sctx;
891 #ifdef DEBUG
892 assert (mctx->state_log != NULL);
893 #endif
894 match_last = mctx->match_last;
895 halt_node = mctx->last_node;
896 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
897 if (BE (sifted_states == NULL, 0))
899 ret = REG_ESPACE;
900 goto free_return;
902 if (dfa->nbackref)
904 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
905 if (BE (lim_states == NULL, 0))
907 ret = REG_ESPACE;
908 goto free_return;
910 while (1)
912 memset (lim_states, '\0',
913 sizeof (re_dfastate_t *) * (match_last + 1));
914 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
915 match_last);
916 ret = sift_states_backward (mctx, &sctx);
917 re_node_set_free (&sctx.limits);
918 if (BE (ret != REG_NOERROR, 0))
919 goto free_return;
920 if (sifted_states[0] != NULL || lim_states[0] != NULL)
921 break;
924 --match_last;
925 if (match_last < 0)
927 ret = REG_NOMATCH;
928 goto free_return;
930 } while (mctx->state_log[match_last] == NULL
931 || !mctx->state_log[match_last]->halt);
932 halt_node = check_halt_state_context (mctx,
933 mctx->state_log[match_last],
934 match_last);
936 ret = merge_state_array (dfa, sifted_states, lim_states,
937 match_last + 1);
938 re_free (lim_states);
939 lim_states = NULL;
940 if (BE (ret != REG_NOERROR, 0))
941 goto free_return;
943 else
945 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
946 ret = sift_states_backward (mctx, &sctx);
947 re_node_set_free (&sctx.limits);
948 if (BE (ret != REG_NOERROR, 0))
949 goto free_return;
951 re_free (mctx->state_log);
952 mctx->state_log = sifted_states;
953 sifted_states = NULL;
954 mctx->last_node = halt_node;
955 mctx->match_last = match_last;
956 ret = REG_NOERROR;
957 free_return:
958 re_free (sifted_states);
959 re_free (lim_states);
960 return ret;
963 /* Acquire an initial state and return it.
964 We must select appropriate initial state depending on the context,
965 since initial states may have constraints like "\<", "^", etc.. */
967 static inline re_dfastate_t *
968 acquire_init_state_context (err, mctx, idx)
969 reg_errcode_t *err;
970 const re_match_context_t *mctx;
971 int idx;
973 re_dfa_t *const dfa = mctx->dfa;
974 if (dfa->init_state->has_constraint)
976 unsigned int context;
977 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
978 if (IS_WORD_CONTEXT (context))
979 return dfa->init_state_word;
980 else if (IS_ORDINARY_CONTEXT (context))
981 return dfa->init_state;
982 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
983 return dfa->init_state_begbuf;
984 else if (IS_NEWLINE_CONTEXT (context))
985 return dfa->init_state_nl;
986 else if (IS_BEGBUF_CONTEXT (context))
988 /* It is relatively rare case, then calculate on demand. */
989 return re_acquire_state_context (err, dfa,
990 dfa->init_state->entrance_nodes,
991 context);
993 else
994 /* Must not happen? */
995 return dfa->init_state;
997 else
998 return dfa->init_state;
1001 /* Check whether the regular expression match input string INPUT or not,
1002 and return the index where the matching end, return -1 if not match,
1003 or return -2 in case of an error.
1004 FL_LONGEST_MATCH means we want the POSIX longest matching.
1005 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1006 next place where we may want to try matching.
1007 Note that the matcher assume that the maching starts from the current
1008 index of the buffer. */
1010 static int
1011 check_matching (mctx, fl_longest_match, p_match_first)
1012 re_match_context_t *mctx;
1013 int fl_longest_match;
1014 int *p_match_first;
1016 re_dfa_t *const dfa = mctx->dfa;
1017 reg_errcode_t err;
1018 int match = 0;
1019 int match_last = -1;
1020 int cur_str_idx = re_string_cur_idx (&mctx->input);
1021 re_dfastate_t *cur_state;
1022 int at_init_state = p_match_first != NULL;
1023 int next_start_idx = cur_str_idx;
1025 err = REG_NOERROR;
1026 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1027 /* An initial state must not be NULL (invalid). */
1028 if (BE (cur_state == NULL, 0))
1030 assert (err == REG_ESPACE);
1031 return -2;
1034 if (mctx->state_log != NULL)
1036 mctx->state_log[cur_str_idx] = cur_state;
1038 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1039 later. E.g. Processing back references. */
1040 if (BE (dfa->nbackref, 0))
1042 at_init_state = 0;
1043 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1044 if (BE (err != REG_NOERROR, 0))
1045 return err;
1047 if (cur_state->has_backref)
1049 err = transit_state_bkref (mctx, &cur_state->nodes);
1050 if (BE (err != REG_NOERROR, 0))
1051 return err;
1056 /* If the RE accepts NULL string. */
1057 if (BE (cur_state->halt, 0))
1059 if (!cur_state->has_constraint
1060 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1062 if (!fl_longest_match)
1063 return cur_str_idx;
1064 else
1066 match_last = cur_str_idx;
1067 match = 1;
1072 while (!re_string_eoi (&mctx->input))
1074 re_dfastate_t *old_state = cur_state;
1075 cur_state = transit_state (&err, mctx, cur_state);
1076 if (mctx->state_log != NULL)
1077 cur_state = merge_state_with_log (&err, mctx, cur_state);
1079 if (cur_state == NULL)
1081 /* Reached the invalid state or an error. Try to recover a valid
1082 state using the state log, if available and if we have not
1083 already found a valid (even if not the longest) match. */
1084 if (BE (err != REG_NOERROR, 0))
1085 return -2;
1087 if (mctx->state_log == NULL
1088 || (match && !fl_longest_match)
1089 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1090 break;
1093 if (at_init_state)
1095 if (old_state == cur_state)
1096 next_start_idx = re_string_cur_idx (&mctx->input);
1097 else
1098 at_init_state = 0;
1101 if (cur_state->halt)
1103 /* Reached a halt state.
1104 Check the halt state can satisfy the current context. */
1105 if (!cur_state->has_constraint
1106 || check_halt_state_context (mctx, cur_state,
1107 re_string_cur_idx (&mctx->input)))
1109 /* We found an appropriate halt state. */
1110 match_last = re_string_cur_idx (&mctx->input);
1111 match = 1;
1112 if (!fl_longest_match)
1113 break;
1118 if (match_last == -1 && p_match_first)
1119 *p_match_first += next_start_idx;
1121 return match_last;
1124 /* Check NODE match the current context. */
1126 static int check_halt_node_context (dfa, node, context)
1127 const re_dfa_t *dfa;
1128 int node;
1129 unsigned int context;
1131 re_token_type_t type = dfa->nodes[node].type;
1132 unsigned int constraint = dfa->nodes[node].constraint;
1133 if (type != END_OF_RE)
1134 return 0;
1135 if (!constraint)
1136 return 1;
1137 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1138 return 0;
1139 return 1;
1142 /* Check the halt state STATE match the current context.
1143 Return 0 if not match, if the node, STATE has, is a halt node and
1144 match the context, return the node. */
1146 static int
1147 check_halt_state_context (mctx, state, idx)
1148 const re_match_context_t *mctx;
1149 const re_dfastate_t *state;
1150 int idx;
1152 int i;
1153 unsigned int context;
1154 #ifdef DEBUG
1155 assert (state->halt);
1156 #endif
1157 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1158 for (i = 0; i < state->nodes.nelem; ++i)
1159 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1160 return state->nodes.elems[i];
1161 return 0;
1164 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1165 corresponding to the DFA).
1166 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1167 of errors. */
1169 static int
1170 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1171 const re_match_context_t *mctx;
1172 regmatch_t *regs;
1173 int nregs, *pidx, node;
1174 re_node_set *eps_via_nodes;
1175 struct re_fail_stack_t *fs;
1177 re_dfa_t *const dfa = mctx->dfa;
1178 int i, err, dest_node;
1179 dest_node = -1;
1180 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1182 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1183 int ndest, dest_nodes[2];
1184 err = re_node_set_insert (eps_via_nodes, node);
1185 if (BE (err < 0, 0))
1186 return -2;
1187 /* Pick up valid destinations. */
1188 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1190 int candidate = dfa->edests[node].elems[i];
1191 if (!re_node_set_contains (cur_nodes, candidate))
1192 continue;
1193 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1194 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1195 ++ndest;
1197 if (ndest <= 1)
1198 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1199 /* In order to avoid infinite loop like "(a*)*". */
1200 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1201 return dest_nodes[1];
1202 if (fs != NULL
1203 && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1204 eps_via_nodes))
1205 return -2;
1206 return dest_nodes[0];
1208 else
1210 int naccepted = 0;
1211 re_token_type_t type = dfa->nodes[node].type;
1213 #ifdef RE_ENABLE_I18N
1214 if (ACCEPT_MB_NODE (type))
1215 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1216 else
1217 #endif /* RE_ENABLE_I18N */
1218 if (type == OP_BACK_REF)
1220 int subexp_idx = dfa->nodes[node].opr.idx;
1221 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1222 if (fs != NULL)
1224 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1225 return -1;
1226 else if (naccepted)
1228 char *buf = (char *) re_string_get_buffer (&mctx->input);
1229 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1230 naccepted) != 0)
1231 return -1;
1235 if (naccepted == 0)
1237 err = re_node_set_insert (eps_via_nodes, node);
1238 if (BE (err < 0, 0))
1239 return -2;
1240 dest_node = dfa->edests[node].elems[0];
1241 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1242 dest_node))
1243 return dest_node;
1247 if (naccepted != 0
1248 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1250 dest_node = dfa->nexts[node];
1251 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1252 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1253 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1254 dest_node)))
1255 return -1;
1256 re_node_set_empty (eps_via_nodes);
1257 return dest_node;
1260 return -1;
1263 static reg_errcode_t
1264 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1265 struct re_fail_stack_t *fs;
1266 int str_idx, *dests, nregs;
1267 regmatch_t *regs;
1268 re_node_set *eps_via_nodes;
1270 reg_errcode_t err;
1271 int num = fs->num++;
1272 if (fs->num == fs->alloc)
1274 struct re_fail_stack_ent_t *new_array;
1275 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1276 * fs->alloc * 2));
1277 if (new_array == NULL)
1278 return REG_ESPACE;
1279 fs->alloc *= 2;
1280 fs->stack = new_array;
1282 fs->stack[num].idx = str_idx;
1283 fs->stack[num].node = dests[1];
1284 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1285 if (fs->stack[num].regs == NULL)
1286 return REG_ESPACE;
1287 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1288 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1289 return err;
1292 static int
1293 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1294 struct re_fail_stack_t *fs;
1295 int *pidx, nregs;
1296 regmatch_t *regs;
1297 re_node_set *eps_via_nodes;
1299 int num = --fs->num;
1300 assert (num >= 0);
1301 *pidx = fs->stack[num].idx;
1302 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1303 re_node_set_free (eps_via_nodes);
1304 re_free (fs->stack[num].regs);
1305 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1306 return fs->stack[num].node;
1309 /* Set the positions where the subexpressions are starts/ends to registers
1310 PMATCH.
1311 Note: We assume that pmatch[0] is already set, and
1312 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1314 static reg_errcode_t
1315 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1316 const regex_t *preg;
1317 const re_match_context_t *mctx;
1318 size_t nmatch;
1319 regmatch_t *pmatch;
1320 int fl_backtrack;
1322 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323 int idx, cur_node, real_nmatch;
1324 re_node_set eps_via_nodes;
1325 struct re_fail_stack_t *fs;
1326 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1327 regmatch_t *prev_idx_match;
1329 #ifdef DEBUG
1330 assert (nmatch > 1);
1331 assert (mctx->state_log != NULL);
1332 #endif
1333 if (fl_backtrack)
1335 fs = &fs_body;
1336 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1337 if (fs->stack == NULL)
1338 return REG_ESPACE;
1340 else
1341 fs = NULL;
1343 cur_node = dfa->init_node;
1344 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1345 re_node_set_init_empty (&eps_via_nodes);
1347 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1348 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1350 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1352 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1354 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1356 int reg_idx;
1357 if (fs)
1359 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1360 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1361 break;
1362 if (reg_idx == nmatch)
1364 re_node_set_free (&eps_via_nodes);
1365 return free_fail_stack_return (fs);
1367 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1368 &eps_via_nodes);
1370 else
1372 re_node_set_free (&eps_via_nodes);
1373 return REG_NOERROR;
1377 /* Proceed to next node. */
1378 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1379 &eps_via_nodes, fs);
1381 if (BE (cur_node < 0, 0))
1383 if (BE (cur_node == -2, 0))
1385 re_node_set_free (&eps_via_nodes);
1386 free_fail_stack_return (fs);
1387 return REG_ESPACE;
1389 if (fs)
1390 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1391 &eps_via_nodes);
1392 else
1394 re_node_set_free (&eps_via_nodes);
1395 return REG_NOMATCH;
1399 re_node_set_free (&eps_via_nodes);
1400 return free_fail_stack_return (fs);
1403 static reg_errcode_t
1404 free_fail_stack_return (fs)
1405 struct re_fail_stack_t *fs;
1407 if (fs)
1409 int fs_idx;
1410 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1412 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1413 re_free (fs->stack[fs_idx].regs);
1415 re_free (fs->stack);
1417 return REG_NOERROR;
1420 static void
1421 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1422 re_dfa_t *dfa;
1423 regmatch_t *pmatch, *prev_idx_match;
1424 int cur_node, cur_idx, nmatch;
1426 int type = dfa->nodes[cur_node].type;
1427 if (type == OP_OPEN_SUBEXP)
1429 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1431 /* We are at the first node of this sub expression. */
1432 if (reg_num < nmatch)
1434 pmatch[reg_num].rm_so = cur_idx;
1435 pmatch[reg_num].rm_eo = -1;
1438 else if (type == OP_CLOSE_SUBEXP)
1440 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1441 if (reg_num < nmatch)
1443 /* We are at the last node of this sub expression. */
1444 if (pmatch[reg_num].rm_so < cur_idx)
1446 pmatch[reg_num].rm_eo = cur_idx;
1447 /* This is a non-empty match or we are not inside an optional
1448 subexpression. Accept this right away. */
1449 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1451 else
1453 if (dfa->nodes[cur_node].opt_subexp
1454 && prev_idx_match[reg_num].rm_so != -1)
1455 /* We transited through an empty match for an optional
1456 subexpression, like (a?)*, and this is not the subexp's
1457 first match. Copy back the old content of the registers
1458 so that matches of an inner subexpression are undone as
1459 well, like in ((a?))*. */
1460 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1461 else
1462 /* We completed a subexpression, but it may be part of
1463 an optional one, so do not update PREV_IDX_MATCH. */
1464 pmatch[reg_num].rm_eo = cur_idx;
1470 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1471 and sift the nodes in each states according to the following rules.
1472 Updated state_log will be wrote to STATE_LOG.
1474 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1475 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1476 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1477 the LAST_NODE, we throw away the node `a'.
1478 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1479 string `s' and transit to `b':
1480 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1481 away the node `a'.
1482 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1483 thrown away, we throw away the node `a'.
1484 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1485 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1486 node `a'.
1487 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1488 we throw away the node `a'. */
1490 #define STATE_NODE_CONTAINS(state,node) \
1491 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1493 static reg_errcode_t
1494 sift_states_backward (mctx, sctx)
1495 re_match_context_t *mctx;
1496 re_sift_context_t *sctx;
1498 reg_errcode_t err;
1499 int null_cnt = 0;
1500 int str_idx = sctx->last_str_idx;
1501 re_node_set cur_dest;
1503 #ifdef DEBUG
1504 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1505 #endif
1507 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1508 transit to the last_node and the last_node itself. */
1509 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1510 if (BE (err != REG_NOERROR, 0))
1511 return err;
1512 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1513 if (BE (err != REG_NOERROR, 0))
1514 goto free_return;
1516 /* Then check each states in the state_log. */
1517 while (str_idx > 0)
1519 /* Update counters. */
1520 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1521 if (null_cnt > mctx->max_mb_elem_len)
1523 memset (sctx->sifted_states, '\0',
1524 sizeof (re_dfastate_t *) * str_idx);
1525 re_node_set_free (&cur_dest);
1526 return REG_NOERROR;
1528 re_node_set_empty (&cur_dest);
1529 --str_idx;
1531 if (mctx->state_log[str_idx])
1533 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1534 if (BE (err != REG_NOERROR, 0))
1535 goto free_return;
1538 /* Add all the nodes which satisfy the following conditions:
1539 - It can epsilon transit to a node in CUR_DEST.
1540 - It is in CUR_SRC.
1541 And update state_log. */
1542 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1543 if (BE (err != REG_NOERROR, 0))
1544 goto free_return;
1546 err = REG_NOERROR;
1547 free_return:
1548 re_node_set_free (&cur_dest);
1549 return err;
1552 static reg_errcode_t
1553 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1554 re_match_context_t *mctx;
1555 re_sift_context_t *sctx;
1556 int str_idx;
1557 re_node_set *cur_dest;
1559 re_dfa_t *const dfa = mctx->dfa;
1560 re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
1561 int i;
1563 /* Then build the next sifted state.
1564 We build the next sifted state on `cur_dest', and update
1565 `sifted_states[str_idx]' with `cur_dest'.
1566 Note:
1567 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1568 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1569 for (i = 0; i < cur_src->nelem; i++)
1571 int prev_node = cur_src->elems[i];
1572 int naccepted = 0;
1573 re_token_type_t type = dfa->nodes[prev_node].type;
1574 int ret;
1576 if (IS_EPSILON_NODE (type))
1577 continue;
1578 #ifdef RE_ENABLE_I18N
1579 /* If the node may accept `multi byte'. */
1580 if (ACCEPT_MB_NODE (type))
1581 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1582 str_idx, sctx->last_str_idx);
1583 #endif /* RE_ENABLE_I18N */
1585 /* We don't check backreferences here.
1586 See update_cur_sifted_state(). */
1587 if (!naccepted
1588 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1589 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1590 dfa->nexts[prev_node]))
1591 naccepted = 1;
1593 if (naccepted == 0)
1594 continue;
1596 if (sctx->limits.nelem)
1598 int to_idx = str_idx + naccepted;
1599 if (check_dst_limits (mctx, &sctx->limits,
1600 dfa->nexts[prev_node], to_idx,
1601 prev_node, str_idx))
1602 continue;
1604 ret = re_node_set_insert (cur_dest, prev_node);
1605 if (BE (ret == -1, 0))
1606 return REG_ESPACE;
1609 return REG_NOERROR;
1612 /* Helper functions. */
1614 static reg_errcode_t
1615 clean_state_log_if_needed (mctx, next_state_log_idx)
1616 re_match_context_t *mctx;
1617 int next_state_log_idx;
1619 int top = mctx->state_log_top;
1621 if (next_state_log_idx >= mctx->input.bufs_len
1622 || (next_state_log_idx >= mctx->input.valid_len
1623 && mctx->input.valid_len < mctx->input.len))
1625 reg_errcode_t err;
1626 err = extend_buffers (mctx);
1627 if (BE (err != REG_NOERROR, 0))
1628 return err;
1631 if (top < next_state_log_idx)
1633 memset (mctx->state_log + top + 1, '\0',
1634 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1635 mctx->state_log_top = next_state_log_idx;
1637 return REG_NOERROR;
1640 static reg_errcode_t
1641 merge_state_array (dfa, dst, src, num)
1642 re_dfa_t *dfa;
1643 re_dfastate_t **dst;
1644 re_dfastate_t **src;
1645 int num;
1647 int st_idx;
1648 reg_errcode_t err;
1649 for (st_idx = 0; st_idx < num; ++st_idx)
1651 if (dst[st_idx] == NULL)
1652 dst[st_idx] = src[st_idx];
1653 else if (src[st_idx] != NULL)
1655 re_node_set merged_set;
1656 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1657 &src[st_idx]->nodes);
1658 if (BE (err != REG_NOERROR, 0))
1659 return err;
1660 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1661 re_node_set_free (&merged_set);
1662 if (BE (err != REG_NOERROR, 0))
1663 return err;
1666 return REG_NOERROR;
1669 static reg_errcode_t
1670 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1671 re_match_context_t *mctx;
1672 re_sift_context_t *sctx;
1673 int str_idx;
1674 re_node_set *dest_nodes;
1676 re_dfa_t *const dfa = mctx->dfa;
1677 reg_errcode_t err;
1678 const re_node_set *candidates;
1679 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1680 : &mctx->state_log[str_idx]->nodes);
1682 if (dest_nodes->nelem == 0)
1683 sctx->sifted_states[str_idx] = NULL;
1684 else
1686 if (candidates)
1688 /* At first, add the nodes which can epsilon transit to a node in
1689 DEST_NODE. */
1690 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1694 /* Then, check the limitations in the current sift_context. */
1695 if (sctx->limits.nelem)
1697 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1698 mctx->bkref_ents, str_idx);
1699 if (BE (err != REG_NOERROR, 0))
1700 return err;
1704 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1705 if (BE (err != REG_NOERROR, 0))
1706 return err;
1709 if (candidates && mctx->state_log[str_idx]->has_backref)
1711 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1712 if (BE (err != REG_NOERROR, 0))
1713 return err;
1715 return REG_NOERROR;
1718 static reg_errcode_t
1719 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1720 re_dfa_t *dfa;
1721 re_node_set *dest_nodes;
1722 const re_node_set *candidates;
1724 reg_errcode_t err;
1725 int src_idx;
1726 re_node_set src_copy;
1728 err = re_node_set_init_copy (&src_copy, dest_nodes);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1733 err = re_node_set_add_intersect (dest_nodes, candidates,
1734 dfa->inveclosures
1735 + src_copy.elems[src_idx]);
1736 if (BE (err != REG_NOERROR, 0))
1738 re_node_set_free (&src_copy);
1739 return err;
1742 re_node_set_free (&src_copy);
1743 return REG_NOERROR;
1746 static reg_errcode_t
1747 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1748 re_dfa_t *dfa;
1749 int node;
1750 re_node_set *dest_nodes;
1751 const re_node_set *candidates;
1753 int ecl_idx;
1754 reg_errcode_t err;
1755 re_node_set *inv_eclosure = dfa->inveclosures + node;
1756 re_node_set except_nodes;
1757 re_node_set_init_empty (&except_nodes);
1758 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1760 int cur_node = inv_eclosure->elems[ecl_idx];
1761 if (cur_node == node)
1762 continue;
1763 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1765 int edst1 = dfa->edests[cur_node].elems[0];
1766 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1767 ? dfa->edests[cur_node].elems[1] : -1);
1768 if ((!re_node_set_contains (inv_eclosure, edst1)
1769 && re_node_set_contains (dest_nodes, edst1))
1770 || (edst2 > 0
1771 && !re_node_set_contains (inv_eclosure, edst2)
1772 && re_node_set_contains (dest_nodes, edst2)))
1774 err = re_node_set_add_intersect (&except_nodes, candidates,
1775 dfa->inveclosures + cur_node);
1776 if (BE (err != REG_NOERROR, 0))
1778 re_node_set_free (&except_nodes);
1779 return err;
1784 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1786 int cur_node = inv_eclosure->elems[ecl_idx];
1787 if (!re_node_set_contains (&except_nodes, cur_node))
1789 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1790 re_node_set_remove_at (dest_nodes, idx);
1793 re_node_set_free (&except_nodes);
1794 return REG_NOERROR;
1797 static int
1798 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1799 re_match_context_t *mctx;
1800 re_node_set *limits;
1801 int dst_node, dst_idx, src_node, src_idx;
1803 re_dfa_t *const dfa = mctx->dfa;
1804 int lim_idx, src_pos, dst_pos;
1806 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1807 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1808 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1810 int subexp_idx;
1811 struct re_backref_cache_entry *ent;
1812 ent = mctx->bkref_ents + limits->elems[lim_idx];
1813 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1815 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1816 subexp_idx, dst_node, dst_idx,
1817 dst_bkref_idx);
1818 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1819 subexp_idx, src_node, src_idx,
1820 src_bkref_idx);
1822 /* In case of:
1823 <src> <dst> ( <subexp> )
1824 ( <subexp> ) <src> <dst>
1825 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1826 if (src_pos == dst_pos)
1827 continue; /* This is unrelated limitation. */
1828 else
1829 return 1;
1831 return 0;
1834 static int
1835 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1836 re_match_context_t *mctx;
1837 int boundaries, subexp_idx, from_node, bkref_idx;
1839 re_dfa_t *const dfa = mctx->dfa;
1840 re_node_set *eclosures = dfa->eclosures + from_node;
1841 int node_idx;
1843 /* Else, we are on the boundary: examine the nodes on the epsilon
1844 closure. */
1845 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1847 int node = eclosures->elems[node_idx];
1848 switch (dfa->nodes[node].type)
1850 case OP_BACK_REF:
1852 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1855 int dst, cpos;
1857 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1858 continue;
1860 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1861 OP_CLOSE_SUBEXP cases below. But, if the
1862 destination node is the same node as the source
1863 node, don't recurse because it would cause an
1864 infinite loop: a regex that exhibits this behavior
1865 is ()\1*\1* */
1866 dst = dfa->edests[node].elems[0];
1867 if (dst == from_node)
1869 if (boundaries & 1)
1870 return -1;
1871 else /* if (boundaries & 2) */
1872 return 0;
1875 cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
1876 subexp_idx, dst, bkref_idx);
1878 if (cpos == -1 && (boundaries & 1))
1879 return -1;
1881 if (cpos == 0 /* && (boundaries & 2) */)
1882 return 0;
1884 while (ent++->more);
1885 break;
1888 case OP_OPEN_SUBEXP:
1889 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1890 return -1;
1891 break;
1893 case OP_CLOSE_SUBEXP:
1894 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1895 return 0;
1896 break;
1898 default:
1899 break;
1903 return (boundaries & 2) ? 1 : 0;
1906 static int
1907 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1908 re_match_context_t *mctx;
1909 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1911 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1912 int boundaries;
1914 /* If we are outside the range of the subexpression, return -1 or 1. */
1915 if (str_idx < lim->subexp_from)
1916 return -1;
1918 if (lim->subexp_to < str_idx)
1919 return 1;
1921 /* If we are within the subexpression, return 0. */
1922 boundaries = (str_idx == lim->subexp_from);
1923 boundaries |= (str_idx == lim->subexp_to) << 1;
1924 if (boundaries == 0)
1925 return 0;
1927 /* Else, examine epsilon closure. */
1928 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1929 from_node, bkref_idx);
1932 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1933 which are against limitations from DEST_NODES. */
1935 static reg_errcode_t
1936 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1937 re_dfa_t *dfa;
1938 re_node_set *dest_nodes;
1939 const re_node_set *candidates;
1940 re_node_set *limits;
1941 struct re_backref_cache_entry *bkref_ents;
1942 int str_idx;
1944 reg_errcode_t err;
1945 int node_idx, lim_idx;
1947 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1949 int subexp_idx;
1950 struct re_backref_cache_entry *ent;
1951 ent = bkref_ents + limits->elems[lim_idx];
1953 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1954 continue; /* This is unrelated limitation. */
1956 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1957 if (ent->subexp_to == str_idx)
1959 int ops_node = -1;
1960 int cls_node = -1;
1961 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1963 int node = dest_nodes->elems[node_idx];
1964 re_token_type_t type = dfa->nodes[node].type;
1965 if (type == OP_OPEN_SUBEXP
1966 && subexp_idx == dfa->nodes[node].opr.idx)
1967 ops_node = node;
1968 else if (type == OP_CLOSE_SUBEXP
1969 && subexp_idx == dfa->nodes[node].opr.idx)
1970 cls_node = node;
1973 /* Check the limitation of the open subexpression. */
1974 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1975 if (ops_node >= 0)
1977 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1978 candidates);
1979 if (BE (err != REG_NOERROR, 0))
1980 return err;
1983 /* Check the limitation of the close subexpression. */
1984 if (cls_node >= 0)
1985 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1987 int node = dest_nodes->elems[node_idx];
1988 if (!re_node_set_contains (dfa->inveclosures + node,
1989 cls_node)
1990 && !re_node_set_contains (dfa->eclosures + node,
1991 cls_node))
1993 /* It is against this limitation.
1994 Remove it form the current sifted state. */
1995 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1996 candidates);
1997 if (BE (err != REG_NOERROR, 0))
1998 return err;
1999 --node_idx;
2003 else /* (ent->subexp_to != str_idx) */
2005 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2007 int node = dest_nodes->elems[node_idx];
2008 re_token_type_t type = dfa->nodes[node].type;
2009 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2011 if (subexp_idx != dfa->nodes[node].opr.idx)
2012 continue;
2013 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
2014 || (type == OP_OPEN_SUBEXP))
2016 /* It is against this limitation.
2017 Remove it form the current sifted state. */
2018 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2019 candidates);
2020 if (BE (err != REG_NOERROR, 0))
2021 return err;
2027 return REG_NOERROR;
2030 static reg_errcode_t
2031 sift_states_bkref (mctx, sctx, str_idx, candidates)
2032 re_match_context_t *mctx;
2033 re_sift_context_t *sctx;
2034 int str_idx;
2035 const re_node_set *candidates;
2037 re_dfa_t *const dfa = mctx->dfa;
2038 reg_errcode_t err;
2039 int node_idx, node;
2040 re_sift_context_t local_sctx;
2041 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2043 if (first_idx == -1)
2044 return REG_NOERROR;
2046 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2048 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2050 int enabled_idx;
2051 re_token_type_t type;
2052 struct re_backref_cache_entry *entry;
2053 node = candidates->elems[node_idx];
2054 type = dfa->nodes[node].type;
2055 /* Avoid infinite loop for the REs like "()\1+". */
2056 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2057 continue;
2058 if (type != OP_BACK_REF)
2059 continue;
2061 entry = mctx->bkref_ents + first_idx;
2062 enabled_idx = first_idx;
2065 int subexp_len, to_idx, dst_node;
2066 re_dfastate_t *cur_state;
2068 if (entry->node != node)
2069 continue;
2070 subexp_len = entry->subexp_to - entry->subexp_from;
2071 to_idx = str_idx + subexp_len;
2072 dst_node = (subexp_len ? dfa->nexts[node]
2073 : dfa->edests[node].elems[0]);
2075 if (to_idx > sctx->last_str_idx
2076 || sctx->sifted_states[to_idx] == NULL
2077 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2078 || check_dst_limits (mctx, &sctx->limits, node,
2079 str_idx, dst_node, to_idx))
2080 continue;
2082 if (local_sctx.sifted_states == NULL)
2084 local_sctx = *sctx;
2085 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2086 if (BE (err != REG_NOERROR, 0))
2087 goto free_return;
2089 local_sctx.last_node = node;
2090 local_sctx.last_str_idx = str_idx;
2091 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2092 if (BE (err < 0, 0))
2094 err = REG_ESPACE;
2095 goto free_return;
2097 cur_state = local_sctx.sifted_states[str_idx];
2098 err = sift_states_backward (mctx, &local_sctx);
2099 if (BE (err != REG_NOERROR, 0))
2100 goto free_return;
2101 if (sctx->limited_states != NULL)
2103 err = merge_state_array (dfa, sctx->limited_states,
2104 local_sctx.sifted_states,
2105 str_idx + 1);
2106 if (BE (err != REG_NOERROR, 0))
2107 goto free_return;
2109 local_sctx.sifted_states[str_idx] = cur_state;
2110 re_node_set_remove (&local_sctx.limits, enabled_idx);
2112 /* mctx->bkref_ents may have changed, reload the pointer. */
2113 entry = mctx->bkref_ents + enabled_idx;
2115 while (enabled_idx++, entry++->more);
2117 err = REG_NOERROR;
2118 free_return:
2119 if (local_sctx.sifted_states != NULL)
2121 re_node_set_free (&local_sctx.limits);
2124 return err;
2128 #ifdef RE_ENABLE_I18N
2129 static int
2130 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2131 const re_match_context_t *mctx;
2132 re_sift_context_t *sctx;
2133 int node_idx, str_idx, max_str_idx;
2135 re_dfa_t *const dfa = mctx->dfa;
2136 int naccepted;
2137 /* Check the node can accept `multi byte'. */
2138 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2139 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2140 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2141 dfa->nexts[node_idx]))
2142 /* The node can't accept the `multi byte', or the
2143 destination was already thrown away, then the node
2144 could't accept the current input `multi byte'. */
2145 naccepted = 0;
2146 /* Otherwise, it is sure that the node could accept
2147 `naccepted' bytes input. */
2148 return naccepted;
2150 #endif /* RE_ENABLE_I18N */
2153 /* Functions for state transition. */
2155 /* Return the next state to which the current state STATE will transit by
2156 accepting the current input byte, and update STATE_LOG if necessary.
2157 If STATE can accept a multibyte char/collating element/back reference
2158 update the destination of STATE_LOG. */
2160 static re_dfastate_t *
2161 transit_state (err, mctx, state)
2162 reg_errcode_t *err;
2163 re_match_context_t *mctx;
2164 re_dfastate_t *state;
2166 re_dfa_t *const dfa = mctx->dfa;
2167 re_dfastate_t **trtable;
2168 unsigned char ch;
2170 if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2171 || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2172 && mctx->input.valid_len < mctx->input.len))
2174 *err = extend_buffers (mctx);
2175 if (BE (*err != REG_NOERROR, 0))
2176 return NULL;
2179 #ifdef RE_ENABLE_I18N
2180 /* If the current state can accept multibyte. */
2181 if (state->accept_mb)
2183 *err = transit_state_mb (mctx, state);
2184 if (BE (*err != REG_NOERROR, 0))
2185 return NULL;
2187 #endif /* RE_ENABLE_I18N */
2189 /* Then decide the next state with the single byte. */
2190 if (1)
2192 /* Use transition table */
2193 ch = re_string_fetch_byte (&mctx->input);
2194 trtable = state->trtable;
2195 if (trtable == NULL)
2197 trtable = build_trtable (dfa, state);
2198 if (trtable == NULL)
2200 *err = REG_ESPACE;
2201 return NULL;
2204 if (BE (state->word_trtable, 0))
2206 unsigned int context;
2207 context
2208 = re_string_context_at (&mctx->input,
2209 re_string_cur_idx (&mctx->input) - 1,
2210 mctx->eflags);
2211 if (IS_WORD_CONTEXT (context))
2212 return trtable[ch + SBC_MAX];
2213 else
2214 return trtable[ch];
2216 else
2217 return trtable[ch];
2219 #if 0
2220 else
2221 /* don't use transition table */
2222 return transit_state_sb (err, mctx, state);
2223 #endif
2226 /* Update the state_log if we need */
2227 re_dfastate_t *
2228 merge_state_with_log (err, mctx, next_state)
2229 reg_errcode_t *err;
2230 re_match_context_t *mctx;
2231 re_dfastate_t *next_state;
2233 re_dfa_t *const dfa = mctx->dfa;
2234 int cur_idx = re_string_cur_idx (&mctx->input);
2236 if (cur_idx > mctx->state_log_top)
2238 mctx->state_log[cur_idx] = next_state;
2239 mctx->state_log_top = cur_idx;
2241 else if (mctx->state_log[cur_idx] == 0)
2243 mctx->state_log[cur_idx] = next_state;
2245 else
2247 re_dfastate_t *pstate;
2248 unsigned int context;
2249 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2250 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2251 the destination of a multibyte char/collating element/
2252 back reference. Then the next state is the union set of
2253 these destinations and the results of the transition table. */
2254 pstate = mctx->state_log[cur_idx];
2255 log_nodes = pstate->entrance_nodes;
2256 if (next_state != NULL)
2258 table_nodes = next_state->entrance_nodes;
2259 *err = re_node_set_init_union (&next_nodes, table_nodes,
2260 log_nodes);
2261 if (BE (*err != REG_NOERROR, 0))
2262 return NULL;
2264 else
2265 next_nodes = *log_nodes;
2266 /* Note: We already add the nodes of the initial state,
2267 then we don't need to add them here. */
2269 context = re_string_context_at (&mctx->input,
2270 re_string_cur_idx (&mctx->input) - 1,
2271 mctx->eflags);
2272 next_state = mctx->state_log[cur_idx]
2273 = re_acquire_state_context (err, dfa, &next_nodes, context);
2274 /* We don't need to check errors here, since the return value of
2275 this function is next_state and ERR is already set. */
2277 if (table_nodes != NULL)
2278 re_node_set_free (&next_nodes);
2281 if (BE (dfa->nbackref, 0) && next_state != NULL)
2283 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2284 later. We must check them here, since the back references in the
2285 next state might use them. */
2286 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2287 cur_idx);
2288 if (BE (*err != REG_NOERROR, 0))
2289 return NULL;
2291 /* If the next state has back references. */
2292 if (next_state->has_backref)
2294 *err = transit_state_bkref (mctx, &next_state->nodes);
2295 if (BE (*err != REG_NOERROR, 0))
2296 return NULL;
2297 next_state = mctx->state_log[cur_idx];
2301 return next_state;
2304 /* Skip bytes in the input that correspond to part of a
2305 multi-byte match, then look in the log for a state
2306 from which to restart matching. */
2307 re_dfastate_t *
2308 find_recover_state (err, mctx)
2309 reg_errcode_t *err;
2310 re_match_context_t *mctx;
2312 re_dfastate_t *cur_state = NULL;
2315 int max = mctx->state_log_top;
2316 int cur_str_idx = re_string_cur_idx (&mctx->input);
2320 if (++cur_str_idx > max)
2321 return NULL;
2322 re_string_skip_bytes (&mctx->input, 1);
2324 while (mctx->state_log[cur_str_idx] == NULL);
2326 cur_state = merge_state_with_log (err, mctx, NULL);
2328 while (err == REG_NOERROR && cur_state == NULL);
2329 return cur_state;
2332 /* Helper functions for transit_state. */
2334 /* From the node set CUR_NODES, pick up the nodes whose types are
2335 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2336 expression. And register them to use them later for evaluating the
2337 correspoding back references. */
2339 static reg_errcode_t
2340 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2341 re_match_context_t *mctx;
2342 re_node_set *cur_nodes;
2343 int str_idx;
2345 re_dfa_t *const dfa = mctx->dfa;
2346 int node_idx;
2347 reg_errcode_t err;
2349 /* TODO: This isn't efficient.
2350 Because there might be more than one nodes whose types are
2351 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2352 nodes.
2353 E.g. RE: (a){2} */
2354 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2356 int node = cur_nodes->elems[node_idx];
2357 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2358 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2359 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2361 err = match_ctx_add_subtop (mctx, node, str_idx);
2362 if (BE (err != REG_NOERROR, 0))
2363 return err;
2366 return REG_NOERROR;
2369 #if 0
2370 /* Return the next state to which the current state STATE will transit by
2371 accepting the current input byte. */
2373 static re_dfastate_t *
2374 transit_state_sb (err, mctx, state)
2375 reg_errcode_t *err;
2376 re_match_context_t *mctx;
2377 re_dfastate_t *state;
2379 re_dfa_t *const dfa = mctx->dfa;
2380 re_node_set next_nodes;
2381 re_dfastate_t *next_state;
2382 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2383 unsigned int context;
2385 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2386 if (BE (*err != REG_NOERROR, 0))
2387 return NULL;
2388 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2390 int cur_node = state->nodes.elems[node_cnt];
2391 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2393 *err = re_node_set_merge (&next_nodes,
2394 dfa->eclosures + dfa->nexts[cur_node]);
2395 if (BE (*err != REG_NOERROR, 0))
2397 re_node_set_free (&next_nodes);
2398 return NULL;
2402 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2403 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2404 /* We don't need to check errors here, since the return value of
2405 this function is next_state and ERR is already set. */
2407 re_node_set_free (&next_nodes);
2408 re_string_skip_bytes (&mctx->input, 1);
2409 return next_state;
2411 #endif
2413 #ifdef RE_ENABLE_I18N
2414 static reg_errcode_t
2415 transit_state_mb (mctx, pstate)
2416 re_match_context_t *mctx;
2417 re_dfastate_t *pstate;
2419 re_dfa_t *const dfa = mctx->dfa;
2420 reg_errcode_t err;
2421 int i;
2423 for (i = 0; i < pstate->nodes.nelem; ++i)
2425 re_node_set dest_nodes, *new_nodes;
2426 int cur_node_idx = pstate->nodes.elems[i];
2427 int naccepted = 0, dest_idx;
2428 unsigned int context;
2429 re_dfastate_t *dest_state;
2431 if (dfa->nodes[cur_node_idx].constraint)
2433 context = re_string_context_at (&mctx->input,
2434 re_string_cur_idx (&mctx->input),
2435 mctx->eflags);
2436 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2437 context))
2438 continue;
2441 /* How many bytes the node can accept? */
2442 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2443 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2444 re_string_cur_idx (&mctx->input));
2445 if (naccepted == 0)
2446 continue;
2448 /* The node can accepts `naccepted' bytes. */
2449 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2450 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2451 : mctx->max_mb_elem_len);
2452 err = clean_state_log_if_needed (mctx, dest_idx);
2453 if (BE (err != REG_NOERROR, 0))
2454 return err;
2455 #ifdef DEBUG
2456 assert (dfa->nexts[cur_node_idx] != -1);
2457 #endif
2458 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2459 then we use pstate->nodes.elems[i] instead. */
2460 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2462 dest_state = mctx->state_log[dest_idx];
2463 if (dest_state == NULL)
2464 dest_nodes = *new_nodes;
2465 else
2467 err = re_node_set_init_union (&dest_nodes,
2468 dest_state->entrance_nodes, new_nodes);
2469 if (BE (err != REG_NOERROR, 0))
2470 return err;
2472 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2473 mctx->state_log[dest_idx]
2474 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2475 if (dest_state != NULL)
2476 re_node_set_free (&dest_nodes);
2477 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2478 return err;
2480 return REG_NOERROR;
2482 #endif /* RE_ENABLE_I18N */
2484 static reg_errcode_t
2485 transit_state_bkref (mctx, nodes)
2486 re_match_context_t *mctx;
2487 const re_node_set *nodes;
2489 re_dfa_t *const dfa = mctx->dfa;
2490 reg_errcode_t err;
2491 int i;
2492 int cur_str_idx = re_string_cur_idx (&mctx->input);
2494 for (i = 0; i < nodes->nelem; ++i)
2496 int dest_str_idx, prev_nelem, bkc_idx;
2497 int node_idx = nodes->elems[i];
2498 unsigned int context;
2499 const re_token_t *node = dfa->nodes + node_idx;
2500 re_node_set *new_dest_nodes;
2502 /* Check whether `node' is a backreference or not. */
2503 if (node->type != OP_BACK_REF)
2504 continue;
2506 if (node->constraint)
2508 context = re_string_context_at (&mctx->input, cur_str_idx,
2509 mctx->eflags);
2510 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2511 continue;
2514 /* `node' is a backreference.
2515 Check the substring which the substring matched. */
2516 bkc_idx = mctx->nbkref_ents;
2517 err = get_subexp (mctx, node_idx, cur_str_idx);
2518 if (BE (err != REG_NOERROR, 0))
2519 goto free_return;
2521 /* And add the epsilon closures (which is `new_dest_nodes') of
2522 the backreference to appropriate state_log. */
2523 #ifdef DEBUG
2524 assert (dfa->nexts[node_idx] != -1);
2525 #endif
2526 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2528 int subexp_len;
2529 re_dfastate_t *dest_state;
2530 struct re_backref_cache_entry *bkref_ent;
2531 bkref_ent = mctx->bkref_ents + bkc_idx;
2532 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2533 continue;
2534 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2535 new_dest_nodes = (subexp_len == 0
2536 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2537 : dfa->eclosures + dfa->nexts[node_idx]);
2538 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2539 - bkref_ent->subexp_from);
2540 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2541 mctx->eflags);
2542 dest_state = mctx->state_log[dest_str_idx];
2543 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2544 : mctx->state_log[cur_str_idx]->nodes.nelem);
2545 /* Add `new_dest_node' to state_log. */
2546 if (dest_state == NULL)
2548 mctx->state_log[dest_str_idx]
2549 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2550 context);
2551 if (BE (mctx->state_log[dest_str_idx] == NULL
2552 && err != REG_NOERROR, 0))
2553 goto free_return;
2555 else
2557 re_node_set dest_nodes;
2558 err = re_node_set_init_union (&dest_nodes,
2559 dest_state->entrance_nodes,
2560 new_dest_nodes);
2561 if (BE (err != REG_NOERROR, 0))
2563 re_node_set_free (&dest_nodes);
2564 goto free_return;
2566 mctx->state_log[dest_str_idx]
2567 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2568 re_node_set_free (&dest_nodes);
2569 if (BE (mctx->state_log[dest_str_idx] == NULL
2570 && err != REG_NOERROR, 0))
2571 goto free_return;
2573 /* We need to check recursively if the backreference can epsilon
2574 transit. */
2575 if (subexp_len == 0
2576 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2578 err = check_subexp_matching_top (mctx, new_dest_nodes,
2579 cur_str_idx);
2580 if (BE (err != REG_NOERROR, 0))
2581 goto free_return;
2582 err = transit_state_bkref (mctx, new_dest_nodes);
2583 if (BE (err != REG_NOERROR, 0))
2584 goto free_return;
2588 err = REG_NOERROR;
2589 free_return:
2590 return err;
2593 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2594 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2595 Note that we might collect inappropriate candidates here.
2596 However, the cost of checking them strictly here is too high, then we
2597 delay these checking for prune_impossible_nodes(). */
2599 static reg_errcode_t
2600 get_subexp (mctx, bkref_node, bkref_str_idx)
2601 re_match_context_t *mctx;
2602 int bkref_node, bkref_str_idx;
2604 re_dfa_t *const dfa = mctx->dfa;
2605 int subexp_num, sub_top_idx;
2606 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2607 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2608 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2609 if (cache_idx != -1)
2611 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2613 if (entry->node == bkref_node)
2614 return REG_NOERROR; /* We already checked it. */
2615 while (entry++->more);
2618 subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2620 /* For each sub expression */
2621 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2623 reg_errcode_t err;
2624 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2625 re_sub_match_last_t *sub_last;
2626 int sub_last_idx, sl_str, bkref_str_off;
2628 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2629 continue; /* It isn't related. */
2631 sl_str = sub_top->str_idx;
2632 bkref_str_off = bkref_str_idx;
2633 /* At first, check the last node of sub expressions we already
2634 evaluated. */
2635 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2637 int sl_str_diff;
2638 sub_last = sub_top->lasts[sub_last_idx];
2639 sl_str_diff = sub_last->str_idx - sl_str;
2640 /* The matched string by the sub expression match with the substring
2641 at the back reference? */
2642 if (sl_str_diff > 0)
2644 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2646 /* Not enough chars for a successful match. */
2647 if (bkref_str_off + sl_str_diff > mctx->input.len)
2648 break;
2650 err = clean_state_log_if_needed (mctx,
2651 bkref_str_off
2652 + sl_str_diff);
2653 if (BE (err != REG_NOERROR, 0))
2654 return err;
2655 buf = (const char *) re_string_get_buffer (&mctx->input);
2657 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2658 break; /* We don't need to search this sub expression any more. */
2660 bkref_str_off += sl_str_diff;
2661 sl_str += sl_str_diff;
2662 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2663 bkref_str_idx);
2665 /* Reload buf, since the preceding call might have reallocated
2666 the buffer. */
2667 buf = (const char *) re_string_get_buffer (&mctx->input);
2669 if (err == REG_NOMATCH)
2670 continue;
2671 if (BE (err != REG_NOERROR, 0))
2672 return err;
2675 if (sub_last_idx < sub_top->nlasts)
2676 continue;
2677 if (sub_last_idx > 0)
2678 ++sl_str;
2679 /* Then, search for the other last nodes of the sub expression. */
2680 for (; sl_str <= bkref_str_idx; ++sl_str)
2682 int cls_node, sl_str_off;
2683 const re_node_set *nodes;
2684 sl_str_off = sl_str - sub_top->str_idx;
2685 /* The matched string by the sub expression match with the substring
2686 at the back reference? */
2687 if (sl_str_off > 0)
2689 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2691 /* If we are at the end of the input, we cannot match. */
2692 if (bkref_str_off >= mctx->input.len)
2693 break;
2695 err = extend_buffers (mctx);
2696 if (BE (err != REG_NOERROR, 0))
2697 return err;
2699 buf = (const char *) re_string_get_buffer (&mctx->input);
2701 if (buf [bkref_str_off++] != buf[sl_str - 1])
2702 break; /* We don't need to search this sub expression
2703 any more. */
2705 if (mctx->state_log[sl_str] == NULL)
2706 continue;
2707 /* Does this state have a ')' of the sub expression? */
2708 nodes = &mctx->state_log[sl_str]->nodes;
2709 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2710 if (cls_node == -1)
2711 continue; /* No. */
2712 if (sub_top->path == NULL)
2714 sub_top->path = calloc (sizeof (state_array_t),
2715 sl_str - sub_top->str_idx + 1);
2716 if (sub_top->path == NULL)
2717 return REG_ESPACE;
2719 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2720 in the current context? */
2721 err = check_arrival (mctx, sub_top->path, sub_top->node,
2722 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2723 if (err == REG_NOMATCH)
2724 continue;
2725 if (BE (err != REG_NOERROR, 0))
2726 return err;
2727 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2728 if (BE (sub_last == NULL, 0))
2729 return REG_ESPACE;
2730 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2731 bkref_str_idx);
2732 if (err == REG_NOMATCH)
2733 continue;
2736 return REG_NOERROR;
2739 /* Helper functions for get_subexp(). */
2741 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2742 If it can arrive, register the sub expression expressed with SUB_TOP
2743 and SUB_LAST. */
2745 static reg_errcode_t
2746 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2747 re_match_context_t *mctx;
2748 const re_sub_match_top_t *sub_top;
2749 re_sub_match_last_t *sub_last;
2750 int bkref_node, bkref_str;
2752 reg_errcode_t err;
2753 int to_idx;
2754 /* Can the subexpression arrive the back reference? */
2755 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2756 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2757 if (err != REG_NOERROR)
2758 return err;
2759 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2760 sub_last->str_idx);
2761 if (BE (err != REG_NOERROR, 0))
2762 return err;
2763 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2764 return clean_state_log_if_needed (mctx, to_idx);
2767 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2768 Search '(' if FL_OPEN, or search ')' otherwise.
2769 TODO: This function isn't efficient...
2770 Because there might be more than one nodes whose types are
2771 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2772 nodes.
2773 E.g. RE: (a){2} */
2775 static int
2776 find_subexp_node (dfa, nodes, subexp_idx, type)
2777 const re_dfa_t *dfa;
2778 const re_node_set *nodes;
2779 int subexp_idx, type;
2781 int cls_idx;
2782 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2784 int cls_node = nodes->elems[cls_idx];
2785 const re_token_t *node = dfa->nodes + cls_node;
2786 if (node->type == type
2787 && node->opr.idx == subexp_idx)
2788 return cls_node;
2790 return -1;
2793 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2794 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2795 heavily reused.
2796 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2798 static reg_errcode_t
2799 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2800 type)
2801 re_match_context_t *mctx;
2802 state_array_t *path;
2803 int top_node, top_str, last_node, last_str, type;
2805 re_dfa_t *const dfa = mctx->dfa;
2806 reg_errcode_t err;
2807 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2808 re_dfastate_t *cur_state = NULL;
2809 re_node_set *cur_nodes, next_nodes;
2810 re_dfastate_t **backup_state_log;
2811 unsigned int context;
2813 subexp_num = dfa->nodes[top_node].opr.idx;
2814 /* Extend the buffer if we need. */
2815 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2817 re_dfastate_t **new_array;
2818 int old_alloc = path->alloc;
2819 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2820 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2821 if (new_array == NULL)
2823 path->alloc = old_alloc;
2824 return REG_ESPACE;
2826 path->array = new_array;
2827 memset (new_array + old_alloc, '\0',
2828 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2831 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2833 /* Temporary modify MCTX. */
2834 backup_state_log = mctx->state_log;
2835 backup_cur_idx = mctx->input.cur_idx;
2836 mctx->state_log = path->array;
2837 mctx->input.cur_idx = str_idx;
2839 /* Setup initial node set. */
2840 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2841 if (str_idx == top_str)
2843 err = re_node_set_init_1 (&next_nodes, top_node);
2844 if (BE (err != REG_NOERROR, 0))
2845 return err;
2846 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2847 if (BE (err != REG_NOERROR, 0))
2849 re_node_set_free (&next_nodes);
2850 return err;
2853 else
2855 cur_state = mctx->state_log[str_idx];
2856 if (cur_state && cur_state->has_backref)
2858 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2859 if (BE ( err != REG_NOERROR, 0))
2860 return err;
2862 else
2863 re_node_set_init_empty (&next_nodes);
2865 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2867 if (next_nodes.nelem)
2869 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2870 subexp_num, type);
2871 if (BE ( err != REG_NOERROR, 0))
2873 re_node_set_free (&next_nodes);
2874 return err;
2877 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2878 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2880 re_node_set_free (&next_nodes);
2881 return err;
2883 mctx->state_log[str_idx] = cur_state;
2886 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2888 re_node_set_empty (&next_nodes);
2889 if (mctx->state_log[str_idx + 1])
2891 err = re_node_set_merge (&next_nodes,
2892 &mctx->state_log[str_idx + 1]->nodes);
2893 if (BE (err != REG_NOERROR, 0))
2895 re_node_set_free (&next_nodes);
2896 return err;
2899 if (cur_state)
2901 err = check_arrival_add_next_nodes (mctx, str_idx,
2902 &cur_state->nodes, &next_nodes);
2903 if (BE (err != REG_NOERROR, 0))
2905 re_node_set_free (&next_nodes);
2906 return err;
2909 ++str_idx;
2910 if (next_nodes.nelem)
2912 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913 if (BE (err != REG_NOERROR, 0))
2915 re_node_set_free (&next_nodes);
2916 return err;
2918 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2919 subexp_num, type);
2920 if (BE ( err != REG_NOERROR, 0))
2922 re_node_set_free (&next_nodes);
2923 return err;
2926 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2927 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2928 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2930 re_node_set_free (&next_nodes);
2931 return err;
2933 mctx->state_log[str_idx] = cur_state;
2934 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2936 re_node_set_free (&next_nodes);
2937 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2938 : &mctx->state_log[last_str]->nodes);
2939 path->next_idx = str_idx;
2941 /* Fix MCTX. */
2942 mctx->state_log = backup_state_log;
2943 mctx->input.cur_idx = backup_cur_idx;
2945 /* Then check the current node set has the node LAST_NODE. */
2946 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2947 return REG_NOERROR;
2949 return REG_NOMATCH;
2952 /* Helper functions for check_arrival. */
2954 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2955 to NEXT_NODES.
2956 TODO: This function is similar to the functions transit_state*(),
2957 however this function has many additional works.
2958 Can't we unify them? */
2960 static reg_errcode_t
2961 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2962 re_match_context_t *mctx;
2963 int str_idx;
2964 re_node_set *cur_nodes, *next_nodes;
2966 re_dfa_t *const dfa = mctx->dfa;
2967 int result;
2968 int cur_idx;
2969 reg_errcode_t err;
2970 re_node_set union_set;
2971 re_node_set_init_empty (&union_set);
2972 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2974 int naccepted = 0;
2975 int cur_node = cur_nodes->elems[cur_idx];
2976 re_token_type_t type = dfa->nodes[cur_node].type;
2977 if (IS_EPSILON_NODE (type))
2978 continue;
2979 #ifdef RE_ENABLE_I18N
2980 /* If the node may accept `multi byte'. */
2981 if (ACCEPT_MB_NODE (type))
2983 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2984 str_idx);
2985 if (naccepted > 1)
2987 re_dfastate_t *dest_state;
2988 int next_node = dfa->nexts[cur_node];
2989 int next_idx = str_idx + naccepted;
2990 dest_state = mctx->state_log[next_idx];
2991 re_node_set_empty (&union_set);
2992 if (dest_state)
2994 err = re_node_set_merge (&union_set, &dest_state->nodes);
2995 if (BE (err != REG_NOERROR, 0))
2997 re_node_set_free (&union_set);
2998 return err;
3001 result = re_node_set_insert (&union_set, next_node);
3002 if (BE (result < 0, 0))
3004 re_node_set_free (&union_set);
3005 return REG_ESPACE;
3007 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3008 &union_set);
3009 if (BE (mctx->state_log[next_idx] == NULL
3010 && err != REG_NOERROR, 0))
3012 re_node_set_free (&union_set);
3013 return err;
3017 #endif /* RE_ENABLE_I18N */
3018 if (naccepted
3019 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3021 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3022 if (BE (result < 0, 0))
3024 re_node_set_free (&union_set);
3025 return REG_ESPACE;
3029 re_node_set_free (&union_set);
3030 return REG_NOERROR;
3033 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3034 CUR_NODES, however exclude the nodes which are:
3035 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3036 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3039 static reg_errcode_t
3040 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3041 re_dfa_t *dfa;
3042 re_node_set *cur_nodes;
3043 int ex_subexp, type;
3045 reg_errcode_t err;
3046 int idx, outside_node;
3047 re_node_set new_nodes;
3048 #ifdef DEBUG
3049 assert (cur_nodes->nelem);
3050 #endif
3051 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3052 if (BE (err != REG_NOERROR, 0))
3053 return err;
3054 /* Create a new node set NEW_NODES with the nodes which are epsilon
3055 closures of the node in CUR_NODES. */
3057 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3059 int cur_node = cur_nodes->elems[idx];
3060 re_node_set *eclosure = dfa->eclosures + cur_node;
3061 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3062 if (outside_node == -1)
3064 /* There are no problematic nodes, just merge them. */
3065 err = re_node_set_merge (&new_nodes, eclosure);
3066 if (BE (err != REG_NOERROR, 0))
3068 re_node_set_free (&new_nodes);
3069 return err;
3072 else
3074 /* There are problematic nodes, re-calculate incrementally. */
3075 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3076 ex_subexp, type);
3077 if (BE (err != REG_NOERROR, 0))
3079 re_node_set_free (&new_nodes);
3080 return err;
3084 re_node_set_free (cur_nodes);
3085 *cur_nodes = new_nodes;
3086 return REG_NOERROR;
3089 /* Helper function for check_arrival_expand_ecl.
3090 Check incrementally the epsilon closure of TARGET, and if it isn't
3091 problematic append it to DST_NODES. */
3093 static reg_errcode_t
3094 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3095 re_dfa_t *dfa;
3096 int target, ex_subexp, type;
3097 re_node_set *dst_nodes;
3099 int cur_node;
3100 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3102 int err;
3104 if (dfa->nodes[cur_node].type == type
3105 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3107 if (type == OP_CLOSE_SUBEXP)
3109 err = re_node_set_insert (dst_nodes, cur_node);
3110 if (BE (err == -1, 0))
3111 return REG_ESPACE;
3113 break;
3115 err = re_node_set_insert (dst_nodes, cur_node);
3116 if (BE (err == -1, 0))
3117 return REG_ESPACE;
3118 if (dfa->edests[cur_node].nelem == 0)
3119 break;
3120 if (dfa->edests[cur_node].nelem == 2)
3122 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3123 dfa->edests[cur_node].elems[1],
3124 ex_subexp, type);
3125 if (BE (err != REG_NOERROR, 0))
3126 return err;
3128 cur_node = dfa->edests[cur_node].elems[0];
3130 return REG_NOERROR;
3134 /* For all the back references in the current state, calculate the
3135 destination of the back references by the appropriate entry
3136 in MCTX->BKREF_ENTS. */
3138 static reg_errcode_t
3139 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3140 type)
3141 re_match_context_t *mctx;
3142 int cur_str, subexp_num, type;
3143 re_node_set *cur_nodes;
3145 re_dfa_t *const dfa = mctx->dfa;
3146 reg_errcode_t err;
3147 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3148 struct re_backref_cache_entry *ent;
3150 if (cache_idx_start == -1)
3151 return REG_NOERROR;
3153 restart:
3154 ent = mctx->bkref_ents + cache_idx_start;
3157 int to_idx, next_node;
3159 /* Is this entry ENT is appropriate? */
3160 if (!re_node_set_contains (cur_nodes, ent->node))
3161 continue; /* No. */
3163 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3164 /* Calculate the destination of the back reference, and append it
3165 to MCTX->STATE_LOG. */
3166 if (to_idx == cur_str)
3168 /* The backreference did epsilon transit, we must re-check all the
3169 node in the current state. */
3170 re_node_set new_dests;
3171 reg_errcode_t err2, err3;
3172 next_node = dfa->edests[ent->node].elems[0];
3173 if (re_node_set_contains (cur_nodes, next_node))
3174 continue;
3175 err = re_node_set_init_1 (&new_dests, next_node);
3176 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3177 err3 = re_node_set_merge (cur_nodes, &new_dests);
3178 re_node_set_free (&new_dests);
3179 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3180 || err3 != REG_NOERROR, 0))
3182 err = (err != REG_NOERROR ? err
3183 : (err2 != REG_NOERROR ? err2 : err3));
3184 return err;
3186 /* TODO: It is still inefficient... */
3187 goto restart;
3189 else
3191 re_node_set union_set;
3192 next_node = dfa->nexts[ent->node];
3193 if (mctx->state_log[to_idx])
3195 int ret;
3196 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3197 next_node))
3198 continue;
3199 err = re_node_set_init_copy (&union_set,
3200 &mctx->state_log[to_idx]->nodes);
3201 ret = re_node_set_insert (&union_set, next_node);
3202 if (BE (err != REG_NOERROR || ret < 0, 0))
3204 re_node_set_free (&union_set);
3205 err = err != REG_NOERROR ? err : REG_ESPACE;
3206 return err;
3209 else
3211 err = re_node_set_init_1 (&union_set, next_node);
3212 if (BE (err != REG_NOERROR, 0))
3213 return err;
3215 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3216 re_node_set_free (&union_set);
3217 if (BE (mctx->state_log[to_idx] == NULL
3218 && err != REG_NOERROR, 0))
3219 return err;
3222 while (ent++->more);
3223 return REG_NOERROR;
3226 /* Build transition table for the state.
3227 Return the new table if succeeded, otherwise return NULL. */
3229 static re_dfastate_t **
3230 build_trtable (dfa, state)
3231 re_dfa_t *dfa;
3232 re_dfastate_t *state;
3234 reg_errcode_t err;
3235 int i, j, ch;
3236 unsigned int elem, mask;
3237 int dests_node_malloced = 0, dest_states_malloced = 0;
3238 int ndests; /* Number of the destination states from `state'. */
3239 re_dfastate_t **trtable;
3240 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3241 re_node_set follows, *dests_node;
3242 bitset *dests_ch;
3243 bitset acceptable;
3245 /* We build DFA states which corresponds to the destination nodes
3246 from `state'. `dests_node[i]' represents the nodes which i-th
3247 destination state contains, and `dests_ch[i]' represents the
3248 characters which i-th destination state accepts. */
3249 #ifdef _LIBC
3250 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3251 dests_node = (re_node_set *)
3252 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3253 else
3254 #endif
3256 dests_node = (re_node_set *)
3257 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3258 if (BE (dests_node == NULL, 0))
3259 return NULL;
3260 dests_node_malloced = 1;
3262 dests_ch = (bitset *) (dests_node + SBC_MAX);
3264 /* Initialize transiton table. */
3265 state->word_trtable = 0;
3267 /* At first, group all nodes belonging to `state' into several
3268 destinations. */
3269 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3270 if (BE (ndests <= 0, 0))
3272 if (dests_node_malloced)
3273 free (dests_node);
3274 /* Return NULL in case of an error, trtable otherwise. */
3275 if (ndests == 0)
3277 state->trtable = (re_dfastate_t **)
3278 calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3279 return state->trtable;
3281 return NULL;
3284 err = re_node_set_alloc (&follows, ndests + 1);
3285 if (BE (err != REG_NOERROR, 0))
3286 goto out_free;
3288 #ifdef _LIBC
3289 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3290 + ndests * 3 * sizeof (re_dfastate_t *)))
3291 dest_states = (re_dfastate_t **)
3292 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3293 else
3294 #endif
3296 dest_states = (re_dfastate_t **)
3297 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3298 if (BE (dest_states == NULL, 0))
3300 out_free:
3301 if (dest_states_malloced)
3302 free (dest_states);
3303 re_node_set_free (&follows);
3304 for (i = 0; i < ndests; ++i)
3305 re_node_set_free (dests_node + i);
3306 if (dests_node_malloced)
3307 free (dests_node);
3308 return NULL;
3310 dest_states_malloced = 1;
3312 dest_states_word = dest_states + ndests;
3313 dest_states_nl = dest_states_word + ndests;
3314 bitset_empty (acceptable);
3316 /* Then build the states for all destinations. */
3317 for (i = 0; i < ndests; ++i)
3319 int next_node;
3320 re_node_set_empty (&follows);
3321 /* Merge the follows of this destination states. */
3322 for (j = 0; j < dests_node[i].nelem; ++j)
3324 next_node = dfa->nexts[dests_node[i].elems[j]];
3325 if (next_node != -1)
3327 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3328 if (BE (err != REG_NOERROR, 0))
3329 goto out_free;
3332 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3333 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3334 goto out_free;
3335 /* If the new state has context constraint,
3336 build appropriate states for these contexts. */
3337 if (dest_states[i]->has_constraint)
3339 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3340 CONTEXT_WORD);
3341 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3342 goto out_free;
3344 if (dest_states[i] != dest_states_word[i]
3345 && dfa->mb_cur_max > 1)
3346 state->word_trtable = 1;
3348 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3349 CONTEXT_NEWLINE);
3350 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3351 goto out_free;
3353 else
3355 dest_states_word[i] = dest_states[i];
3356 dest_states_nl[i] = dest_states[i];
3358 bitset_merge (acceptable, dests_ch[i]);
3361 if (!BE (state->word_trtable, 0))
3363 /* We don't care about whether the following character is a word
3364 character, or we are in a single-byte character set so we can
3365 discern by looking at the character code: allocate a
3366 256-entry transition table. */
3367 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3368 if (BE (trtable == NULL, 0))
3369 goto out_free;
3371 /* For all characters ch...: */
3372 for (i = 0; i < BITSET_UINTS; ++i)
3373 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3374 elem;
3375 mask <<= 1, elem >>= 1, ++ch)
3376 if (BE (elem & 1, 0))
3378 /* There must be exactly one destination which accepts
3379 character ch. See group_nodes_into_DFAstates. */
3380 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3383 /* j-th destination accepts the word character ch. */
3384 if (dfa->word_char[i] & mask)
3385 trtable[ch] = dest_states_word[j];
3386 else
3387 trtable[ch] = dest_states[j];
3390 else
3392 /* We care about whether the following character is a word
3393 character, and we are in a multi-byte character set: discern
3394 by looking at the character code: build two 256-entry
3395 transition tables, one starting at trtable[0] and one
3396 starting at trtable[SBC_MAX]. */
3397 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3398 2 * SBC_MAX);
3399 if (BE (trtable == NULL, 0))
3400 goto out_free;
3402 /* For all characters ch...: */
3403 for (i = 0; i < BITSET_UINTS; ++i)
3404 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3405 elem;
3406 mask <<= 1, elem >>= 1, ++ch)
3407 if (BE (elem & 1, 0))
3409 /* There must be exactly one destination which accepts
3410 character ch. See group_nodes_into_DFAstates. */
3411 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3414 /* j-th destination accepts the word character ch. */
3415 trtable[ch] = dest_states[j];
3416 trtable[ch + SBC_MAX] = dest_states_word[j];
3420 /* new line */
3421 if (bitset_contain (acceptable, NEWLINE_CHAR))
3423 /* The current state accepts newline character. */
3424 for (j = 0; j < ndests; ++j)
3425 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3427 /* k-th destination accepts newline character. */
3428 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3429 if (state->word_trtable)
3430 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3431 /* There must be only one destination which accepts
3432 newline. See group_nodes_into_DFAstates. */
3433 break;
3437 if (dest_states_malloced)
3438 free (dest_states);
3440 re_node_set_free (&follows);
3441 for (i = 0; i < ndests; ++i)
3442 re_node_set_free (dests_node + i);
3444 if (dests_node_malloced)
3445 free (dests_node);
3447 state->trtable = trtable;
3448 return trtable;
3451 /* Group all nodes belonging to STATE into several destinations.
3452 Then for all destinations, set the nodes belonging to the destination
3453 to DESTS_NODE[i] and set the characters accepted by the destination
3454 to DEST_CH[i]. This function return the number of destinations. */
3456 static int
3457 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3458 re_dfa_t *dfa;
3459 const re_dfastate_t *state;
3460 re_node_set *dests_node;
3461 bitset *dests_ch;
3463 reg_errcode_t err;
3464 int result;
3465 int i, j, k;
3466 int ndests; /* Number of the destinations from `state'. */
3467 bitset accepts; /* Characters a node can accept. */
3468 const re_node_set *cur_nodes = &state->nodes;
3469 bitset_empty (accepts);
3470 ndests = 0;
3472 /* For all the nodes belonging to `state', */
3473 for (i = 0; i < cur_nodes->nelem; ++i)
3475 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3476 re_token_type_t type = node->type;
3477 unsigned int constraint = node->constraint;
3479 /* Enumerate all single byte character this node can accept. */
3480 if (type == CHARACTER)
3481 bitset_set (accepts, node->opr.c);
3482 else if (type == SIMPLE_BRACKET)
3484 bitset_merge (accepts, node->opr.sbcset);
3486 else if (type == OP_PERIOD)
3488 #ifdef RE_ENABLE_I18N
3489 if (dfa->mb_cur_max > 1)
3490 bitset_merge (accepts, dfa->sb_char);
3491 else
3492 #endif
3493 bitset_set_all (accepts);
3494 if (!(dfa->syntax & RE_DOT_NEWLINE))
3495 bitset_clear (accepts, '\n');
3496 if (dfa->syntax & RE_DOT_NOT_NULL)
3497 bitset_clear (accepts, '\0');
3499 #ifdef RE_ENABLE_I18N
3500 else if (type == OP_UTF8_PERIOD)
3502 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3503 if (!(dfa->syntax & RE_DOT_NEWLINE))
3504 bitset_clear (accepts, '\n');
3505 if (dfa->syntax & RE_DOT_NOT_NULL)
3506 bitset_clear (accepts, '\0');
3508 #endif
3509 else
3510 continue;
3512 /* Check the `accepts' and sift the characters which are not
3513 match it the context. */
3514 if (constraint)
3516 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3518 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3519 bitset_empty (accepts);
3520 if (accepts_newline)
3521 bitset_set (accepts, NEWLINE_CHAR);
3522 else
3523 continue;
3525 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3527 bitset_empty (accepts);
3528 continue;
3531 if (constraint & NEXT_WORD_CONSTRAINT)
3533 unsigned int any_set = 0;
3534 if (type == CHARACTER && !node->word_char)
3536 bitset_empty (accepts);
3537 continue;
3539 #ifdef RE_ENABLE_I18N
3540 if (dfa->mb_cur_max > 1)
3541 for (j = 0; j < BITSET_UINTS; ++j)
3542 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3543 else
3544 #endif
3545 for (j = 0; j < BITSET_UINTS; ++j)
3546 any_set |= (accepts[j] &= dfa->word_char[j]);
3547 if (!any_set)
3548 continue;
3550 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3552 unsigned int any_set = 0;
3553 if (type == CHARACTER && node->word_char)
3555 bitset_empty (accepts);
3556 continue;
3558 #ifdef RE_ENABLE_I18N
3559 if (dfa->mb_cur_max > 1)
3560 for (j = 0; j < BITSET_UINTS; ++j)
3561 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3562 else
3563 #endif
3564 for (j = 0; j < BITSET_UINTS; ++j)
3565 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3566 if (!any_set)
3567 continue;
3571 /* Then divide `accepts' into DFA states, or create a new
3572 state. Above, we make sure that accepts is not empty. */
3573 for (j = 0; j < ndests; ++j)
3575 bitset intersec; /* Intersection sets, see below. */
3576 bitset remains;
3577 /* Flags, see below. */
3578 int has_intersec, not_subset, not_consumed;
3580 /* Optimization, skip if this state doesn't accept the character. */
3581 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3582 continue;
3584 /* Enumerate the intersection set of this state and `accepts'. */
3585 has_intersec = 0;
3586 for (k = 0; k < BITSET_UINTS; ++k)
3587 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3588 /* And skip if the intersection set is empty. */
3589 if (!has_intersec)
3590 continue;
3592 /* Then check if this state is a subset of `accepts'. */
3593 not_subset = not_consumed = 0;
3594 for (k = 0; k < BITSET_UINTS; ++k)
3596 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3597 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3600 /* If this state isn't a subset of `accepts', create a
3601 new group state, which has the `remains'. */
3602 if (not_subset)
3604 bitset_copy (dests_ch[ndests], remains);
3605 bitset_copy (dests_ch[j], intersec);
3606 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3607 if (BE (err != REG_NOERROR, 0))
3608 goto error_return;
3609 ++ndests;
3612 /* Put the position in the current group. */
3613 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3614 if (BE (result < 0, 0))
3615 goto error_return;
3617 /* If all characters are consumed, go to next node. */
3618 if (!not_consumed)
3619 break;
3621 /* Some characters remain, create a new group. */
3622 if (j == ndests)
3624 bitset_copy (dests_ch[ndests], accepts);
3625 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3626 if (BE (err != REG_NOERROR, 0))
3627 goto error_return;
3628 ++ndests;
3629 bitset_empty (accepts);
3632 return ndests;
3633 error_return:
3634 for (j = 0; j < ndests; ++j)
3635 re_node_set_free (dests_node + j);
3636 return -1;
3639 #ifdef RE_ENABLE_I18N
3640 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3641 Return the number of the bytes the node accepts.
3642 STR_IDX is the current index of the input string.
3644 This function handles the nodes which can accept one character, or
3645 one collating element like '.', '[a-z]', opposite to the other nodes
3646 can only accept one byte. */
3648 static int
3649 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3650 re_dfa_t *dfa;
3651 int node_idx, str_idx;
3652 const re_string_t *input;
3654 const re_token_t *node = dfa->nodes + node_idx;
3655 int char_len, elem_len;
3656 int i;
3658 if (BE (node->type == OP_UTF8_PERIOD, 0))
3660 unsigned char c = re_string_byte_at (input, str_idx), d;
3661 if (BE (c < 0xc2, 1))
3662 return 0;
3664 if (str_idx + 2 > input->len)
3665 return 0;
3667 d = re_string_byte_at (input, str_idx + 1);
3668 if (c < 0xe0)
3669 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3670 else if (c < 0xf0)
3672 char_len = 3;
3673 if (c == 0xe0 && d < 0xa0)
3674 return 0;
3676 else if (c < 0xf8)
3678 char_len = 4;
3679 if (c == 0xf0 && d < 0x90)
3680 return 0;
3682 else if (c < 0xfc)
3684 char_len = 5;
3685 if (c == 0xf8 && d < 0x88)
3686 return 0;
3688 else if (c < 0xfe)
3690 char_len = 6;
3691 if (c == 0xfc && d < 0x84)
3692 return 0;
3694 else
3695 return 0;
3697 if (str_idx + char_len > input->len)
3698 return 0;
3700 for (i = 1; i < char_len; ++i)
3702 d = re_string_byte_at (input, str_idx + i);
3703 if (d < 0x80 || d > 0xbf)
3704 return 0;
3706 return char_len;
3709 char_len = re_string_char_size_at (input, str_idx);
3710 if (node->type == OP_PERIOD)
3712 if (char_len <= 1)
3713 return 0;
3714 /* FIXME: I don't think this if is needed, as both '\n'
3715 and '\0' are char_len == 1. */
3716 /* '.' accepts any one character except the following two cases. */
3717 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3718 re_string_byte_at (input, str_idx) == '\n') ||
3719 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3720 re_string_byte_at (input, str_idx) == '\0'))
3721 return 0;
3722 return char_len;
3725 elem_len = re_string_elem_size_at (input, str_idx);
3726 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3727 return 0;
3729 if (node->type == COMPLEX_BRACKET)
3731 const re_charset_t *cset = node->opr.mbcset;
3732 # ifdef _LIBC
3733 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3734 + str_idx);
3735 int j;
3736 uint32_t nrules;
3737 # endif /* _LIBC */
3738 int match_len = 0;
3739 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3740 ? re_string_wchar_at (input, str_idx) : 0);
3742 /* match with multibyte character? */
3743 for (i = 0; i < cset->nmbchars; ++i)
3744 if (wc == cset->mbchars[i])
3746 match_len = char_len;
3747 goto check_node_accept_bytes_match;
3749 /* match with character_class? */
3750 for (i = 0; i < cset->nchar_classes; ++i)
3752 wctype_t wt = cset->char_classes[i];
3753 if (__iswctype (wc, wt))
3755 match_len = char_len;
3756 goto check_node_accept_bytes_match;
3760 # ifdef _LIBC
3761 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3762 if (nrules != 0)
3764 unsigned int in_collseq = 0;
3765 const int32_t *table, *indirect;
3766 const unsigned char *weights, *extra;
3767 const char *collseqwc;
3768 int32_t idx;
3769 /* This #include defines a local function! */
3770 # include <locale/weight.h>
3772 /* match with collating_symbol? */
3773 if (cset->ncoll_syms)
3774 extra = (const unsigned char *)
3775 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3776 for (i = 0; i < cset->ncoll_syms; ++i)
3778 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3779 /* Compare the length of input collating element and
3780 the length of current collating element. */
3781 if (*coll_sym != elem_len)
3782 continue;
3783 /* Compare each bytes. */
3784 for (j = 0; j < *coll_sym; j++)
3785 if (pin[j] != coll_sym[1 + j])
3786 break;
3787 if (j == *coll_sym)
3789 /* Match if every bytes is equal. */
3790 match_len = j;
3791 goto check_node_accept_bytes_match;
3795 if (cset->nranges)
3797 if (elem_len <= char_len)
3799 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3800 in_collseq = __collseq_table_lookup (collseqwc, wc);
3802 else
3803 in_collseq = find_collation_sequence_value (pin, elem_len);
3805 /* match with range expression? */
3806 for (i = 0; i < cset->nranges; ++i)
3807 if (cset->range_starts[i] <= in_collseq
3808 && in_collseq <= cset->range_ends[i])
3810 match_len = elem_len;
3811 goto check_node_accept_bytes_match;
3814 /* match with equivalence_class? */
3815 if (cset->nequiv_classes)
3817 const unsigned char *cp = pin;
3818 table = (const int32_t *)
3819 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3820 weights = (const unsigned char *)
3821 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3822 extra = (const unsigned char *)
3823 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3824 indirect = (const int32_t *)
3825 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3826 idx = findidx (&cp);
3827 if (idx > 0)
3828 for (i = 0; i < cset->nequiv_classes; ++i)
3830 int32_t equiv_class_idx = cset->equiv_classes[i];
3831 size_t weight_len = weights[idx];
3832 if (weight_len == weights[equiv_class_idx])
3834 int cnt = 0;
3835 while (cnt <= weight_len
3836 && (weights[equiv_class_idx + 1 + cnt]
3837 == weights[idx + 1 + cnt]))
3838 ++cnt;
3839 if (cnt > weight_len)
3841 match_len = elem_len;
3842 goto check_node_accept_bytes_match;
3848 else
3849 # endif /* _LIBC */
3851 /* match with range expression? */
3852 #if __GNUC__ >= 2
3853 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3854 #else
3855 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3856 cmp_buf[2] = wc;
3857 #endif
3858 for (i = 0; i < cset->nranges; ++i)
3860 cmp_buf[0] = cset->range_starts[i];
3861 cmp_buf[4] = cset->range_ends[i];
3862 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3863 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3865 match_len = char_len;
3866 goto check_node_accept_bytes_match;
3870 check_node_accept_bytes_match:
3871 if (!cset->non_match)
3872 return match_len;
3873 else
3875 if (match_len > 0)
3876 return 0;
3877 else
3878 return (elem_len > char_len) ? elem_len : char_len;
3881 return 0;
3884 # ifdef _LIBC
3885 static unsigned int
3886 find_collation_sequence_value (mbs, mbs_len)
3887 const unsigned char *mbs;
3888 size_t mbs_len;
3890 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3891 if (nrules == 0)
3893 if (mbs_len == 1)
3895 /* No valid character. Match it as a single byte character. */
3896 const unsigned char *collseq = (const unsigned char *)
3897 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3898 return collseq[mbs[0]];
3900 return UINT_MAX;
3902 else
3904 int32_t idx;
3905 const unsigned char *extra = (const unsigned char *)
3906 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3907 int32_t extrasize = (const unsigned char *)
3908 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3910 for (idx = 0; idx < extrasize;)
3912 int mbs_cnt, found = 0;
3913 int32_t elem_mbs_len;
3914 /* Skip the name of collating element name. */
3915 idx = idx + extra[idx] + 1;
3916 elem_mbs_len = extra[idx++];
3917 if (mbs_len == elem_mbs_len)
3919 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3920 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3921 break;
3922 if (mbs_cnt == elem_mbs_len)
3923 /* Found the entry. */
3924 found = 1;
3926 /* Skip the byte sequence of the collating element. */
3927 idx += elem_mbs_len;
3928 /* Adjust for the alignment. */
3929 idx = (idx + 3) & ~3;
3930 /* Skip the collation sequence value. */
3931 idx += sizeof (uint32_t);
3932 /* Skip the wide char sequence of the collating element. */
3933 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3934 /* If we found the entry, return the sequence value. */
3935 if (found)
3936 return *(uint32_t *) (extra + idx);
3937 /* Skip the collation sequence value. */
3938 idx += sizeof (uint32_t);
3940 return UINT_MAX;
3943 # endif /* _LIBC */
3944 #endif /* RE_ENABLE_I18N */
3946 /* Check whether the node accepts the byte which is IDX-th
3947 byte of the INPUT. */
3949 static int
3950 check_node_accept (mctx, node, idx)
3951 const re_match_context_t *mctx;
3952 const re_token_t *node;
3953 int idx;
3955 re_dfa_t *const dfa = mctx->dfa;
3956 unsigned char ch;
3957 if (node->constraint)
3959 /* The node has constraints. Check whether the current context
3960 satisfies the constraints. */
3961 unsigned int context = re_string_context_at (&mctx->input, idx,
3962 mctx->eflags);
3963 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3964 return 0;
3966 ch = re_string_byte_at (&mctx->input, idx);
3967 switch (node->type)
3969 case CHARACTER:
3970 return node->opr.c == ch;
3971 case SIMPLE_BRACKET:
3972 return bitset_contain (node->opr.sbcset, ch);
3973 #ifdef RE_ENABLE_I18N
3974 case OP_UTF8_PERIOD:
3975 if (ch >= 0x80)
3976 return 0;
3977 /* FALLTHROUGH */
3978 #endif
3979 case OP_PERIOD:
3980 return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3981 || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
3982 default:
3983 return 0;
3987 /* Extend the buffers, if the buffers have run out. */
3989 static reg_errcode_t
3990 extend_buffers (mctx)
3991 re_match_context_t *mctx;
3993 reg_errcode_t ret;
3994 re_string_t *pstr = &mctx->input;
3996 /* Double the lengthes of the buffers. */
3997 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3998 if (BE (ret != REG_NOERROR, 0))
3999 return ret;
4001 if (mctx->state_log != NULL)
4003 /* And double the length of state_log. */
4004 /* XXX We have no indication of the size of this buffer. If this
4005 allocation fail we have no indication that the state_log array
4006 does not have the right size. */
4007 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4008 pstr->bufs_len + 1);
4009 if (BE (new_array == NULL, 0))
4010 return REG_ESPACE;
4011 mctx->state_log = new_array;
4014 /* Then reconstruct the buffers. */
4015 if (pstr->icase)
4017 #ifdef RE_ENABLE_I18N
4018 if (pstr->mb_cur_max > 1)
4020 ret = build_wcs_upper_buffer (pstr);
4021 if (BE (ret != REG_NOERROR, 0))
4022 return ret;
4024 else
4025 #endif /* RE_ENABLE_I18N */
4026 build_upper_buffer (pstr);
4028 else
4030 #ifdef RE_ENABLE_I18N
4031 if (pstr->mb_cur_max > 1)
4032 build_wcs_buffer (pstr);
4033 else
4034 #endif /* RE_ENABLE_I18N */
4036 if (pstr->trans != NULL)
4037 re_string_translate_buffer (pstr);
4040 return REG_NOERROR;
4044 /* Functions for matching context. */
4046 /* Initialize MCTX. */
4048 static reg_errcode_t
4049 match_ctx_init (mctx, eflags, n)
4050 re_match_context_t *mctx;
4051 int eflags, n;
4053 mctx->eflags = eflags;
4054 mctx->match_last = -1;
4055 if (n > 0)
4057 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4058 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4059 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4060 return REG_ESPACE;
4062 /* Already zero-ed by the caller.
4063 else
4064 mctx->bkref_ents = NULL;
4065 mctx->nbkref_ents = 0;
4066 mctx->nsub_tops = 0; */
4067 mctx->abkref_ents = n;
4068 mctx->max_mb_elem_len = 1;
4069 mctx->asub_tops = n;
4070 return REG_NOERROR;
4073 /* Clean the entries which depend on the current input in MCTX.
4074 This function must be invoked when the matcher changes the start index
4075 of the input, or changes the input string. */
4077 static void
4078 match_ctx_clean (mctx)
4079 re_match_context_t *mctx;
4081 match_ctx_free_subtops (mctx);
4082 mctx->nsub_tops = 0;
4083 mctx->nbkref_ents = 0;
4086 /* Free all the memory associated with MCTX. */
4088 static void
4089 match_ctx_free (mctx)
4090 re_match_context_t *mctx;
4092 match_ctx_free_subtops (mctx);
4093 re_free (mctx->sub_tops);
4094 re_free (mctx->bkref_ents);
4097 /* Free all the memory associated with MCTX->SUB_TOPS. */
4099 static void
4100 match_ctx_free_subtops (mctx)
4101 re_match_context_t *mctx;
4103 int st_idx;
4104 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4106 int sl_idx;
4107 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4108 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4110 re_sub_match_last_t *last = top->lasts[sl_idx];
4111 re_free (last->path.array);
4112 re_free (last);
4114 re_free (top->lasts);
4115 if (top->path)
4117 re_free (top->path->array);
4118 re_free (top->path);
4120 free (top);
4124 /* Add a new backreference entry to MCTX.
4125 Note that we assume that caller never call this function with duplicate
4126 entry, and call with STR_IDX which isn't smaller than any existing entry.
4129 static reg_errcode_t
4130 match_ctx_add_entry (mctx, node, str_idx, from, to)
4131 re_match_context_t *mctx;
4132 int node, str_idx, from, to;
4134 if (mctx->nbkref_ents >= mctx->abkref_ents)
4136 struct re_backref_cache_entry* new_entry;
4137 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138 mctx->abkref_ents * 2);
4139 if (BE (new_entry == NULL, 0))
4141 re_free (mctx->bkref_ents);
4142 return REG_ESPACE;
4144 mctx->bkref_ents = new_entry;
4145 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147 mctx->abkref_ents *= 2;
4149 if (mctx->nbkref_ents > 0
4150 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4153 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4157 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4158 if (mctx->max_mb_elem_len < to - from)
4159 mctx->max_mb_elem_len = to - from;
4160 return REG_NOERROR;
4163 /* Search for the first entry which has the same str_idx, or -1 if none is
4164 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4166 static int
4167 search_cur_bkref_entry (mctx, str_idx)
4168 re_match_context_t *mctx;
4169 int str_idx;
4171 int left, right, mid, last;
4172 last = right = mctx->nbkref_ents;
4173 for (left = 0; left < right;)
4175 mid = (left + right) / 2;
4176 if (mctx->bkref_ents[mid].str_idx < str_idx)
4177 left = mid + 1;
4178 else
4179 right = mid;
4181 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4182 return left;
4183 else
4184 return -1;
4187 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4188 at STR_IDX. */
4190 static reg_errcode_t
4191 match_ctx_add_subtop (mctx, node, str_idx)
4192 re_match_context_t *mctx;
4193 int node, str_idx;
4195 #ifdef DEBUG
4196 assert (mctx->sub_tops != NULL);
4197 assert (mctx->asub_tops > 0);
4198 #endif
4199 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4201 int new_asub_tops = mctx->asub_tops * 2;
4202 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4203 re_sub_match_top_t *,
4204 new_asub_tops);
4205 if (BE (new_array == NULL, 0))
4206 return REG_ESPACE;
4207 mctx->sub_tops = new_array;
4208 mctx->asub_tops = new_asub_tops;
4210 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4211 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4212 return REG_ESPACE;
4213 mctx->sub_tops[mctx->nsub_tops]->node = node;
4214 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4215 return REG_NOERROR;
4218 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4219 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4221 static re_sub_match_last_t *
4222 match_ctx_add_sublast (subtop, node, str_idx)
4223 re_sub_match_top_t *subtop;
4224 int node, str_idx;
4226 re_sub_match_last_t *new_entry;
4227 if (BE (subtop->nlasts == subtop->alasts, 0))
4229 int new_alasts = 2 * subtop->alasts + 1;
4230 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4231 re_sub_match_last_t *,
4232 new_alasts);
4233 if (BE (new_array == NULL, 0))
4234 return NULL;
4235 subtop->lasts = new_array;
4236 subtop->alasts = new_alasts;
4238 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4239 if (BE (new_entry != NULL, 1))
4241 subtop->lasts[subtop->nlasts] = new_entry;
4242 new_entry->node = node;
4243 new_entry->str_idx = str_idx;
4244 ++subtop->nlasts;
4246 return new_entry;
4249 static void
4250 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4251 re_sift_context_t *sctx;
4252 re_dfastate_t **sifted_sts, **limited_sts;
4253 int last_node, last_str_idx;
4255 sctx->sifted_states = sifted_sts;
4256 sctx->limited_states = limited_sts;
4257 sctx->last_node = last_node;
4258 sctx->last_str_idx = last_str_idx;
4259 re_node_set_init_empty (&sctx->limits);