Let tst-swscanf find its locale
[glibc.git] / posix / regexec.c
blobf85c5e80177de923a5419d2e4c59f6d833dd1da9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 int n) internal_function;
24 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
25 static void match_ctx_free (re_match_context_t *cache) internal_function;
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to)
28 internal_function;
29 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 internal_function;
31 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
32 int str_idx) internal_function;
33 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
34 int node, int str_idx)
35 internal_function;
36 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
37 re_dfastate_t **limited_sts, int last_node,
38 int last_str_idx)
39 internal_function;
40 static reg_errcode_t re_search_internal (const regex_t *preg,
41 const char *string, int length,
42 int start, int range, int stop,
43 size_t nmatch, regmatch_t pmatch[],
44 int eflags) internal_function;
45 static int re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, int length1,
47 const char *string2, int length2,
48 int start, int range, struct re_registers *regs,
49 int stop, int ret_len) internal_function;
50 static int re_search_stub (struct re_pattern_buffer *bufp,
51 const char *string, int length, int start,
52 int range, int stop, struct re_registers *regs,
53 int ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 int nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
59 int *p_match_first) internal_function;
60 static int check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, int idx)
62 internal_function;
63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, int cur_node,
65 int cur_idx, int nmatch) internal_function;
66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 int str_idx, int dest_node, int nregs,
68 regmatch_t *regs,
69 re_node_set *eps_via_nodes)
70 internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76 internal_function;
78 #ifdef RE_ENABLE_I18N
79 static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 int node_idx, int str_idx, int max_str_idx)
82 internal_function;
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
86 internal_function;
87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, int str_idx,
89 re_node_set *cur_dest)
90 internal_function;
91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 int str_idx,
94 re_node_set *dest_nodes)
95 internal_function;
96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
99 internal_function;
100 static int check_dst_limits (const re_match_context_t *mctx,
101 re_node_set *limits,
102 int dst_node, int dst_idx, int src_node,
103 int src_idx) internal_function;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, int subexp_idx,
106 int from_node, int bkref_idx)
107 internal_function;
108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 int limit, int subexp_idx,
110 int node, int str_idx,
111 int bkref_idx) internal_function;
112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
115 re_node_set *limits,
116 struct re_backref_cache_entry *bkref_ents,
117 int str_idx) internal_function;
118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 int str_idx, const re_node_set *candidates)
121 internal_function;
122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **dst,
124 re_dfastate_t **src, int num)
125 internal_function;
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
134 internal_function;
135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 int str_idx) internal_function;
138 #if 0
139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
142 internal_function;
143 #endif
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
147 internal_function;
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
151 internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx)
154 internal_function;
155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 int bkref_node, int bkref_str)
159 internal_function;
160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 int subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, int top_node,
164 int top_str, int last_node, int last_str,
165 int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 int str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
170 internal_function;
171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 int ex_subexp, int type)
174 internal_function;
175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 int target, int ex_subexp,
178 int type) internal_function;
179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, int cur_str,
181 int subexp_num, int type)
182 internal_function;
183 static int build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
187 const re_string_t *input, int idx)
188 internal_function;
189 # ifdef _LIBC
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 size_t name_len)
192 internal_function;
193 # endif /* _LIBC */
194 #endif /* RE_ENABLE_I18N */
195 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199 static int check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, int idx)
201 internal_function;
202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203 internal_function;
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (preg, string, nmatch, pmatch, eflags)
223 const regex_t *__restrict preg;
224 const char *__restrict string;
225 size_t nmatch;
226 regmatch_t pmatch[];
227 int eflags;
229 reg_errcode_t err;
230 int start, length;
231 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
233 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
234 return REG_BADPAT;
236 if (eflags & REG_STARTEND)
238 start = pmatch[0].rm_so;
239 length = pmatch[0].rm_eo;
241 else
243 start = 0;
244 length = strlen (string);
247 __libc_lock_lock (dfa->lock);
248 if (preg->no_sub)
249 err = re_search_internal (preg, string, length, start, length - start,
250 length, 0, NULL, eflags);
251 else
252 err = re_search_internal (preg, string, length, start, length - start,
253 length, nmatch, pmatch, eflags);
254 __libc_lock_unlock (dfa->lock);
255 return err != REG_NOERROR;
258 #ifdef _LIBC
259 # include <shlib-compat.h>
260 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
262 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
263 __typeof__ (__regexec) __compat_regexec;
266 attribute_compat_text_section
267 __compat_regexec (const regex_t *__restrict preg,
268 const char *__restrict string, size_t nmatch,
269 regmatch_t pmatch[], int eflags)
271 return regexec (preg, string, nmatch, pmatch,
272 eflags & (REG_NOTBOL | REG_NOTEOL));
274 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
275 # endif
276 #endif
278 /* Entry points for GNU code. */
280 /* re_match, re_search, re_match_2, re_search_2
282 The former two functions operate on STRING with length LENGTH,
283 while the later two operate on concatenation of STRING1 and STRING2
284 with lengths LENGTH1 and LENGTH2, respectively.
286 re_match() matches the compiled pattern in BUFP against the string,
287 starting at index START.
289 re_search() first tries matching at index START, then it tries to match
290 starting from index START + 1, and so on. The last start position tried
291 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
292 way as re_match().)
294 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
295 the first STOP characters of the concatenation of the strings should be
296 concerned.
298 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
299 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
300 computed relative to the concatenation, not relative to the individual
301 strings.)
303 On success, re_match* functions return the length of the match, re_search*
304 return the position of the start of the match. Return value -1 means no
305 match was found and -2 indicates an internal error. */
308 re_match (bufp, string, length, start, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start;
312 struct re_registers *regs;
314 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
316 #ifdef _LIBC
317 weak_alias (__re_match, re_match)
318 #endif
321 re_search (bufp, string, length, start, range, regs)
322 struct re_pattern_buffer *bufp;
323 const char *string;
324 int length, start, range;
325 struct re_registers *regs;
327 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
329 #ifdef _LIBC
330 weak_alias (__re_search, re_search)
331 #endif
334 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
335 struct re_pattern_buffer *bufp;
336 const char *string1, *string2;
337 int length1, length2, start, stop;
338 struct re_registers *regs;
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, 0, regs, stop, 1);
343 #ifdef _LIBC
344 weak_alias (__re_match_2, re_match_2)
345 #endif
348 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
349 struct re_pattern_buffer *bufp;
350 const char *string1, *string2;
351 int length1, length2, start, range, stop;
352 struct re_registers *regs;
354 return re_search_2_stub (bufp, string1, length1, string2, length2,
355 start, range, regs, stop, 0);
357 #ifdef _LIBC
358 weak_alias (__re_search_2, re_search_2)
359 #endif
361 static int
362 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
363 stop, ret_len)
364 struct re_pattern_buffer *bufp;
365 const char *string1, *string2;
366 int length1, length2, start, range, stop, ret_len;
367 struct re_registers *regs;
369 const char *str;
370 int rval;
371 int len = length1 + length2;
372 char *s = NULL;
374 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
375 return -2;
377 /* Concatenate the strings. */
378 if (length2 > 0)
379 if (length1 > 0)
381 s = re_malloc (char, len);
383 if (BE (s == NULL, 0))
384 return -2;
385 #ifdef _LIBC
386 memcpy (__mempcpy (s, string1, length1), string2, length2);
387 #else
388 memcpy (s, string1, length1);
389 memcpy (s + length1, string2, length2);
390 #endif
391 str = s;
393 else
394 str = string2;
395 else
396 str = string1;
398 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
399 re_free (s);
400 return rval;
403 /* The parameters have the same meaning as those of re_search.
404 Additional parameters:
405 If RET_LEN is nonzero the length of the match is returned (re_match style);
406 otherwise the position of the match is returned. */
408 static int
409 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
410 struct re_pattern_buffer *bufp;
411 const char *string;
412 int length, start, range, stop, ret_len;
413 struct re_registers *regs;
415 reg_errcode_t result;
416 regmatch_t *pmatch;
417 int nregs, rval;
418 int eflags = 0;
419 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
421 /* Check for out-of-range. */
422 if (BE (start < 0 || start > length, 0))
423 return -1;
424 if (BE (start + range > length, 0))
425 range = length - start;
426 else if (BE (start + range < 0, 0))
427 range = -start;
429 __libc_lock_lock (dfa->lock);
431 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
432 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
434 /* Compile fastmap if we haven't yet. */
435 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
436 re_compile_fastmap (bufp);
438 if (BE (bufp->no_sub, 0))
439 regs = NULL;
441 /* We need at least 1 register. */
442 if (regs == NULL)
443 nregs = 1;
444 else if (BE (bufp->regs_allocated == REGS_FIXED &&
445 regs->num_regs < bufp->re_nsub + 1, 0))
447 nregs = regs->num_regs;
448 if (BE (nregs < 1, 0))
450 /* Nothing can be copied to regs. */
451 regs = NULL;
452 nregs = 1;
455 else
456 nregs = bufp->re_nsub + 1;
457 pmatch = re_malloc (regmatch_t, nregs);
458 if (BE (pmatch == NULL, 0))
460 rval = -2;
461 goto out;
464 result = re_search_internal (bufp, string, length, start, range, stop,
465 nregs, pmatch, eflags);
467 rval = 0;
469 /* I hope we needn't fill ther regs with -1's when no match was found. */
470 if (result != REG_NOERROR)
471 rval = -1;
472 else if (regs != NULL)
474 /* If caller wants register contents data back, copy them. */
475 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
476 bufp->regs_allocated);
477 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
478 rval = -2;
481 if (BE (rval == 0, 1))
483 if (ret_len)
485 assert (pmatch[0].rm_so == start);
486 rval = pmatch[0].rm_eo - start;
488 else
489 rval = pmatch[0].rm_so;
491 re_free (pmatch);
492 out:
493 __libc_lock_unlock (dfa->lock);
494 return rval;
497 static unsigned
498 re_copy_regs (regs, pmatch, nregs, regs_allocated)
499 struct re_registers *regs;
500 regmatch_t *pmatch;
501 int nregs, regs_allocated;
503 int rval = REGS_REALLOCATE;
504 int i;
505 int need_regs = nregs + 1;
506 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
507 uses. */
509 /* Have the register data arrays been allocated? */
510 if (regs_allocated == REGS_UNALLOCATED)
511 { /* No. So allocate them with malloc. */
512 regs->start = re_malloc (regoff_t, need_regs);
513 if (BE (regs->start == NULL, 0))
514 return REGS_UNALLOCATED;
515 regs->end = re_malloc (regoff_t, need_regs);
516 if (BE (regs->end == NULL, 0))
518 re_free (regs->start);
519 return REGS_UNALLOCATED;
521 regs->num_regs = need_regs;
523 else if (regs_allocated == REGS_REALLOCATE)
524 { /* Yes. If we need more elements than were already
525 allocated, reallocate them. If we need fewer, just
526 leave it alone. */
527 if (BE (need_regs > regs->num_regs, 0))
529 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
530 regoff_t *new_end;
531 if (BE (new_start == NULL, 0))
532 return REGS_UNALLOCATED;
533 new_end = re_realloc (regs->end, regoff_t, need_regs);
534 if (BE (new_end == NULL, 0))
536 re_free (new_start);
537 return REGS_UNALLOCATED;
539 regs->start = new_start;
540 regs->end = new_end;
541 regs->num_regs = need_regs;
544 else
546 assert (regs_allocated == REGS_FIXED);
547 /* This function may not be called with REGS_FIXED and nregs too big. */
548 assert (regs->num_regs >= nregs);
549 rval = REGS_FIXED;
552 /* Copy the regs. */
553 for (i = 0; i < nregs; ++i)
555 regs->start[i] = pmatch[i].rm_so;
556 regs->end[i] = pmatch[i].rm_eo;
558 for ( ; i < regs->num_regs; ++i)
559 regs->start[i] = regs->end[i] = -1;
561 return rval;
564 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
565 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
566 this memory for recording register information. STARTS and ENDS
567 must be allocated using the malloc library routine, and must each
568 be at least NUM_REGS * sizeof (regoff_t) bytes long.
570 If NUM_REGS == 0, then subsequent matches should allocate their own
571 register data.
573 Unless this function is called, the first search or match using
574 PATTERN_BUFFER will allocate its own register data, without
575 freeing the old data. */
577 void
578 re_set_registers (bufp, regs, num_regs, starts, ends)
579 struct re_pattern_buffer *bufp;
580 struct re_registers *regs;
581 unsigned num_regs;
582 regoff_t *starts, *ends;
584 if (num_regs)
586 bufp->regs_allocated = REGS_REALLOCATE;
587 regs->num_regs = num_regs;
588 regs->start = starts;
589 regs->end = ends;
591 else
593 bufp->regs_allocated = REGS_UNALLOCATED;
594 regs->num_regs = 0;
595 regs->start = regs->end = (regoff_t *) 0;
598 #ifdef _LIBC
599 weak_alias (__re_set_registers, re_set_registers)
600 #endif
602 /* Entry points compatible with 4.2 BSD regex library. We don't define
603 them unless specifically requested. */
605 #if defined _REGEX_RE_COMP || defined _LIBC
607 # ifdef _LIBC
608 weak_function
609 # endif
610 re_exec (s)
611 const char *s;
613 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
615 #endif /* _REGEX_RE_COMP */
617 /* Internal entry point. */
619 /* Searches for a compiled pattern PREG in the string STRING, whose
620 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
621 mingings with regexec. START, and RANGE have the same meanings
622 with re_search.
623 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
624 otherwise return the error code.
625 Note: We assume front end functions already check ranges.
626 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
628 static reg_errcode_t
629 __attribute_warn_unused_result__
630 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
631 eflags)
632 const regex_t *preg;
633 const char *string;
634 int length, start, range, stop, eflags;
635 size_t nmatch;
636 regmatch_t pmatch[];
638 reg_errcode_t err;
639 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
640 int left_lim, right_lim, incr;
641 int fl_longest_match, match_first, match_kind, match_last = -1;
642 int extra_nmatch;
643 int sb, ch;
644 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
645 re_match_context_t mctx = { .dfa = dfa };
646 #else
647 re_match_context_t mctx;
648 #endif
649 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
650 && range && !preg->can_be_null) ? preg->fastmap : NULL;
651 RE_TRANSLATE_TYPE t = preg->translate;
653 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
654 memset (&mctx, '\0', sizeof (re_match_context_t));
655 mctx.dfa = dfa;
656 #endif
658 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
659 nmatch -= extra_nmatch;
661 /* Check if the DFA haven't been compiled. */
662 if (BE (preg->used == 0 || dfa->init_state == NULL
663 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
664 || dfa->init_state_begbuf == NULL, 0))
665 return REG_NOMATCH;
667 #ifdef DEBUG
668 /* We assume front-end functions already check them. */
669 assert (start + range >= 0 && start + range <= length);
670 #endif
672 /* If initial states with non-begbuf contexts have no elements,
673 the regex must be anchored. If preg->newline_anchor is set,
674 we'll never use init_state_nl, so do not check it. */
675 if (dfa->init_state->nodes.nelem == 0
676 && dfa->init_state_word->nodes.nelem == 0
677 && (dfa->init_state_nl->nodes.nelem == 0
678 || !preg->newline_anchor))
680 if (start != 0 && start + range != 0)
681 return REG_NOMATCH;
682 start = range = 0;
685 /* We must check the longest matching, if nmatch > 0. */
686 fl_longest_match = (nmatch != 0 || dfa->nbackref);
688 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
689 preg->translate, preg->syntax & RE_ICASE, dfa);
690 if (BE (err != REG_NOERROR, 0))
691 goto free_return;
692 mctx.input.stop = stop;
693 mctx.input.raw_stop = stop;
694 mctx.input.newline_anchor = preg->newline_anchor;
696 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
697 if (BE (err != REG_NOERROR, 0))
698 goto free_return;
700 /* We will log all the DFA states through which the dfa pass,
701 if nmatch > 1, or this dfa has "multibyte node", which is a
702 back-reference or a node which can accept multibyte character or
703 multi character collating element. */
704 if (nmatch > 1 || dfa->has_mb_node)
706 /* Avoid overflow. */
707 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
709 err = REG_ESPACE;
710 goto free_return;
713 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
714 if (BE (mctx.state_log == NULL, 0))
716 err = REG_ESPACE;
717 goto free_return;
720 else
721 mctx.state_log = NULL;
723 match_first = start;
724 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
725 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
727 /* Check incrementally whether of not the input string match. */
728 incr = (range < 0) ? -1 : 1;
729 left_lim = (range < 0) ? start + range : start;
730 right_lim = (range < 0) ? start : start + range;
731 sb = dfa->mb_cur_max == 1;
732 match_kind =
733 (fastmap
734 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
735 | (range >= 0 ? 2 : 0)
736 | (t != NULL ? 1 : 0))
737 : 8);
739 for (;; match_first += incr)
741 err = REG_NOMATCH;
742 if (match_first < left_lim || right_lim < match_first)
743 goto free_return;
745 /* Advance as rapidly as possible through the string, until we
746 find a plausible place to start matching. This may be done
747 with varying efficiency, so there are various possibilities:
748 only the most common of them are specialized, in order to
749 save on code size. We use a switch statement for speed. */
750 switch (match_kind)
752 case 8:
753 /* No fastmap. */
754 break;
756 case 7:
757 /* Fastmap with single-byte translation, match forward. */
758 while (BE (match_first < right_lim, 1)
759 && !fastmap[t[(unsigned char) string[match_first]]])
760 ++match_first;
761 goto forward_match_found_start_or_reached_end;
763 case 6:
764 /* Fastmap without translation, match forward. */
765 while (BE (match_first < right_lim, 1)
766 && !fastmap[(unsigned char) string[match_first]])
767 ++match_first;
769 forward_match_found_start_or_reached_end:
770 if (BE (match_first == right_lim, 0))
772 ch = match_first >= length
773 ? 0 : (unsigned char) string[match_first];
774 if (!fastmap[t ? t[ch] : ch])
775 goto free_return;
777 break;
779 case 4:
780 case 5:
781 /* Fastmap without multi-byte translation, match backwards. */
782 while (match_first >= left_lim)
784 ch = match_first >= length
785 ? 0 : (unsigned char) string[match_first];
786 if (fastmap[t ? t[ch] : ch])
787 break;
788 --match_first;
790 if (match_first < left_lim)
791 goto free_return;
792 break;
794 default:
795 /* In this case, we can't determine easily the current byte,
796 since it might be a component byte of a multibyte
797 character. Then we use the constructed buffer instead. */
798 for (;;)
800 /* If MATCH_FIRST is out of the valid range, reconstruct the
801 buffers. */
802 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
803 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
805 err = re_string_reconstruct (&mctx.input, match_first,
806 eflags);
807 if (BE (err != REG_NOERROR, 0))
808 goto free_return;
810 offset = match_first - mctx.input.raw_mbs_idx;
812 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
813 Note that MATCH_FIRST must not be smaller than 0. */
814 ch = (match_first >= length
815 ? 0 : re_string_byte_at (&mctx.input, offset));
816 if (fastmap[ch])
817 break;
818 match_first += incr;
819 if (match_first < left_lim || match_first > right_lim)
821 err = REG_NOMATCH;
822 goto free_return;
825 break;
828 /* Reconstruct the buffers so that the matcher can assume that
829 the matching starts from the beginning of the buffer. */
830 err = re_string_reconstruct (&mctx.input, match_first, eflags);
831 if (BE (err != REG_NOERROR, 0))
832 goto free_return;
834 #ifdef RE_ENABLE_I18N
835 /* Don't consider this char as a possible match start if it part,
836 yet isn't the head, of a multibyte character. */
837 if (!sb && !re_string_first_byte (&mctx.input, 0))
838 continue;
839 #endif
841 /* It seems to be appropriate one, then use the matcher. */
842 /* We assume that the matching starts from 0. */
843 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
844 match_last = check_matching (&mctx, fl_longest_match,
845 range >= 0 ? &match_first : NULL);
846 if (match_last != -1)
848 if (BE (match_last == -2, 0))
850 err = REG_ESPACE;
851 goto free_return;
853 else
855 mctx.match_last = match_last;
856 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
858 re_dfastate_t *pstate = mctx.state_log[match_last];
859 mctx.last_node = check_halt_state_context (&mctx, pstate,
860 match_last);
862 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
863 || dfa->nbackref)
865 err = prune_impossible_nodes (&mctx);
866 if (err == REG_NOERROR)
867 break;
868 if (BE (err != REG_NOMATCH, 0))
869 goto free_return;
870 match_last = -1;
872 else
873 break; /* We found a match. */
877 match_ctx_clean (&mctx);
880 #ifdef DEBUG
881 assert (match_last != -1);
882 assert (err == REG_NOERROR);
883 #endif
885 /* Set pmatch[] if we need. */
886 if (nmatch > 0)
888 int reg_idx;
890 /* Initialize registers. */
891 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
892 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
894 /* Set the points where matching start/end. */
895 pmatch[0].rm_so = 0;
896 pmatch[0].rm_eo = mctx.match_last;
898 if (!preg->no_sub && nmatch > 1)
900 err = set_regs (preg, &mctx, nmatch, pmatch,
901 dfa->has_plural_match && dfa->nbackref > 0);
902 if (BE (err != REG_NOERROR, 0))
903 goto free_return;
906 /* At last, add the offset to the each registers, since we slided
907 the buffers so that we could assume that the matching starts
908 from 0. */
909 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
910 if (pmatch[reg_idx].rm_so != -1)
912 #ifdef RE_ENABLE_I18N
913 if (BE (mctx.input.offsets_needed != 0, 0))
915 pmatch[reg_idx].rm_so =
916 (pmatch[reg_idx].rm_so == mctx.input.valid_len
917 ? mctx.input.valid_raw_len
918 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
919 pmatch[reg_idx].rm_eo =
920 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
921 ? mctx.input.valid_raw_len
922 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
924 #else
925 assert (mctx.input.offsets_needed == 0);
926 #endif
927 pmatch[reg_idx].rm_so += match_first;
928 pmatch[reg_idx].rm_eo += match_first;
930 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
932 pmatch[nmatch + reg_idx].rm_so = -1;
933 pmatch[nmatch + reg_idx].rm_eo = -1;
936 if (dfa->subexp_map)
937 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
938 if (dfa->subexp_map[reg_idx] != reg_idx)
940 pmatch[reg_idx + 1].rm_so
941 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
942 pmatch[reg_idx + 1].rm_eo
943 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
947 free_return:
948 re_free (mctx.state_log);
949 if (dfa->nbackref)
950 match_ctx_free (&mctx);
951 re_string_destruct (&mctx.input);
952 return err;
955 static reg_errcode_t
956 __attribute_warn_unused_result__
957 prune_impossible_nodes (mctx)
958 re_match_context_t *mctx;
960 const re_dfa_t *const dfa = mctx->dfa;
961 int halt_node, match_last;
962 reg_errcode_t ret;
963 re_dfastate_t **sifted_states;
964 re_dfastate_t **lim_states = NULL;
965 re_sift_context_t sctx;
966 #ifdef DEBUG
967 assert (mctx->state_log != NULL);
968 #endif
969 match_last = mctx->match_last;
970 halt_node = mctx->last_node;
972 /* Avoid overflow. */
973 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
974 return REG_ESPACE;
976 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
977 if (BE (sifted_states == NULL, 0))
979 ret = REG_ESPACE;
980 goto free_return;
982 if (dfa->nbackref)
984 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
985 if (BE (lim_states == NULL, 0))
987 ret = REG_ESPACE;
988 goto free_return;
990 while (1)
992 memset (lim_states, '\0',
993 sizeof (re_dfastate_t *) * (match_last + 1));
994 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
995 match_last);
996 ret = sift_states_backward (mctx, &sctx);
997 re_node_set_free (&sctx.limits);
998 if (BE (ret != REG_NOERROR, 0))
999 goto free_return;
1000 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1001 break;
1004 --match_last;
1005 if (match_last < 0)
1007 ret = REG_NOMATCH;
1008 goto free_return;
1010 } while (mctx->state_log[match_last] == NULL
1011 || !mctx->state_log[match_last]->halt);
1012 halt_node = check_halt_state_context (mctx,
1013 mctx->state_log[match_last],
1014 match_last);
1016 ret = merge_state_array (dfa, sifted_states, lim_states,
1017 match_last + 1);
1018 re_free (lim_states);
1019 lim_states = NULL;
1020 if (BE (ret != REG_NOERROR, 0))
1021 goto free_return;
1023 else
1025 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1026 ret = sift_states_backward (mctx, &sctx);
1027 re_node_set_free (&sctx.limits);
1028 if (BE (ret != REG_NOERROR, 0))
1029 goto free_return;
1030 if (sifted_states[0] == NULL)
1032 ret = REG_NOMATCH;
1033 goto free_return;
1036 re_free (mctx->state_log);
1037 mctx->state_log = sifted_states;
1038 sifted_states = NULL;
1039 mctx->last_node = halt_node;
1040 mctx->match_last = match_last;
1041 ret = REG_NOERROR;
1042 free_return:
1043 re_free (sifted_states);
1044 re_free (lim_states);
1045 return ret;
1048 /* Acquire an initial state and return it.
1049 We must select appropriate initial state depending on the context,
1050 since initial states may have constraints like "\<", "^", etc.. */
1052 static inline re_dfastate_t *
1053 __attribute ((always_inline)) internal_function
1054 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1055 int idx)
1057 const re_dfa_t *const dfa = mctx->dfa;
1058 if (dfa->init_state->has_constraint)
1060 unsigned int context;
1061 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1062 if (IS_WORD_CONTEXT (context))
1063 return dfa->init_state_word;
1064 else if (IS_ORDINARY_CONTEXT (context))
1065 return dfa->init_state;
1066 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1067 return dfa->init_state_begbuf;
1068 else if (IS_NEWLINE_CONTEXT (context))
1069 return dfa->init_state_nl;
1070 else if (IS_BEGBUF_CONTEXT (context))
1072 /* It is relatively rare case, then calculate on demand. */
1073 return re_acquire_state_context (err, dfa,
1074 dfa->init_state->entrance_nodes,
1075 context);
1077 else
1078 /* Must not happen? */
1079 return dfa->init_state;
1081 else
1082 return dfa->init_state;
1085 /* Check whether the regular expression match input string INPUT or not,
1086 and return the index where the matching end, return -1 if not match,
1087 or return -2 in case of an error.
1088 FL_LONGEST_MATCH means we want the POSIX longest matching.
1089 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1090 next place where we may want to try matching.
1091 Note that the matcher assume that the maching starts from the current
1092 index of the buffer. */
1094 static int
1095 internal_function __attribute_warn_unused_result__
1096 check_matching (re_match_context_t *mctx, int fl_longest_match,
1097 int *p_match_first)
1099 const re_dfa_t *const dfa = mctx->dfa;
1100 reg_errcode_t err;
1101 int match = 0;
1102 int match_last = -1;
1103 int cur_str_idx = re_string_cur_idx (&mctx->input);
1104 re_dfastate_t *cur_state;
1105 int at_init_state = p_match_first != NULL;
1106 int next_start_idx = cur_str_idx;
1108 err = REG_NOERROR;
1109 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1110 /* An initial state must not be NULL (invalid). */
1111 if (BE (cur_state == NULL, 0))
1113 assert (err == REG_ESPACE);
1114 return -2;
1117 if (mctx->state_log != NULL)
1119 mctx->state_log[cur_str_idx] = cur_state;
1121 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1122 later. E.g. Processing back references. */
1123 if (BE (dfa->nbackref, 0))
1125 at_init_state = 0;
1126 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1127 if (BE (err != REG_NOERROR, 0))
1128 return err;
1130 if (cur_state->has_backref)
1132 err = transit_state_bkref (mctx, &cur_state->nodes);
1133 if (BE (err != REG_NOERROR, 0))
1134 return err;
1139 /* If the RE accepts NULL string. */
1140 if (BE (cur_state->halt, 0))
1142 if (!cur_state->has_constraint
1143 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1145 if (!fl_longest_match)
1146 return cur_str_idx;
1147 else
1149 match_last = cur_str_idx;
1150 match = 1;
1155 while (!re_string_eoi (&mctx->input))
1157 re_dfastate_t *old_state = cur_state;
1158 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1160 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1161 && mctx->input.bufs_len < mctx->input.len)
1162 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1163 && mctx->input.valid_len < mctx->input.len))
1165 err = extend_buffers (mctx, next_char_idx + 1);
1166 if (BE (err != REG_NOERROR, 0))
1168 assert (err == REG_ESPACE);
1169 return -2;
1173 cur_state = transit_state (&err, mctx, cur_state);
1174 if (mctx->state_log != NULL)
1175 cur_state = merge_state_with_log (&err, mctx, cur_state);
1177 if (cur_state == NULL)
1179 /* Reached the invalid state or an error. Try to recover a valid
1180 state using the state log, if available and if we have not
1181 already found a valid (even if not the longest) match. */
1182 if (BE (err != REG_NOERROR, 0))
1183 return -2;
1185 if (mctx->state_log == NULL
1186 || (match && !fl_longest_match)
1187 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1188 break;
1191 if (BE (at_init_state, 0))
1193 if (old_state == cur_state)
1194 next_start_idx = next_char_idx;
1195 else
1196 at_init_state = 0;
1199 if (cur_state->halt)
1201 /* Reached a halt state.
1202 Check the halt state can satisfy the current context. */
1203 if (!cur_state->has_constraint
1204 || check_halt_state_context (mctx, cur_state,
1205 re_string_cur_idx (&mctx->input)))
1207 /* We found an appropriate halt state. */
1208 match_last = re_string_cur_idx (&mctx->input);
1209 match = 1;
1211 /* We found a match, do not modify match_first below. */
1212 p_match_first = NULL;
1213 if (!fl_longest_match)
1214 break;
1219 if (p_match_first)
1220 *p_match_first += next_start_idx;
1222 return match_last;
1225 /* Check NODE match the current context. */
1227 static int
1228 internal_function
1229 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1231 re_token_type_t type = dfa->nodes[node].type;
1232 unsigned int constraint = dfa->nodes[node].constraint;
1233 if (type != END_OF_RE)
1234 return 0;
1235 if (!constraint)
1236 return 1;
1237 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1238 return 0;
1239 return 1;
1242 /* Check the halt state STATE match the current context.
1243 Return 0 if not match, if the node, STATE has, is a halt node and
1244 match the context, return the node. */
1246 static int
1247 internal_function
1248 check_halt_state_context (const re_match_context_t *mctx,
1249 const re_dfastate_t *state, int idx)
1251 int i;
1252 unsigned int context;
1253 #ifdef DEBUG
1254 assert (state->halt);
1255 #endif
1256 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1257 for (i = 0; i < state->nodes.nelem; ++i)
1258 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1259 return state->nodes.elems[i];
1260 return 0;
1263 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1264 corresponding to the DFA).
1265 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1266 of errors. */
1268 static int
1269 internal_function
1270 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1271 int *pidx, int node, re_node_set *eps_via_nodes,
1272 struct re_fail_stack_t *fs)
1274 const re_dfa_t *const dfa = mctx->dfa;
1275 int i, err;
1276 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1278 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1279 re_node_set *edests = &dfa->edests[node];
1280 int dest_node;
1281 err = re_node_set_insert (eps_via_nodes, node);
1282 if (BE (err < 0, 0))
1283 return -2;
1284 /* Pick up a valid destination, or return -1 if none is found. */
1285 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1287 int candidate = edests->elems[i];
1288 if (!re_node_set_contains (cur_nodes, candidate))
1289 continue;
1290 if (dest_node == -1)
1291 dest_node = candidate;
1293 else
1295 /* In order to avoid infinite loop like "(a*)*", return the second
1296 epsilon-transition if the first was already considered. */
1297 if (re_node_set_contains (eps_via_nodes, dest_node))
1298 return candidate;
1300 /* Otherwise, push the second epsilon-transition on the fail stack. */
1301 else if (fs != NULL
1302 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1303 eps_via_nodes))
1304 return -2;
1306 /* We know we are going to exit. */
1307 break;
1310 return dest_node;
1312 else
1314 int naccepted = 0;
1315 re_token_type_t type = dfa->nodes[node].type;
1317 #ifdef RE_ENABLE_I18N
1318 if (dfa->nodes[node].accept_mb)
1319 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1320 else
1321 #endif /* RE_ENABLE_I18N */
1322 if (type == OP_BACK_REF)
1324 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1325 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1326 if (fs != NULL)
1328 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1329 return -1;
1330 else if (naccepted)
1332 char *buf = (char *) re_string_get_buffer (&mctx->input);
1333 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1334 naccepted) != 0)
1335 return -1;
1339 if (naccepted == 0)
1341 int dest_node;
1342 err = re_node_set_insert (eps_via_nodes, node);
1343 if (BE (err < 0, 0))
1344 return -2;
1345 dest_node = dfa->edests[node].elems[0];
1346 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1347 dest_node))
1348 return dest_node;
1352 if (naccepted != 0
1353 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1355 int dest_node = dfa->nexts[node];
1356 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1357 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1358 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1359 dest_node)))
1360 return -1;
1361 re_node_set_empty (eps_via_nodes);
1362 return dest_node;
1365 return -1;
1368 static reg_errcode_t
1369 internal_function __attribute_warn_unused_result__
1370 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1371 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1373 reg_errcode_t err;
1374 int num = fs->num++;
1375 if (fs->num == fs->alloc)
1377 struct re_fail_stack_ent_t *new_array;
1378 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1379 * fs->alloc * 2));
1380 if (new_array == NULL)
1381 return REG_ESPACE;
1382 fs->alloc *= 2;
1383 fs->stack = new_array;
1385 fs->stack[num].idx = str_idx;
1386 fs->stack[num].node = dest_node;
1387 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1388 if (fs->stack[num].regs == NULL)
1389 return REG_ESPACE;
1390 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1391 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1392 return err;
1395 static int
1396 internal_function
1397 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1398 regmatch_t *regs, re_node_set *eps_via_nodes)
1400 int num = --fs->num;
1401 assert (num >= 0);
1402 *pidx = fs->stack[num].idx;
1403 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1404 re_node_set_free (eps_via_nodes);
1405 re_free (fs->stack[num].regs);
1406 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1407 return fs->stack[num].node;
1410 /* Set the positions where the subexpressions are starts/ends to registers
1411 PMATCH.
1412 Note: We assume that pmatch[0] is already set, and
1413 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1415 static reg_errcode_t
1416 internal_function __attribute_warn_unused_result__
1417 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1418 regmatch_t *pmatch, int fl_backtrack)
1420 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1421 int idx, cur_node;
1422 re_node_set eps_via_nodes;
1423 struct re_fail_stack_t *fs;
1424 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1425 regmatch_t *prev_idx_match;
1426 int prev_idx_match_malloced = 0;
1428 #ifdef DEBUG
1429 assert (nmatch > 1);
1430 assert (mctx->state_log != NULL);
1431 #endif
1432 if (fl_backtrack)
1434 fs = &fs_body;
1435 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1436 if (fs->stack == NULL)
1437 return REG_ESPACE;
1439 else
1440 fs = NULL;
1442 cur_node = dfa->init_node;
1443 re_node_set_init_empty (&eps_via_nodes);
1445 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1446 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1447 else
1449 prev_idx_match = re_malloc (regmatch_t, nmatch);
1450 if (prev_idx_match == NULL)
1452 free_fail_stack_return (fs);
1453 return REG_ESPACE;
1455 prev_idx_match_malloced = 1;
1457 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1459 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1461 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1463 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1465 int reg_idx;
1466 if (fs)
1468 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1469 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1470 break;
1471 if (reg_idx == nmatch)
1473 re_node_set_free (&eps_via_nodes);
1474 if (prev_idx_match_malloced)
1475 re_free (prev_idx_match);
1476 return free_fail_stack_return (fs);
1478 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1479 &eps_via_nodes);
1481 else
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 return REG_NOERROR;
1490 /* Proceed to next node. */
1491 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1492 &eps_via_nodes, fs);
1494 if (BE (cur_node < 0, 0))
1496 if (BE (cur_node == -2, 0))
1498 re_node_set_free (&eps_via_nodes);
1499 if (prev_idx_match_malloced)
1500 re_free (prev_idx_match);
1501 free_fail_stack_return (fs);
1502 return REG_ESPACE;
1504 if (fs)
1505 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1506 &eps_via_nodes);
1507 else
1509 re_node_set_free (&eps_via_nodes);
1510 if (prev_idx_match_malloced)
1511 re_free (prev_idx_match);
1512 return REG_NOMATCH;
1516 re_node_set_free (&eps_via_nodes);
1517 if (prev_idx_match_malloced)
1518 re_free (prev_idx_match);
1519 return free_fail_stack_return (fs);
1522 static reg_errcode_t
1523 internal_function
1524 free_fail_stack_return (struct re_fail_stack_t *fs)
1526 if (fs)
1528 int fs_idx;
1529 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1531 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1532 re_free (fs->stack[fs_idx].regs);
1534 re_free (fs->stack);
1536 return REG_NOERROR;
1539 static void
1540 internal_function
1541 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1542 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1544 int type = dfa->nodes[cur_node].type;
1545 if (type == OP_OPEN_SUBEXP)
1547 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1549 /* We are at the first node of this sub expression. */
1550 if (reg_num < nmatch)
1552 pmatch[reg_num].rm_so = cur_idx;
1553 pmatch[reg_num].rm_eo = -1;
1556 else if (type == OP_CLOSE_SUBEXP)
1558 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1559 if (reg_num < nmatch)
1561 /* We are at the last node of this sub expression. */
1562 if (pmatch[reg_num].rm_so < cur_idx)
1564 pmatch[reg_num].rm_eo = cur_idx;
1565 /* This is a non-empty match or we are not inside an optional
1566 subexpression. Accept this right away. */
1567 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1569 else
1571 if (dfa->nodes[cur_node].opt_subexp
1572 && prev_idx_match[reg_num].rm_so != -1)
1573 /* We transited through an empty match for an optional
1574 subexpression, like (a?)*, and this is not the subexp's
1575 first match. Copy back the old content of the registers
1576 so that matches of an inner subexpression are undone as
1577 well, like in ((a?))*. */
1578 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1579 else
1580 /* We completed a subexpression, but it may be part of
1581 an optional one, so do not update PREV_IDX_MATCH. */
1582 pmatch[reg_num].rm_eo = cur_idx;
1588 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1589 and sift the nodes in each states according to the following rules.
1590 Updated state_log will be wrote to STATE_LOG.
1592 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1593 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1594 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1595 the LAST_NODE, we throw away the node `a'.
1596 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1597 string `s' and transit to `b':
1598 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1599 away the node `a'.
1600 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1601 thrown away, we throw away the node `a'.
1602 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1603 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1604 node `a'.
1605 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1606 we throw away the node `a'. */
1608 #define STATE_NODE_CONTAINS(state,node) \
1609 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1611 static reg_errcode_t
1612 internal_function
1613 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1615 reg_errcode_t err;
1616 int null_cnt = 0;
1617 int str_idx = sctx->last_str_idx;
1618 re_node_set cur_dest;
1620 #ifdef DEBUG
1621 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1622 #endif
1624 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1625 transit to the last_node and the last_node itself. */
1626 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1627 if (BE (err != REG_NOERROR, 0))
1628 return err;
1629 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1630 if (BE (err != REG_NOERROR, 0))
1631 goto free_return;
1633 /* Then check each states in the state_log. */
1634 while (str_idx > 0)
1636 /* Update counters. */
1637 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1638 if (null_cnt > mctx->max_mb_elem_len)
1640 memset (sctx->sifted_states, '\0',
1641 sizeof (re_dfastate_t *) * str_idx);
1642 re_node_set_free (&cur_dest);
1643 return REG_NOERROR;
1645 re_node_set_empty (&cur_dest);
1646 --str_idx;
1648 if (mctx->state_log[str_idx])
1650 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1651 if (BE (err != REG_NOERROR, 0))
1652 goto free_return;
1655 /* Add all the nodes which satisfy the following conditions:
1656 - It can epsilon transit to a node in CUR_DEST.
1657 - It is in CUR_SRC.
1658 And update state_log. */
1659 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1660 if (BE (err != REG_NOERROR, 0))
1661 goto free_return;
1663 err = REG_NOERROR;
1664 free_return:
1665 re_node_set_free (&cur_dest);
1666 return err;
1669 static reg_errcode_t
1670 internal_function __attribute_warn_unused_result__
1671 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1672 int str_idx, re_node_set *cur_dest)
1674 const re_dfa_t *const dfa = mctx->dfa;
1675 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1676 int i;
1678 /* Then build the next sifted state.
1679 We build the next sifted state on `cur_dest', and update
1680 `sifted_states[str_idx]' with `cur_dest'.
1681 Note:
1682 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1683 `cur_src' points the node_set of the old `state_log[str_idx]'
1684 (with the epsilon nodes pre-filtered out). */
1685 for (i = 0; i < cur_src->nelem; i++)
1687 int prev_node = cur_src->elems[i];
1688 int naccepted = 0;
1689 int ret;
1691 #ifdef DEBUG
1692 re_token_type_t type = dfa->nodes[prev_node].type;
1693 assert (!IS_EPSILON_NODE (type));
1694 #endif
1695 #ifdef RE_ENABLE_I18N
1696 /* If the node may accept `multi byte'. */
1697 if (dfa->nodes[prev_node].accept_mb)
1698 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1699 str_idx, sctx->last_str_idx);
1700 #endif /* RE_ENABLE_I18N */
1702 /* We don't check backreferences here.
1703 See update_cur_sifted_state(). */
1704 if (!naccepted
1705 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1706 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1707 dfa->nexts[prev_node]))
1708 naccepted = 1;
1710 if (naccepted == 0)
1711 continue;
1713 if (sctx->limits.nelem)
1715 int to_idx = str_idx + naccepted;
1716 if (check_dst_limits (mctx, &sctx->limits,
1717 dfa->nexts[prev_node], to_idx,
1718 prev_node, str_idx))
1719 continue;
1721 ret = re_node_set_insert (cur_dest, prev_node);
1722 if (BE (ret == -1, 0))
1723 return REG_ESPACE;
1726 return REG_NOERROR;
1729 /* Helper functions. */
1731 static reg_errcode_t
1732 internal_function
1733 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1735 int top = mctx->state_log_top;
1737 if ((next_state_log_idx >= mctx->input.bufs_len
1738 && mctx->input.bufs_len < mctx->input.len)
1739 || (next_state_log_idx >= mctx->input.valid_len
1740 && mctx->input.valid_len < mctx->input.len))
1742 reg_errcode_t err;
1743 err = extend_buffers (mctx, next_state_log_idx + 1);
1744 if (BE (err != REG_NOERROR, 0))
1745 return err;
1748 if (top < next_state_log_idx)
1750 memset (mctx->state_log + top + 1, '\0',
1751 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1752 mctx->state_log_top = next_state_log_idx;
1754 return REG_NOERROR;
1757 static reg_errcode_t
1758 internal_function
1759 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1760 re_dfastate_t **src, int num)
1762 int st_idx;
1763 reg_errcode_t err;
1764 for (st_idx = 0; st_idx < num; ++st_idx)
1766 if (dst[st_idx] == NULL)
1767 dst[st_idx] = src[st_idx];
1768 else if (src[st_idx] != NULL)
1770 re_node_set merged_set;
1771 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1772 &src[st_idx]->nodes);
1773 if (BE (err != REG_NOERROR, 0))
1774 return err;
1775 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1776 re_node_set_free (&merged_set);
1777 if (BE (err != REG_NOERROR, 0))
1778 return err;
1781 return REG_NOERROR;
1784 static reg_errcode_t
1785 internal_function
1786 update_cur_sifted_state (const re_match_context_t *mctx,
1787 re_sift_context_t *sctx, int str_idx,
1788 re_node_set *dest_nodes)
1790 const re_dfa_t *const dfa = mctx->dfa;
1791 reg_errcode_t err = REG_NOERROR;
1792 const re_node_set *candidates;
1793 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1794 : &mctx->state_log[str_idx]->nodes);
1796 if (dest_nodes->nelem == 0)
1797 sctx->sifted_states[str_idx] = NULL;
1798 else
1800 if (candidates)
1802 /* At first, add the nodes which can epsilon transit to a node in
1803 DEST_NODE. */
1804 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1805 if (BE (err != REG_NOERROR, 0))
1806 return err;
1808 /* Then, check the limitations in the current sift_context. */
1809 if (sctx->limits.nelem)
1811 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1812 mctx->bkref_ents, str_idx);
1813 if (BE (err != REG_NOERROR, 0))
1814 return err;
1818 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1819 if (BE (err != REG_NOERROR, 0))
1820 return err;
1823 if (candidates && mctx->state_log[str_idx]->has_backref)
1825 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1826 if (BE (err != REG_NOERROR, 0))
1827 return err;
1829 return REG_NOERROR;
1832 static reg_errcode_t
1833 internal_function __attribute_warn_unused_result__
1834 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1835 const re_node_set *candidates)
1837 reg_errcode_t err = REG_NOERROR;
1838 int i;
1840 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1841 if (BE (err != REG_NOERROR, 0))
1842 return err;
1844 if (!state->inveclosure.alloc)
1846 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1847 if (BE (err != REG_NOERROR, 0))
1848 return REG_ESPACE;
1849 for (i = 0; i < dest_nodes->nelem; i++)
1851 err = re_node_set_merge (&state->inveclosure,
1852 dfa->inveclosures + dest_nodes->elems[i]);
1853 if (BE (err != REG_NOERROR, 0))
1854 return REG_ESPACE;
1857 return re_node_set_add_intersect (dest_nodes, candidates,
1858 &state->inveclosure);
1861 static reg_errcode_t
1862 internal_function
1863 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1864 const re_node_set *candidates)
1866 int ecl_idx;
1867 reg_errcode_t err;
1868 re_node_set *inv_eclosure = dfa->inveclosures + node;
1869 re_node_set except_nodes;
1870 re_node_set_init_empty (&except_nodes);
1871 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1873 int cur_node = inv_eclosure->elems[ecl_idx];
1874 if (cur_node == node)
1875 continue;
1876 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1878 int edst1 = dfa->edests[cur_node].elems[0];
1879 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1880 ? dfa->edests[cur_node].elems[1] : -1);
1881 if ((!re_node_set_contains (inv_eclosure, edst1)
1882 && re_node_set_contains (dest_nodes, edst1))
1883 || (edst2 > 0
1884 && !re_node_set_contains (inv_eclosure, edst2)
1885 && re_node_set_contains (dest_nodes, edst2)))
1887 err = re_node_set_add_intersect (&except_nodes, candidates,
1888 dfa->inveclosures + cur_node);
1889 if (BE (err != REG_NOERROR, 0))
1891 re_node_set_free (&except_nodes);
1892 return err;
1897 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1899 int cur_node = inv_eclosure->elems[ecl_idx];
1900 if (!re_node_set_contains (&except_nodes, cur_node))
1902 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1903 re_node_set_remove_at (dest_nodes, idx);
1906 re_node_set_free (&except_nodes);
1907 return REG_NOERROR;
1910 static int
1911 internal_function
1912 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1913 int dst_node, int dst_idx, int src_node, int src_idx)
1915 const re_dfa_t *const dfa = mctx->dfa;
1916 int lim_idx, src_pos, dst_pos;
1918 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1919 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1920 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1922 int subexp_idx;
1923 struct re_backref_cache_entry *ent;
1924 ent = mctx->bkref_ents + limits->elems[lim_idx];
1925 subexp_idx = dfa->nodes[ent->node].opr.idx;
1927 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1928 subexp_idx, dst_node, dst_idx,
1929 dst_bkref_idx);
1930 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1931 subexp_idx, src_node, src_idx,
1932 src_bkref_idx);
1934 /* In case of:
1935 <src> <dst> ( <subexp> )
1936 ( <subexp> ) <src> <dst>
1937 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1938 if (src_pos == dst_pos)
1939 continue; /* This is unrelated limitation. */
1940 else
1941 return 1;
1943 return 0;
1946 static int
1947 internal_function
1948 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1949 int subexp_idx, int from_node, int bkref_idx)
1951 const re_dfa_t *const dfa = mctx->dfa;
1952 const re_node_set *eclosures = dfa->eclosures + from_node;
1953 int node_idx;
1955 /* Else, we are on the boundary: examine the nodes on the epsilon
1956 closure. */
1957 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1959 int node = eclosures->elems[node_idx];
1960 switch (dfa->nodes[node].type)
1962 case OP_BACK_REF:
1963 if (bkref_idx != -1)
1965 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1968 int dst, cpos;
1970 if (ent->node != node)
1971 continue;
1973 if (subexp_idx < BITSET_WORD_BITS
1974 && !(ent->eps_reachable_subexps_map
1975 & ((bitset_word_t) 1 << subexp_idx)))
1976 continue;
1978 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1979 OP_CLOSE_SUBEXP cases below. But, if the
1980 destination node is the same node as the source
1981 node, don't recurse because it would cause an
1982 infinite loop: a regex that exhibits this behavior
1983 is ()\1*\1* */
1984 dst = dfa->edests[node].elems[0];
1985 if (dst == from_node)
1987 if (boundaries & 1)
1988 return -1;
1989 else /* if (boundaries & 2) */
1990 return 0;
1993 cpos =
1994 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1995 dst, bkref_idx);
1996 if (cpos == -1 /* && (boundaries & 1) */)
1997 return -1;
1998 if (cpos == 0 && (boundaries & 2))
1999 return 0;
2001 if (subexp_idx < BITSET_WORD_BITS)
2002 ent->eps_reachable_subexps_map
2003 &= ~((bitset_word_t) 1 << subexp_idx);
2005 while (ent++->more);
2007 break;
2009 case OP_OPEN_SUBEXP:
2010 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2011 return -1;
2012 break;
2014 case OP_CLOSE_SUBEXP:
2015 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2016 return 0;
2017 break;
2019 default:
2020 break;
2024 return (boundaries & 2) ? 1 : 0;
2027 static int
2028 internal_function
2029 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2030 int subexp_idx, int from_node, int str_idx,
2031 int bkref_idx)
2033 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2034 int boundaries;
2036 /* If we are outside the range of the subexpression, return -1 or 1. */
2037 if (str_idx < lim->subexp_from)
2038 return -1;
2040 if (lim->subexp_to < str_idx)
2041 return 1;
2043 /* If we are within the subexpression, return 0. */
2044 boundaries = (str_idx == lim->subexp_from);
2045 boundaries |= (str_idx == lim->subexp_to) << 1;
2046 if (boundaries == 0)
2047 return 0;
2049 /* Else, examine epsilon closure. */
2050 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2051 from_node, bkref_idx);
2054 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2055 which are against limitations from DEST_NODES. */
2057 static reg_errcode_t
2058 internal_function
2059 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2060 const re_node_set *candidates, re_node_set *limits,
2061 struct re_backref_cache_entry *bkref_ents, int str_idx)
2063 reg_errcode_t err;
2064 int node_idx, lim_idx;
2066 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2068 int subexp_idx;
2069 struct re_backref_cache_entry *ent;
2070 ent = bkref_ents + limits->elems[lim_idx];
2072 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2073 continue; /* This is unrelated limitation. */
2075 subexp_idx = dfa->nodes[ent->node].opr.idx;
2076 if (ent->subexp_to == str_idx)
2078 int ops_node = -1;
2079 int cls_node = -1;
2080 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2082 int node = dest_nodes->elems[node_idx];
2083 re_token_type_t type = dfa->nodes[node].type;
2084 if (type == OP_OPEN_SUBEXP
2085 && subexp_idx == dfa->nodes[node].opr.idx)
2086 ops_node = node;
2087 else if (type == OP_CLOSE_SUBEXP
2088 && subexp_idx == dfa->nodes[node].opr.idx)
2089 cls_node = node;
2092 /* Check the limitation of the open subexpression. */
2093 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2094 if (ops_node >= 0)
2096 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2097 candidates);
2098 if (BE (err != REG_NOERROR, 0))
2099 return err;
2102 /* Check the limitation of the close subexpression. */
2103 if (cls_node >= 0)
2104 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2106 int node = dest_nodes->elems[node_idx];
2107 if (!re_node_set_contains (dfa->inveclosures + node,
2108 cls_node)
2109 && !re_node_set_contains (dfa->eclosures + node,
2110 cls_node))
2112 /* It is against this limitation.
2113 Remove it form the current sifted state. */
2114 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2115 candidates);
2116 if (BE (err != REG_NOERROR, 0))
2117 return err;
2118 --node_idx;
2122 else /* (ent->subexp_to != str_idx) */
2124 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2126 int node = dest_nodes->elems[node_idx];
2127 re_token_type_t type = dfa->nodes[node].type;
2128 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2130 if (subexp_idx != dfa->nodes[node].opr.idx)
2131 continue;
2132 /* It is against this limitation.
2133 Remove it form the current sifted state. */
2134 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2135 candidates);
2136 if (BE (err != REG_NOERROR, 0))
2137 return err;
2142 return REG_NOERROR;
2145 static reg_errcode_t
2146 internal_function __attribute_warn_unused_result__
2147 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2148 int str_idx, const re_node_set *candidates)
2150 const re_dfa_t *const dfa = mctx->dfa;
2151 reg_errcode_t err;
2152 int node_idx, node;
2153 re_sift_context_t local_sctx;
2154 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2156 if (first_idx == -1)
2157 return REG_NOERROR;
2159 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2161 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2163 int enabled_idx;
2164 re_token_type_t type;
2165 struct re_backref_cache_entry *entry;
2166 node = candidates->elems[node_idx];
2167 type = dfa->nodes[node].type;
2168 /* Avoid infinite loop for the REs like "()\1+". */
2169 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2170 continue;
2171 if (type != OP_BACK_REF)
2172 continue;
2174 entry = mctx->bkref_ents + first_idx;
2175 enabled_idx = first_idx;
2178 int subexp_len;
2179 int to_idx;
2180 int dst_node;
2181 int ret;
2182 re_dfastate_t *cur_state;
2184 if (entry->node != node)
2185 continue;
2186 subexp_len = entry->subexp_to - entry->subexp_from;
2187 to_idx = str_idx + subexp_len;
2188 dst_node = (subexp_len ? dfa->nexts[node]
2189 : dfa->edests[node].elems[0]);
2191 if (to_idx > sctx->last_str_idx
2192 || sctx->sifted_states[to_idx] == NULL
2193 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2194 || check_dst_limits (mctx, &sctx->limits, node,
2195 str_idx, dst_node, to_idx))
2196 continue;
2198 if (local_sctx.sifted_states == NULL)
2200 local_sctx = *sctx;
2201 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2202 if (BE (err != REG_NOERROR, 0))
2203 goto free_return;
2205 local_sctx.last_node = node;
2206 local_sctx.last_str_idx = str_idx;
2207 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2208 if (BE (ret < 0, 0))
2210 err = REG_ESPACE;
2211 goto free_return;
2213 cur_state = local_sctx.sifted_states[str_idx];
2214 err = sift_states_backward (mctx, &local_sctx);
2215 if (BE (err != REG_NOERROR, 0))
2216 goto free_return;
2217 if (sctx->limited_states != NULL)
2219 err = merge_state_array (dfa, sctx->limited_states,
2220 local_sctx.sifted_states,
2221 str_idx + 1);
2222 if (BE (err != REG_NOERROR, 0))
2223 goto free_return;
2225 local_sctx.sifted_states[str_idx] = cur_state;
2226 re_node_set_remove (&local_sctx.limits, enabled_idx);
2228 /* mctx->bkref_ents may have changed, reload the pointer. */
2229 entry = mctx->bkref_ents + enabled_idx;
2231 while (enabled_idx++, entry++->more);
2233 err = REG_NOERROR;
2234 free_return:
2235 if (local_sctx.sifted_states != NULL)
2237 re_node_set_free (&local_sctx.limits);
2240 return err;
2244 #ifdef RE_ENABLE_I18N
2245 static int
2246 internal_function
2247 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2248 int node_idx, int str_idx, int max_str_idx)
2250 const re_dfa_t *const dfa = mctx->dfa;
2251 int naccepted;
2252 /* Check the node can accept `multi byte'. */
2253 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2254 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2255 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2256 dfa->nexts[node_idx]))
2257 /* The node can't accept the `multi byte', or the
2258 destination was already thrown away, then the node
2259 could't accept the current input `multi byte'. */
2260 naccepted = 0;
2261 /* Otherwise, it is sure that the node could accept
2262 `naccepted' bytes input. */
2263 return naccepted;
2265 #endif /* RE_ENABLE_I18N */
2268 /* Functions for state transition. */
2270 /* Return the next state to which the current state STATE will transit by
2271 accepting the current input byte, and update STATE_LOG if necessary.
2272 If STATE can accept a multibyte char/collating element/back reference
2273 update the destination of STATE_LOG. */
2275 static re_dfastate_t *
2276 internal_function __attribute_warn_unused_result__
2277 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2278 re_dfastate_t *state)
2280 re_dfastate_t **trtable;
2281 unsigned char ch;
2283 #ifdef RE_ENABLE_I18N
2284 /* If the current state can accept multibyte. */
2285 if (BE (state->accept_mb, 0))
2287 *err = transit_state_mb (mctx, state);
2288 if (BE (*err != REG_NOERROR, 0))
2289 return NULL;
2291 #endif /* RE_ENABLE_I18N */
2293 /* Then decide the next state with the single byte. */
2294 #if 0
2295 if (0)
2296 /* don't use transition table */
2297 return transit_state_sb (err, mctx, state);
2298 #endif
2300 /* Use transition table */
2301 ch = re_string_fetch_byte (&mctx->input);
2302 for (;;)
2304 trtable = state->trtable;
2305 if (BE (trtable != NULL, 1))
2306 return trtable[ch];
2308 trtable = state->word_trtable;
2309 if (BE (trtable != NULL, 1))
2311 unsigned int context;
2312 context
2313 = re_string_context_at (&mctx->input,
2314 re_string_cur_idx (&mctx->input) - 1,
2315 mctx->eflags);
2316 if (IS_WORD_CONTEXT (context))
2317 return trtable[ch + SBC_MAX];
2318 else
2319 return trtable[ch];
2322 if (!build_trtable (mctx->dfa, state))
2324 *err = REG_ESPACE;
2325 return NULL;
2328 /* Retry, we now have a transition table. */
2332 /* Update the state_log if we need */
2333 re_dfastate_t *
2334 internal_function
2335 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2336 re_dfastate_t *next_state)
2338 const re_dfa_t *const dfa = mctx->dfa;
2339 int cur_idx = re_string_cur_idx (&mctx->input);
2341 if (cur_idx > mctx->state_log_top)
2343 mctx->state_log[cur_idx] = next_state;
2344 mctx->state_log_top = cur_idx;
2346 else if (mctx->state_log[cur_idx] == 0)
2348 mctx->state_log[cur_idx] = next_state;
2350 else
2352 re_dfastate_t *pstate;
2353 unsigned int context;
2354 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2355 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2356 the destination of a multibyte char/collating element/
2357 back reference. Then the next state is the union set of
2358 these destinations and the results of the transition table. */
2359 pstate = mctx->state_log[cur_idx];
2360 log_nodes = pstate->entrance_nodes;
2361 if (next_state != NULL)
2363 table_nodes = next_state->entrance_nodes;
2364 *err = re_node_set_init_union (&next_nodes, table_nodes,
2365 log_nodes);
2366 if (BE (*err != REG_NOERROR, 0))
2367 return NULL;
2369 else
2370 next_nodes = *log_nodes;
2371 /* Note: We already add the nodes of the initial state,
2372 then we don't need to add them here. */
2374 context = re_string_context_at (&mctx->input,
2375 re_string_cur_idx (&mctx->input) - 1,
2376 mctx->eflags);
2377 next_state = mctx->state_log[cur_idx]
2378 = re_acquire_state_context (err, dfa, &next_nodes, context);
2379 /* We don't need to check errors here, since the return value of
2380 this function is next_state and ERR is already set. */
2382 if (table_nodes != NULL)
2383 re_node_set_free (&next_nodes);
2386 if (BE (dfa->nbackref, 0) && next_state != NULL)
2388 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2389 later. We must check them here, since the back references in the
2390 next state might use them. */
2391 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2392 cur_idx);
2393 if (BE (*err != REG_NOERROR, 0))
2394 return NULL;
2396 /* If the next state has back references. */
2397 if (next_state->has_backref)
2399 *err = transit_state_bkref (mctx, &next_state->nodes);
2400 if (BE (*err != REG_NOERROR, 0))
2401 return NULL;
2402 next_state = mctx->state_log[cur_idx];
2406 return next_state;
2409 /* Skip bytes in the input that correspond to part of a
2410 multi-byte match, then look in the log for a state
2411 from which to restart matching. */
2412 re_dfastate_t *
2413 internal_function
2414 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2416 re_dfastate_t *cur_state;
2419 int max = mctx->state_log_top;
2420 int cur_str_idx = re_string_cur_idx (&mctx->input);
2424 if (++cur_str_idx > max)
2425 return NULL;
2426 re_string_skip_bytes (&mctx->input, 1);
2428 while (mctx->state_log[cur_str_idx] == NULL);
2430 cur_state = merge_state_with_log (err, mctx, NULL);
2432 while (*err == REG_NOERROR && cur_state == NULL);
2433 return cur_state;
2436 /* Helper functions for transit_state. */
2438 /* From the node set CUR_NODES, pick up the nodes whose types are
2439 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2440 expression. And register them to use them later for evaluating the
2441 correspoding back references. */
2443 static reg_errcode_t
2444 internal_function
2445 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2446 int str_idx)
2448 const re_dfa_t *const dfa = mctx->dfa;
2449 int node_idx;
2450 reg_errcode_t err;
2452 /* TODO: This isn't efficient.
2453 Because there might be more than one nodes whose types are
2454 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2455 nodes.
2456 E.g. RE: (a){2} */
2457 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2459 int node = cur_nodes->elems[node_idx];
2460 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2461 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2462 && (dfa->used_bkref_map
2463 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2465 err = match_ctx_add_subtop (mctx, node, str_idx);
2466 if (BE (err != REG_NOERROR, 0))
2467 return err;
2470 return REG_NOERROR;
2473 #if 0
2474 /* Return the next state to which the current state STATE will transit by
2475 accepting the current input byte. */
2477 static re_dfastate_t *
2478 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2479 re_dfastate_t *state)
2481 const re_dfa_t *const dfa = mctx->dfa;
2482 re_node_set next_nodes;
2483 re_dfastate_t *next_state;
2484 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2485 unsigned int context;
2487 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2488 if (BE (*err != REG_NOERROR, 0))
2489 return NULL;
2490 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2492 int cur_node = state->nodes.elems[node_cnt];
2493 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2495 *err = re_node_set_merge (&next_nodes,
2496 dfa->eclosures + dfa->nexts[cur_node]);
2497 if (BE (*err != REG_NOERROR, 0))
2499 re_node_set_free (&next_nodes);
2500 return NULL;
2504 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2505 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2506 /* We don't need to check errors here, since the return value of
2507 this function is next_state and ERR is already set. */
2509 re_node_set_free (&next_nodes);
2510 re_string_skip_bytes (&mctx->input, 1);
2511 return next_state;
2513 #endif
2515 #ifdef RE_ENABLE_I18N
2516 static reg_errcode_t
2517 internal_function
2518 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2520 const re_dfa_t *const dfa = mctx->dfa;
2521 reg_errcode_t err;
2522 int i;
2524 for (i = 0; i < pstate->nodes.nelem; ++i)
2526 re_node_set dest_nodes, *new_nodes;
2527 int cur_node_idx = pstate->nodes.elems[i];
2528 int naccepted, dest_idx;
2529 unsigned int context;
2530 re_dfastate_t *dest_state;
2532 if (!dfa->nodes[cur_node_idx].accept_mb)
2533 continue;
2535 if (dfa->nodes[cur_node_idx].constraint)
2537 context = re_string_context_at (&mctx->input,
2538 re_string_cur_idx (&mctx->input),
2539 mctx->eflags);
2540 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2541 context))
2542 continue;
2545 /* How many bytes the node can accept? */
2546 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2547 re_string_cur_idx (&mctx->input));
2548 if (naccepted == 0)
2549 continue;
2551 /* The node can accepts `naccepted' bytes. */
2552 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2553 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2554 : mctx->max_mb_elem_len);
2555 err = clean_state_log_if_needed (mctx, dest_idx);
2556 if (BE (err != REG_NOERROR, 0))
2557 return err;
2558 #ifdef DEBUG
2559 assert (dfa->nexts[cur_node_idx] != -1);
2560 #endif
2561 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2563 dest_state = mctx->state_log[dest_idx];
2564 if (dest_state == NULL)
2565 dest_nodes = *new_nodes;
2566 else
2568 err = re_node_set_init_union (&dest_nodes,
2569 dest_state->entrance_nodes, new_nodes);
2570 if (BE (err != REG_NOERROR, 0))
2571 return err;
2573 context = re_string_context_at (&mctx->input, dest_idx - 1,
2574 mctx->eflags);
2575 mctx->state_log[dest_idx]
2576 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2577 if (dest_state != NULL)
2578 re_node_set_free (&dest_nodes);
2579 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2580 return err;
2582 return REG_NOERROR;
2584 #endif /* RE_ENABLE_I18N */
2586 static reg_errcode_t
2587 internal_function
2588 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2590 const re_dfa_t *const dfa = mctx->dfa;
2591 reg_errcode_t err;
2592 int i;
2593 int cur_str_idx = re_string_cur_idx (&mctx->input);
2595 for (i = 0; i < nodes->nelem; ++i)
2597 int dest_str_idx, prev_nelem, bkc_idx;
2598 int node_idx = nodes->elems[i];
2599 unsigned int context;
2600 const re_token_t *node = dfa->nodes + node_idx;
2601 re_node_set *new_dest_nodes;
2603 /* Check whether `node' is a backreference or not. */
2604 if (node->type != OP_BACK_REF)
2605 continue;
2607 if (node->constraint)
2609 context = re_string_context_at (&mctx->input, cur_str_idx,
2610 mctx->eflags);
2611 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2612 continue;
2615 /* `node' is a backreference.
2616 Check the substring which the substring matched. */
2617 bkc_idx = mctx->nbkref_ents;
2618 err = get_subexp (mctx, node_idx, cur_str_idx);
2619 if (BE (err != REG_NOERROR, 0))
2620 goto free_return;
2622 /* And add the epsilon closures (which is `new_dest_nodes') of
2623 the backreference to appropriate state_log. */
2624 #ifdef DEBUG
2625 assert (dfa->nexts[node_idx] != -1);
2626 #endif
2627 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2629 int subexp_len;
2630 re_dfastate_t *dest_state;
2631 struct re_backref_cache_entry *bkref_ent;
2632 bkref_ent = mctx->bkref_ents + bkc_idx;
2633 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2634 continue;
2635 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2636 new_dest_nodes = (subexp_len == 0
2637 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2638 : dfa->eclosures + dfa->nexts[node_idx]);
2639 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2640 - bkref_ent->subexp_from);
2641 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2642 mctx->eflags);
2643 dest_state = mctx->state_log[dest_str_idx];
2644 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2645 : mctx->state_log[cur_str_idx]->nodes.nelem);
2646 /* Add `new_dest_node' to state_log. */
2647 if (dest_state == NULL)
2649 mctx->state_log[dest_str_idx]
2650 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2651 context);
2652 if (BE (mctx->state_log[dest_str_idx] == NULL
2653 && err != REG_NOERROR, 0))
2654 goto free_return;
2656 else
2658 re_node_set dest_nodes;
2659 err = re_node_set_init_union (&dest_nodes,
2660 dest_state->entrance_nodes,
2661 new_dest_nodes);
2662 if (BE (err != REG_NOERROR, 0))
2664 re_node_set_free (&dest_nodes);
2665 goto free_return;
2667 mctx->state_log[dest_str_idx]
2668 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2669 re_node_set_free (&dest_nodes);
2670 if (BE (mctx->state_log[dest_str_idx] == NULL
2671 && err != REG_NOERROR, 0))
2672 goto free_return;
2674 /* We need to check recursively if the backreference can epsilon
2675 transit. */
2676 if (subexp_len == 0
2677 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2679 err = check_subexp_matching_top (mctx, new_dest_nodes,
2680 cur_str_idx);
2681 if (BE (err != REG_NOERROR, 0))
2682 goto free_return;
2683 err = transit_state_bkref (mctx, new_dest_nodes);
2684 if (BE (err != REG_NOERROR, 0))
2685 goto free_return;
2689 err = REG_NOERROR;
2690 free_return:
2691 return err;
2694 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2695 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2696 Note that we might collect inappropriate candidates here.
2697 However, the cost of checking them strictly here is too high, then we
2698 delay these checking for prune_impossible_nodes(). */
2700 static reg_errcode_t
2701 internal_function __attribute_warn_unused_result__
2702 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2704 const re_dfa_t *const dfa = mctx->dfa;
2705 int subexp_num, sub_top_idx;
2706 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2707 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2708 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2709 if (cache_idx != -1)
2711 const struct re_backref_cache_entry *entry
2712 = mctx->bkref_ents + cache_idx;
2714 if (entry->node == bkref_node)
2715 return REG_NOERROR; /* We already checked it. */
2716 while (entry++->more);
2719 subexp_num = dfa->nodes[bkref_node].opr.idx;
2721 /* For each sub expression */
2722 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2724 reg_errcode_t err;
2725 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2726 re_sub_match_last_t *sub_last;
2727 int sub_last_idx, sl_str, bkref_str_off;
2729 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2730 continue; /* It isn't related. */
2732 sl_str = sub_top->str_idx;
2733 bkref_str_off = bkref_str_idx;
2734 /* At first, check the last node of sub expressions we already
2735 evaluated. */
2736 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2738 int sl_str_diff;
2739 sub_last = sub_top->lasts[sub_last_idx];
2740 sl_str_diff = sub_last->str_idx - sl_str;
2741 /* The matched string by the sub expression match with the substring
2742 at the back reference? */
2743 if (sl_str_diff > 0)
2745 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2747 /* Not enough chars for a successful match. */
2748 if (bkref_str_off + sl_str_diff > mctx->input.len)
2749 break;
2751 err = clean_state_log_if_needed (mctx,
2752 bkref_str_off
2753 + sl_str_diff);
2754 if (BE (err != REG_NOERROR, 0))
2755 return err;
2756 buf = (const char *) re_string_get_buffer (&mctx->input);
2758 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2759 /* We don't need to search this sub expression any more. */
2760 break;
2762 bkref_str_off += sl_str_diff;
2763 sl_str += sl_str_diff;
2764 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2765 bkref_str_idx);
2767 /* Reload buf, since the preceding call might have reallocated
2768 the buffer. */
2769 buf = (const char *) re_string_get_buffer (&mctx->input);
2771 if (err == REG_NOMATCH)
2772 continue;
2773 if (BE (err != REG_NOERROR, 0))
2774 return err;
2777 if (sub_last_idx < sub_top->nlasts)
2778 continue;
2779 if (sub_last_idx > 0)
2780 ++sl_str;
2781 /* Then, search for the other last nodes of the sub expression. */
2782 for (; sl_str <= bkref_str_idx; ++sl_str)
2784 int cls_node, sl_str_off;
2785 const re_node_set *nodes;
2786 sl_str_off = sl_str - sub_top->str_idx;
2787 /* The matched string by the sub expression match with the substring
2788 at the back reference? */
2789 if (sl_str_off > 0)
2791 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2793 /* If we are at the end of the input, we cannot match. */
2794 if (bkref_str_off >= mctx->input.len)
2795 break;
2797 err = extend_buffers (mctx, bkref_str_off + 1);
2798 if (BE (err != REG_NOERROR, 0))
2799 return err;
2801 buf = (const char *) re_string_get_buffer (&mctx->input);
2803 if (buf [bkref_str_off++] != buf[sl_str - 1])
2804 break; /* We don't need to search this sub expression
2805 any more. */
2807 if (mctx->state_log[sl_str] == NULL)
2808 continue;
2809 /* Does this state have a ')' of the sub expression? */
2810 nodes = &mctx->state_log[sl_str]->nodes;
2811 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2812 OP_CLOSE_SUBEXP);
2813 if (cls_node == -1)
2814 continue; /* No. */
2815 if (sub_top->path == NULL)
2817 sub_top->path = calloc (sizeof (state_array_t),
2818 sl_str - sub_top->str_idx + 1);
2819 if (sub_top->path == NULL)
2820 return REG_ESPACE;
2822 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2823 in the current context? */
2824 err = check_arrival (mctx, sub_top->path, sub_top->node,
2825 sub_top->str_idx, cls_node, sl_str,
2826 OP_CLOSE_SUBEXP);
2827 if (err == REG_NOMATCH)
2828 continue;
2829 if (BE (err != REG_NOERROR, 0))
2830 return err;
2831 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2832 if (BE (sub_last == NULL, 0))
2833 return REG_ESPACE;
2834 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2835 bkref_str_idx);
2836 if (err == REG_NOMATCH)
2837 continue;
2840 return REG_NOERROR;
2843 /* Helper functions for get_subexp(). */
2845 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2846 If it can arrive, register the sub expression expressed with SUB_TOP
2847 and SUB_LAST. */
2849 static reg_errcode_t
2850 internal_function
2851 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2852 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2854 reg_errcode_t err;
2855 int to_idx;
2856 /* Can the subexpression arrive the back reference? */
2857 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2858 sub_last->str_idx, bkref_node, bkref_str,
2859 OP_OPEN_SUBEXP);
2860 if (err != REG_NOERROR)
2861 return err;
2862 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2863 sub_last->str_idx);
2864 if (BE (err != REG_NOERROR, 0))
2865 return err;
2866 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2867 return clean_state_log_if_needed (mctx, to_idx);
2870 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2871 Search '(' if FL_OPEN, or search ')' otherwise.
2872 TODO: This function isn't efficient...
2873 Because there might be more than one nodes whose types are
2874 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2875 nodes.
2876 E.g. RE: (a){2} */
2878 static int
2879 internal_function
2880 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2881 int subexp_idx, int type)
2883 int cls_idx;
2884 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2886 int cls_node = nodes->elems[cls_idx];
2887 const re_token_t *node = dfa->nodes + cls_node;
2888 if (node->type == type
2889 && node->opr.idx == subexp_idx)
2890 return cls_node;
2892 return -1;
2895 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2896 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2897 heavily reused.
2898 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2900 static reg_errcode_t
2901 internal_function __attribute_warn_unused_result__
2902 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2903 int top_str, int last_node, int last_str, int type)
2905 const re_dfa_t *const dfa = mctx->dfa;
2906 reg_errcode_t err = REG_NOERROR;
2907 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2908 re_dfastate_t *cur_state = NULL;
2909 re_node_set *cur_nodes, next_nodes;
2910 re_dfastate_t **backup_state_log;
2911 unsigned int context;
2913 subexp_num = dfa->nodes[top_node].opr.idx;
2914 /* Extend the buffer if we need. */
2915 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2917 re_dfastate_t **new_array;
2918 int old_alloc = path->alloc;
2919 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2920 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2921 if (BE (new_array == NULL, 0))
2923 path->alloc = old_alloc;
2924 return REG_ESPACE;
2926 path->array = new_array;
2927 memset (new_array + old_alloc, '\0',
2928 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2931 str_idx = path->next_idx ?: top_str;
2933 /* Temporary modify MCTX. */
2934 backup_state_log = mctx->state_log;
2935 backup_cur_idx = mctx->input.cur_idx;
2936 mctx->state_log = path->array;
2937 mctx->input.cur_idx = str_idx;
2939 /* Setup initial node set. */
2940 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941 if (str_idx == top_str)
2943 err = re_node_set_init_1 (&next_nodes, top_node);
2944 if (BE (err != REG_NOERROR, 0))
2945 return err;
2946 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947 if (BE (err != REG_NOERROR, 0))
2949 re_node_set_free (&next_nodes);
2950 return err;
2953 else
2955 cur_state = mctx->state_log[str_idx];
2956 if (cur_state && cur_state->has_backref)
2958 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2959 if (BE (err != REG_NOERROR, 0))
2960 return err;
2962 else
2963 re_node_set_init_empty (&next_nodes);
2965 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2967 if (next_nodes.nelem)
2969 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970 subexp_num, type);
2971 if (BE (err != REG_NOERROR, 0))
2973 re_node_set_free (&next_nodes);
2974 return err;
2977 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2978 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2980 re_node_set_free (&next_nodes);
2981 return err;
2983 mctx->state_log[str_idx] = cur_state;
2986 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2988 re_node_set_empty (&next_nodes);
2989 if (mctx->state_log[str_idx + 1])
2991 err = re_node_set_merge (&next_nodes,
2992 &mctx->state_log[str_idx + 1]->nodes);
2993 if (BE (err != REG_NOERROR, 0))
2995 re_node_set_free (&next_nodes);
2996 return err;
2999 if (cur_state)
3001 err = check_arrival_add_next_nodes (mctx, str_idx,
3002 &cur_state->non_eps_nodes,
3003 &next_nodes);
3004 if (BE (err != REG_NOERROR, 0))
3006 re_node_set_free (&next_nodes);
3007 return err;
3010 ++str_idx;
3011 if (next_nodes.nelem)
3013 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3014 if (BE (err != REG_NOERROR, 0))
3016 re_node_set_free (&next_nodes);
3017 return err;
3019 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3020 subexp_num, type);
3021 if (BE (err != REG_NOERROR, 0))
3023 re_node_set_free (&next_nodes);
3024 return err;
3027 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3028 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3029 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3031 re_node_set_free (&next_nodes);
3032 return err;
3034 mctx->state_log[str_idx] = cur_state;
3035 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3037 re_node_set_free (&next_nodes);
3038 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3039 : &mctx->state_log[last_str]->nodes);
3040 path->next_idx = str_idx;
3042 /* Fix MCTX. */
3043 mctx->state_log = backup_state_log;
3044 mctx->input.cur_idx = backup_cur_idx;
3046 /* Then check the current node set has the node LAST_NODE. */
3047 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3048 return REG_NOERROR;
3050 return REG_NOMATCH;
3053 /* Helper functions for check_arrival. */
3055 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3056 to NEXT_NODES.
3057 TODO: This function is similar to the functions transit_state*(),
3058 however this function has many additional works.
3059 Can't we unify them? */
3061 static reg_errcode_t
3062 internal_function __attribute_warn_unused_result__
3063 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3064 re_node_set *cur_nodes, re_node_set *next_nodes)
3066 const re_dfa_t *const dfa = mctx->dfa;
3067 int result;
3068 int cur_idx;
3069 reg_errcode_t err = REG_NOERROR;
3070 re_node_set union_set;
3071 re_node_set_init_empty (&union_set);
3072 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3074 int naccepted = 0;
3075 int cur_node = cur_nodes->elems[cur_idx];
3076 #ifdef DEBUG
3077 re_token_type_t type = dfa->nodes[cur_node].type;
3078 assert (!IS_EPSILON_NODE (type));
3079 #endif
3080 #ifdef RE_ENABLE_I18N
3081 /* If the node may accept `multi byte'. */
3082 if (dfa->nodes[cur_node].accept_mb)
3084 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3085 str_idx);
3086 if (naccepted > 1)
3088 re_dfastate_t *dest_state;
3089 int next_node = dfa->nexts[cur_node];
3090 int next_idx = str_idx + naccepted;
3091 dest_state = mctx->state_log[next_idx];
3092 re_node_set_empty (&union_set);
3093 if (dest_state)
3095 err = re_node_set_merge (&union_set, &dest_state->nodes);
3096 if (BE (err != REG_NOERROR, 0))
3098 re_node_set_free (&union_set);
3099 return err;
3102 result = re_node_set_insert (&union_set, next_node);
3103 if (BE (result < 0, 0))
3105 re_node_set_free (&union_set);
3106 return REG_ESPACE;
3108 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3109 &union_set);
3110 if (BE (mctx->state_log[next_idx] == NULL
3111 && err != REG_NOERROR, 0))
3113 re_node_set_free (&union_set);
3114 return err;
3118 #endif /* RE_ENABLE_I18N */
3119 if (naccepted
3120 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3122 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3123 if (BE (result < 0, 0))
3125 re_node_set_free (&union_set);
3126 return REG_ESPACE;
3130 re_node_set_free (&union_set);
3131 return REG_NOERROR;
3134 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3135 CUR_NODES, however exclude the nodes which are:
3136 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3137 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3140 static reg_errcode_t
3141 internal_function
3142 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3143 int ex_subexp, int type)
3145 reg_errcode_t err;
3146 int idx, outside_node;
3147 re_node_set new_nodes;
3148 #ifdef DEBUG
3149 assert (cur_nodes->nelem);
3150 #endif
3151 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3152 if (BE (err != REG_NOERROR, 0))
3153 return err;
3154 /* Create a new node set NEW_NODES with the nodes which are epsilon
3155 closures of the node in CUR_NODES. */
3157 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3159 int cur_node = cur_nodes->elems[idx];
3160 const re_node_set *eclosure = dfa->eclosures + cur_node;
3161 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3162 if (outside_node == -1)
3164 /* There are no problematic nodes, just merge them. */
3165 err = re_node_set_merge (&new_nodes, eclosure);
3166 if (BE (err != REG_NOERROR, 0))
3168 re_node_set_free (&new_nodes);
3169 return err;
3172 else
3174 /* There are problematic nodes, re-calculate incrementally. */
3175 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3176 ex_subexp, type);
3177 if (BE (err != REG_NOERROR, 0))
3179 re_node_set_free (&new_nodes);
3180 return err;
3184 re_node_set_free (cur_nodes);
3185 *cur_nodes = new_nodes;
3186 return REG_NOERROR;
3189 /* Helper function for check_arrival_expand_ecl.
3190 Check incrementally the epsilon closure of TARGET, and if it isn't
3191 problematic append it to DST_NODES. */
3193 static reg_errcode_t
3194 internal_function __attribute_warn_unused_result__
3195 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3196 int target, int ex_subexp, int type)
3198 int cur_node;
3199 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3201 int err;
3203 if (dfa->nodes[cur_node].type == type
3204 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3206 if (type == OP_CLOSE_SUBEXP)
3208 err = re_node_set_insert (dst_nodes, cur_node);
3209 if (BE (err == -1, 0))
3210 return REG_ESPACE;
3212 break;
3214 err = re_node_set_insert (dst_nodes, cur_node);
3215 if (BE (err == -1, 0))
3216 return REG_ESPACE;
3217 if (dfa->edests[cur_node].nelem == 0)
3218 break;
3219 if (dfa->edests[cur_node].nelem == 2)
3221 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3222 dfa->edests[cur_node].elems[1],
3223 ex_subexp, type);
3224 if (BE (err != REG_NOERROR, 0))
3225 return err;
3227 cur_node = dfa->edests[cur_node].elems[0];
3229 return REG_NOERROR;
3233 /* For all the back references in the current state, calculate the
3234 destination of the back references by the appropriate entry
3235 in MCTX->BKREF_ENTS. */
3237 static reg_errcode_t
3238 internal_function __attribute_warn_unused_result__
3239 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3240 int cur_str, int subexp_num, int type)
3242 const re_dfa_t *const dfa = mctx->dfa;
3243 reg_errcode_t err;
3244 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3245 struct re_backref_cache_entry *ent;
3247 if (cache_idx_start == -1)
3248 return REG_NOERROR;
3250 restart:
3251 ent = mctx->bkref_ents + cache_idx_start;
3254 int to_idx, next_node;
3256 /* Is this entry ENT is appropriate? */
3257 if (!re_node_set_contains (cur_nodes, ent->node))
3258 continue; /* No. */
3260 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3261 /* Calculate the destination of the back reference, and append it
3262 to MCTX->STATE_LOG. */
3263 if (to_idx == cur_str)
3265 /* The backreference did epsilon transit, we must re-check all the
3266 node in the current state. */
3267 re_node_set new_dests;
3268 reg_errcode_t err2, err3;
3269 next_node = dfa->edests[ent->node].elems[0];
3270 if (re_node_set_contains (cur_nodes, next_node))
3271 continue;
3272 err = re_node_set_init_1 (&new_dests, next_node);
3273 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3274 err3 = re_node_set_merge (cur_nodes, &new_dests);
3275 re_node_set_free (&new_dests);
3276 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3277 || err3 != REG_NOERROR, 0))
3279 err = (err != REG_NOERROR ? err
3280 : (err2 != REG_NOERROR ? err2 : err3));
3281 return err;
3283 /* TODO: It is still inefficient... */
3284 goto restart;
3286 else
3288 re_node_set union_set;
3289 next_node = dfa->nexts[ent->node];
3290 if (mctx->state_log[to_idx])
3292 int ret;
3293 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3294 next_node))
3295 continue;
3296 err = re_node_set_init_copy (&union_set,
3297 &mctx->state_log[to_idx]->nodes);
3298 ret = re_node_set_insert (&union_set, next_node);
3299 if (BE (err != REG_NOERROR || ret < 0, 0))
3301 re_node_set_free (&union_set);
3302 err = err != REG_NOERROR ? err : REG_ESPACE;
3303 return err;
3306 else
3308 err = re_node_set_init_1 (&union_set, next_node);
3309 if (BE (err != REG_NOERROR, 0))
3310 return err;
3312 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3313 re_node_set_free (&union_set);
3314 if (BE (mctx->state_log[to_idx] == NULL
3315 && err != REG_NOERROR, 0))
3316 return err;
3319 while (ent++->more);
3320 return REG_NOERROR;
3323 /* Build transition table for the state.
3324 Return 1 if succeeded, otherwise return NULL. */
3326 static int
3327 internal_function
3328 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3330 reg_errcode_t err;
3331 int i, j, ch, need_word_trtable = 0;
3332 bitset_word_t elem, mask;
3333 bool dests_node_malloced = false;
3334 bool dest_states_malloced = false;
3335 int ndests; /* Number of the destination states from `state'. */
3336 re_dfastate_t **trtable;
3337 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3338 re_node_set follows, *dests_node;
3339 bitset_t *dests_ch;
3340 bitset_t acceptable;
3342 struct dests_alloc
3344 re_node_set dests_node[SBC_MAX];
3345 bitset_t dests_ch[SBC_MAX];
3346 } *dests_alloc;
3348 /* We build DFA states which corresponds to the destination nodes
3349 from `state'. `dests_node[i]' represents the nodes which i-th
3350 destination state contains, and `dests_ch[i]' represents the
3351 characters which i-th destination state accepts. */
3352 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3353 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3354 else
3356 dests_alloc = re_malloc (struct dests_alloc, 1);
3357 if (BE (dests_alloc == NULL, 0))
3358 return 0;
3359 dests_node_malloced = true;
3361 dests_node = dests_alloc->dests_node;
3362 dests_ch = dests_alloc->dests_ch;
3364 /* Initialize transiton table. */
3365 state->word_trtable = state->trtable = NULL;
3367 /* At first, group all nodes belonging to `state' into several
3368 destinations. */
3369 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3370 if (BE (ndests <= 0, 0))
3372 if (dests_node_malloced)
3373 free (dests_alloc);
3374 /* Return 0 in case of an error, 1 otherwise. */
3375 if (ndests == 0)
3377 state->trtable = (re_dfastate_t **)
3378 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3379 if (BE (state->trtable == NULL, 0))
3380 return 0;
3381 return 1;
3383 return 0;
3386 err = re_node_set_alloc (&follows, ndests + 1);
3387 if (BE (err != REG_NOERROR, 0))
3388 goto out_free;
3390 /* Avoid arithmetic overflow in size calculation. */
3391 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3392 / (3 * sizeof (re_dfastate_t *)))
3393 < ndests),
3395 goto out_free;
3397 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3398 + ndests * 3 * sizeof (re_dfastate_t *)))
3399 dest_states = (re_dfastate_t **)
3400 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3401 else
3403 dest_states = (re_dfastate_t **)
3404 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3405 if (BE (dest_states == NULL, 0))
3407 out_free:
3408 if (dest_states_malloced)
3409 free (dest_states);
3410 re_node_set_free (&follows);
3411 for (i = 0; i < ndests; ++i)
3412 re_node_set_free (dests_node + i);
3413 if (dests_node_malloced)
3414 free (dests_alloc);
3415 return 0;
3417 dest_states_malloced = true;
3419 dest_states_word = dest_states + ndests;
3420 dest_states_nl = dest_states_word + ndests;
3421 bitset_empty (acceptable);
3423 /* Then build the states for all destinations. */
3424 for (i = 0; i < ndests; ++i)
3426 int next_node;
3427 re_node_set_empty (&follows);
3428 /* Merge the follows of this destination states. */
3429 for (j = 0; j < dests_node[i].nelem; ++j)
3431 next_node = dfa->nexts[dests_node[i].elems[j]];
3432 if (next_node != -1)
3434 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3435 if (BE (err != REG_NOERROR, 0))
3436 goto out_free;
3439 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3440 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3441 goto out_free;
3442 /* If the new state has context constraint,
3443 build appropriate states for these contexts. */
3444 if (dest_states[i]->has_constraint)
3446 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3447 CONTEXT_WORD);
3448 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3449 goto out_free;
3451 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3452 need_word_trtable = 1;
3454 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3455 CONTEXT_NEWLINE);
3456 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3457 goto out_free;
3459 else
3461 dest_states_word[i] = dest_states[i];
3462 dest_states_nl[i] = dest_states[i];
3464 bitset_merge (acceptable, dests_ch[i]);
3467 if (!BE (need_word_trtable, 0))
3469 /* We don't care about whether the following character is a word
3470 character, or we are in a single-byte character set so we can
3471 discern by looking at the character code: allocate a
3472 256-entry transition table. */
3473 trtable = state->trtable =
3474 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3475 if (BE (trtable == NULL, 0))
3476 goto out_free;
3478 /* For all characters ch...: */
3479 for (i = 0; i < BITSET_WORDS; ++i)
3480 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3481 elem;
3482 mask <<= 1, elem >>= 1, ++ch)
3483 if (BE (elem & 1, 0))
3485 /* There must be exactly one destination which accepts
3486 character ch. See group_nodes_into_DFAstates. */
3487 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3490 /* j-th destination accepts the word character ch. */
3491 if (dfa->word_char[i] & mask)
3492 trtable[ch] = dest_states_word[j];
3493 else
3494 trtable[ch] = dest_states[j];
3497 else
3499 /* We care about whether the following character is a word
3500 character, and we are in a multi-byte character set: discern
3501 by looking at the character code: build two 256-entry
3502 transition tables, one starting at trtable[0] and one
3503 starting at trtable[SBC_MAX]. */
3504 trtable = state->word_trtable =
3505 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3506 if (BE (trtable == NULL, 0))
3507 goto out_free;
3509 /* For all characters ch...: */
3510 for (i = 0; i < BITSET_WORDS; ++i)
3511 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3512 elem;
3513 mask <<= 1, elem >>= 1, ++ch)
3514 if (BE (elem & 1, 0))
3516 /* There must be exactly one destination which accepts
3517 character ch. See group_nodes_into_DFAstates. */
3518 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3521 /* j-th destination accepts the word character ch. */
3522 trtable[ch] = dest_states[j];
3523 trtable[ch + SBC_MAX] = dest_states_word[j];
3527 /* new line */
3528 if (bitset_contain (acceptable, NEWLINE_CHAR))
3530 /* The current state accepts newline character. */
3531 for (j = 0; j < ndests; ++j)
3532 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3534 /* k-th destination accepts newline character. */
3535 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3536 if (need_word_trtable)
3537 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3538 /* There must be only one destination which accepts
3539 newline. See group_nodes_into_DFAstates. */
3540 break;
3544 if (dest_states_malloced)
3545 free (dest_states);
3547 re_node_set_free (&follows);
3548 for (i = 0; i < ndests; ++i)
3549 re_node_set_free (dests_node + i);
3551 if (dests_node_malloced)
3552 free (dests_alloc);
3554 return 1;
3557 /* Group all nodes belonging to STATE into several destinations.
3558 Then for all destinations, set the nodes belonging to the destination
3559 to DESTS_NODE[i] and set the characters accepted by the destination
3560 to DEST_CH[i]. This function return the number of destinations. */
3562 static int
3563 internal_function
3564 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3565 re_node_set *dests_node, bitset_t *dests_ch)
3567 reg_errcode_t err;
3568 int result;
3569 int i, j, k;
3570 int ndests; /* Number of the destinations from `state'. */
3571 bitset_t accepts; /* Characters a node can accept. */
3572 const re_node_set *cur_nodes = &state->nodes;
3573 bitset_empty (accepts);
3574 ndests = 0;
3576 /* For all the nodes belonging to `state', */
3577 for (i = 0; i < cur_nodes->nelem; ++i)
3579 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3580 re_token_type_t type = node->type;
3581 unsigned int constraint = node->constraint;
3583 /* Enumerate all single byte character this node can accept. */
3584 if (type == CHARACTER)
3585 bitset_set (accepts, node->opr.c);
3586 else if (type == SIMPLE_BRACKET)
3588 bitset_merge (accepts, node->opr.sbcset);
3590 else if (type == OP_PERIOD)
3592 #ifdef RE_ENABLE_I18N
3593 if (dfa->mb_cur_max > 1)
3594 bitset_merge (accepts, dfa->sb_char);
3595 else
3596 #endif
3597 bitset_set_all (accepts);
3598 if (!(dfa->syntax & RE_DOT_NEWLINE))
3599 bitset_clear (accepts, '\n');
3600 if (dfa->syntax & RE_DOT_NOT_NULL)
3601 bitset_clear (accepts, '\0');
3603 #ifdef RE_ENABLE_I18N
3604 else if (type == OP_UTF8_PERIOD)
3606 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3607 if (!(dfa->syntax & RE_DOT_NEWLINE))
3608 bitset_clear (accepts, '\n');
3609 if (dfa->syntax & RE_DOT_NOT_NULL)
3610 bitset_clear (accepts, '\0');
3612 #endif
3613 else
3614 continue;
3616 /* Check the `accepts' and sift the characters which are not
3617 match it the context. */
3618 if (constraint)
3620 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3622 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3623 bitset_empty (accepts);
3624 if (accepts_newline)
3625 bitset_set (accepts, NEWLINE_CHAR);
3626 else
3627 continue;
3629 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3631 bitset_empty (accepts);
3632 continue;
3635 if (constraint & NEXT_WORD_CONSTRAINT)
3637 bitset_word_t any_set = 0;
3638 if (type == CHARACTER && !node->word_char)
3640 bitset_empty (accepts);
3641 continue;
3643 #ifdef RE_ENABLE_I18N
3644 if (dfa->mb_cur_max > 1)
3645 for (j = 0; j < BITSET_WORDS; ++j)
3646 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3647 else
3648 #endif
3649 for (j = 0; j < BITSET_WORDS; ++j)
3650 any_set |= (accepts[j] &= dfa->word_char[j]);
3651 if (!any_set)
3652 continue;
3654 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3656 bitset_word_t any_set = 0;
3657 if (type == CHARACTER && node->word_char)
3659 bitset_empty (accepts);
3660 continue;
3662 #ifdef RE_ENABLE_I18N
3663 if (dfa->mb_cur_max > 1)
3664 for (j = 0; j < BITSET_WORDS; ++j)
3665 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3666 else
3667 #endif
3668 for (j = 0; j < BITSET_WORDS; ++j)
3669 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3670 if (!any_set)
3671 continue;
3675 /* Then divide `accepts' into DFA states, or create a new
3676 state. Above, we make sure that accepts is not empty. */
3677 for (j = 0; j < ndests; ++j)
3679 bitset_t intersec; /* Intersection sets, see below. */
3680 bitset_t remains;
3681 /* Flags, see below. */
3682 bitset_word_t has_intersec, not_subset, not_consumed;
3684 /* Optimization, skip if this state doesn't accept the character. */
3685 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3686 continue;
3688 /* Enumerate the intersection set of this state and `accepts'. */
3689 has_intersec = 0;
3690 for (k = 0; k < BITSET_WORDS; ++k)
3691 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3692 /* And skip if the intersection set is empty. */
3693 if (!has_intersec)
3694 continue;
3696 /* Then check if this state is a subset of `accepts'. */
3697 not_subset = not_consumed = 0;
3698 for (k = 0; k < BITSET_WORDS; ++k)
3700 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3701 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3704 /* If this state isn't a subset of `accepts', create a
3705 new group state, which has the `remains'. */
3706 if (not_subset)
3708 bitset_copy (dests_ch[ndests], remains);
3709 bitset_copy (dests_ch[j], intersec);
3710 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3711 if (BE (err != REG_NOERROR, 0))
3712 goto error_return;
3713 ++ndests;
3716 /* Put the position in the current group. */
3717 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3718 if (BE (result < 0, 0))
3719 goto error_return;
3721 /* If all characters are consumed, go to next node. */
3722 if (!not_consumed)
3723 break;
3725 /* Some characters remain, create a new group. */
3726 if (j == ndests)
3728 bitset_copy (dests_ch[ndests], accepts);
3729 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3730 if (BE (err != REG_NOERROR, 0))
3731 goto error_return;
3732 ++ndests;
3733 bitset_empty (accepts);
3736 return ndests;
3737 error_return:
3738 for (j = 0; j < ndests; ++j)
3739 re_node_set_free (dests_node + j);
3740 return -1;
3743 #ifdef RE_ENABLE_I18N
3744 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3745 Return the number of the bytes the node accepts.
3746 STR_IDX is the current index of the input string.
3748 This function handles the nodes which can accept one character, or
3749 one collating element like '.', '[a-z]', opposite to the other nodes
3750 can only accept one byte. */
3752 static int
3753 internal_function
3754 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3755 const re_string_t *input, int str_idx)
3757 const re_token_t *node = dfa->nodes + node_idx;
3758 int char_len, elem_len;
3759 int i;
3761 if (BE (node->type == OP_UTF8_PERIOD, 0))
3763 unsigned char c = re_string_byte_at (input, str_idx), d;
3764 if (BE (c < 0xc2, 1))
3765 return 0;
3767 if (str_idx + 2 > input->len)
3768 return 0;
3770 d = re_string_byte_at (input, str_idx + 1);
3771 if (c < 0xe0)
3772 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3773 else if (c < 0xf0)
3775 char_len = 3;
3776 if (c == 0xe0 && d < 0xa0)
3777 return 0;
3779 else if (c < 0xf8)
3781 char_len = 4;
3782 if (c == 0xf0 && d < 0x90)
3783 return 0;
3785 else if (c < 0xfc)
3787 char_len = 5;
3788 if (c == 0xf8 && d < 0x88)
3789 return 0;
3791 else if (c < 0xfe)
3793 char_len = 6;
3794 if (c == 0xfc && d < 0x84)
3795 return 0;
3797 else
3798 return 0;
3800 if (str_idx + char_len > input->len)
3801 return 0;
3803 for (i = 1; i < char_len; ++i)
3805 d = re_string_byte_at (input, str_idx + i);
3806 if (d < 0x80 || d > 0xbf)
3807 return 0;
3809 return char_len;
3812 char_len = re_string_char_size_at (input, str_idx);
3813 if (node->type == OP_PERIOD)
3815 if (char_len <= 1)
3816 return 0;
3817 /* FIXME: I don't think this if is needed, as both '\n'
3818 and '\0' are char_len == 1. */
3819 /* '.' accepts any one character except the following two cases. */
3820 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3821 re_string_byte_at (input, str_idx) == '\n') ||
3822 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3823 re_string_byte_at (input, str_idx) == '\0'))
3824 return 0;
3825 return char_len;
3828 elem_len = re_string_elem_size_at (input, str_idx);
3829 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3830 return 0;
3832 if (node->type == COMPLEX_BRACKET)
3834 const re_charset_t *cset = node->opr.mbcset;
3835 # ifdef _LIBC
3836 const unsigned char *pin
3837 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3838 int j;
3839 uint32_t nrules;
3840 # endif /* _LIBC */
3841 int match_len = 0;
3842 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3843 ? re_string_wchar_at (input, str_idx) : 0);
3845 /* match with multibyte character? */
3846 for (i = 0; i < cset->nmbchars; ++i)
3847 if (wc == cset->mbchars[i])
3849 match_len = char_len;
3850 goto check_node_accept_bytes_match;
3852 /* match with character_class? */
3853 for (i = 0; i < cset->nchar_classes; ++i)
3855 wctype_t wt = cset->char_classes[i];
3856 if (__iswctype (wc, wt))
3858 match_len = char_len;
3859 goto check_node_accept_bytes_match;
3863 # ifdef _LIBC
3864 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3865 if (nrules != 0)
3867 unsigned int in_collseq = 0;
3868 const int32_t *table, *indirect;
3869 const unsigned char *weights, *extra;
3870 const char *collseqwc;
3871 /* This #include defines a local function! */
3872 # include <locale/weight.h>
3874 /* match with collating_symbol? */
3875 if (cset->ncoll_syms)
3876 extra = (const unsigned char *)
3877 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3878 for (i = 0; i < cset->ncoll_syms; ++i)
3880 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3881 /* Compare the length of input collating element and
3882 the length of current collating element. */
3883 if (*coll_sym != elem_len)
3884 continue;
3885 /* Compare each bytes. */
3886 for (j = 0; j < *coll_sym; j++)
3887 if (pin[j] != coll_sym[1 + j])
3888 break;
3889 if (j == *coll_sym)
3891 /* Match if every bytes is equal. */
3892 match_len = j;
3893 goto check_node_accept_bytes_match;
3897 if (cset->nranges)
3899 if (elem_len <= char_len)
3901 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3902 in_collseq = __collseq_table_lookup (collseqwc, wc);
3904 else
3905 in_collseq = find_collation_sequence_value (pin, elem_len);
3907 /* match with range expression? */
3908 for (i = 0; i < cset->nranges; ++i)
3909 if (cset->range_starts[i] <= in_collseq
3910 && in_collseq <= cset->range_ends[i])
3912 match_len = elem_len;
3913 goto check_node_accept_bytes_match;
3916 /* match with equivalence_class? */
3917 if (cset->nequiv_classes)
3919 const unsigned char *cp = pin;
3920 table = (const int32_t *)
3921 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3922 weights = (const unsigned char *)
3923 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3924 extra = (const unsigned char *)
3925 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3926 indirect = (const int32_t *)
3927 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3928 int32_t idx = findidx (&cp, elem_len);
3929 if (idx > 0)
3930 for (i = 0; i < cset->nequiv_classes; ++i)
3932 int32_t equiv_class_idx = cset->equiv_classes[i];
3933 size_t weight_len = weights[idx & 0xffffff];
3934 if (weight_len == weights[equiv_class_idx & 0xffffff]
3935 && (idx >> 24) == (equiv_class_idx >> 24))
3937 int cnt = 0;
3939 idx &= 0xffffff;
3940 equiv_class_idx &= 0xffffff;
3942 while (cnt <= weight_len
3943 && (weights[equiv_class_idx + 1 + cnt]
3944 == weights[idx + 1 + cnt]))
3945 ++cnt;
3946 if (cnt > weight_len)
3948 match_len = elem_len;
3949 goto check_node_accept_bytes_match;
3955 else
3956 # endif /* _LIBC */
3958 /* match with range expression? */
3959 #if __GNUC__ >= 2
3960 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3961 #else
3962 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3963 cmp_buf[2] = wc;
3964 #endif
3965 for (i = 0; i < cset->nranges; ++i)
3967 cmp_buf[0] = cset->range_starts[i];
3968 cmp_buf[4] = cset->range_ends[i];
3969 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3970 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3972 match_len = char_len;
3973 goto check_node_accept_bytes_match;
3977 check_node_accept_bytes_match:
3978 if (!cset->non_match)
3979 return match_len;
3980 else
3982 if (match_len > 0)
3983 return 0;
3984 else
3985 return (elem_len > char_len) ? elem_len : char_len;
3988 return 0;
3991 # ifdef _LIBC
3992 static unsigned int
3993 internal_function
3994 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3996 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3997 if (nrules == 0)
3999 if (mbs_len == 1)
4001 /* No valid character. Match it as a single byte character. */
4002 const unsigned char *collseq = (const unsigned char *)
4003 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4004 return collseq[mbs[0]];
4006 return UINT_MAX;
4008 else
4010 int32_t idx;
4011 const unsigned char *extra = (const unsigned char *)
4012 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4013 int32_t extrasize = (const unsigned char *)
4014 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4016 for (idx = 0; idx < extrasize;)
4018 int mbs_cnt, found = 0;
4019 int32_t elem_mbs_len;
4020 /* Skip the name of collating element name. */
4021 idx = idx + extra[idx] + 1;
4022 elem_mbs_len = extra[idx++];
4023 if (mbs_len == elem_mbs_len)
4025 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4026 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4027 break;
4028 if (mbs_cnt == elem_mbs_len)
4029 /* Found the entry. */
4030 found = 1;
4032 /* Skip the byte sequence of the collating element. */
4033 idx += elem_mbs_len;
4034 /* Adjust for the alignment. */
4035 idx = (idx + 3) & ~3;
4036 /* Skip the collation sequence value. */
4037 idx += sizeof (uint32_t);
4038 /* Skip the wide char sequence of the collating element. */
4039 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4040 /* If we found the entry, return the sequence value. */
4041 if (found)
4042 return *(uint32_t *) (extra + idx);
4043 /* Skip the collation sequence value. */
4044 idx += sizeof (uint32_t);
4046 return UINT_MAX;
4049 # endif /* _LIBC */
4050 #endif /* RE_ENABLE_I18N */
4052 /* Check whether the node accepts the byte which is IDX-th
4053 byte of the INPUT. */
4055 static int
4056 internal_function
4057 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4058 int idx)
4060 unsigned char ch;
4061 ch = re_string_byte_at (&mctx->input, idx);
4062 switch (node->type)
4064 case CHARACTER:
4065 if (node->opr.c != ch)
4066 return 0;
4067 break;
4069 case SIMPLE_BRACKET:
4070 if (!bitset_contain (node->opr.sbcset, ch))
4071 return 0;
4072 break;
4074 #ifdef RE_ENABLE_I18N
4075 case OP_UTF8_PERIOD:
4076 if (ch >= 0x80)
4077 return 0;
4078 /* FALLTHROUGH */
4079 #endif
4080 case OP_PERIOD:
4081 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4082 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4083 return 0;
4084 break;
4086 default:
4087 return 0;
4090 if (node->constraint)
4092 /* The node has constraints. Check whether the current context
4093 satisfies the constraints. */
4094 unsigned int context = re_string_context_at (&mctx->input, idx,
4095 mctx->eflags);
4096 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4097 return 0;
4100 return 1;
4103 /* Extend the buffers, if the buffers have run out. */
4105 static reg_errcode_t
4106 internal_function __attribute_warn_unused_result__
4107 extend_buffers (re_match_context_t *mctx, int min_len)
4109 reg_errcode_t ret;
4110 re_string_t *pstr = &mctx->input;
4112 /* Avoid overflow. */
4113 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4114 return REG_ESPACE;
4116 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4117 ret = re_string_realloc_buffers (pstr,
4118 MAX (min_len,
4119 MIN (pstr->len, pstr->bufs_len * 2)));
4120 if (BE (ret != REG_NOERROR, 0))
4121 return ret;
4123 if (mctx->state_log != NULL)
4125 /* And double the length of state_log. */
4126 /* XXX We have no indication of the size of this buffer. If this
4127 allocation fail we have no indication that the state_log array
4128 does not have the right size. */
4129 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4130 pstr->bufs_len + 1);
4131 if (BE (new_array == NULL, 0))
4132 return REG_ESPACE;
4133 mctx->state_log = new_array;
4136 /* Then reconstruct the buffers. */
4137 if (pstr->icase)
4139 #ifdef RE_ENABLE_I18N
4140 if (pstr->mb_cur_max > 1)
4142 ret = build_wcs_upper_buffer (pstr);
4143 if (BE (ret != REG_NOERROR, 0))
4144 return ret;
4146 else
4147 #endif /* RE_ENABLE_I18N */
4148 build_upper_buffer (pstr);
4150 else
4152 #ifdef RE_ENABLE_I18N
4153 if (pstr->mb_cur_max > 1)
4154 build_wcs_buffer (pstr);
4155 else
4156 #endif /* RE_ENABLE_I18N */
4158 if (pstr->trans != NULL)
4159 re_string_translate_buffer (pstr);
4162 return REG_NOERROR;
4166 /* Functions for matching context. */
4168 /* Initialize MCTX. */
4170 static reg_errcode_t
4171 internal_function __attribute_warn_unused_result__
4172 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4174 mctx->eflags = eflags;
4175 mctx->match_last = -1;
4176 if (n > 0)
4178 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4179 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4180 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4181 return REG_ESPACE;
4183 /* Already zero-ed by the caller.
4184 else
4185 mctx->bkref_ents = NULL;
4186 mctx->nbkref_ents = 0;
4187 mctx->nsub_tops = 0; */
4188 mctx->abkref_ents = n;
4189 mctx->max_mb_elem_len = 1;
4190 mctx->asub_tops = n;
4191 return REG_NOERROR;
4194 /* Clean the entries which depend on the current input in MCTX.
4195 This function must be invoked when the matcher changes the start index
4196 of the input, or changes the input string. */
4198 static void
4199 internal_function
4200 match_ctx_clean (re_match_context_t *mctx)
4202 int st_idx;
4203 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4205 int sl_idx;
4206 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4207 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4209 re_sub_match_last_t *last = top->lasts[sl_idx];
4210 re_free (last->path.array);
4211 re_free (last);
4213 re_free (top->lasts);
4214 if (top->path)
4216 re_free (top->path->array);
4217 re_free (top->path);
4219 free (top);
4222 mctx->nsub_tops = 0;
4223 mctx->nbkref_ents = 0;
4226 /* Free all the memory associated with MCTX. */
4228 static void
4229 internal_function
4230 match_ctx_free (re_match_context_t *mctx)
4232 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4233 match_ctx_clean (mctx);
4234 re_free (mctx->sub_tops);
4235 re_free (mctx->bkref_ents);
4238 /* Add a new backreference entry to MCTX.
4239 Note that we assume that caller never call this function with duplicate
4240 entry, and call with STR_IDX which isn't smaller than any existing entry.
4243 static reg_errcode_t
4244 internal_function __attribute_warn_unused_result__
4245 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4246 int to)
4248 if (mctx->nbkref_ents >= mctx->abkref_ents)
4250 struct re_backref_cache_entry* new_entry;
4251 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4252 mctx->abkref_ents * 2);
4253 if (BE (new_entry == NULL, 0))
4255 re_free (mctx->bkref_ents);
4256 return REG_ESPACE;
4258 mctx->bkref_ents = new_entry;
4259 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4260 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4261 mctx->abkref_ents *= 2;
4263 if (mctx->nbkref_ents > 0
4264 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4265 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4267 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4268 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4269 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4270 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4272 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4273 If bit N is clear, means that this entry won't epsilon-transition to
4274 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4275 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4276 such node.
4278 A backreference does not epsilon-transition unless it is empty, so set
4279 to all zeros if FROM != TO. */
4280 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4281 = (from == to ? ~0 : 0);
4283 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4284 if (mctx->max_mb_elem_len < to - from)
4285 mctx->max_mb_elem_len = to - from;
4286 return REG_NOERROR;
4289 /* Search for the first entry which has the same str_idx, or -1 if none is
4290 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4292 static int
4293 internal_function
4294 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4296 int left, right, mid, last;
4297 last = right = mctx->nbkref_ents;
4298 for (left = 0; left < right;)
4300 mid = (left + right) / 2;
4301 if (mctx->bkref_ents[mid].str_idx < str_idx)
4302 left = mid + 1;
4303 else
4304 right = mid;
4306 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4307 return left;
4308 else
4309 return -1;
4312 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4313 at STR_IDX. */
4315 static reg_errcode_t
4316 internal_function __attribute_warn_unused_result__
4317 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4319 #ifdef DEBUG
4320 assert (mctx->sub_tops != NULL);
4321 assert (mctx->asub_tops > 0);
4322 #endif
4323 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4325 int new_asub_tops = mctx->asub_tops * 2;
4326 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4327 re_sub_match_top_t *,
4328 new_asub_tops);
4329 if (BE (new_array == NULL, 0))
4330 return REG_ESPACE;
4331 mctx->sub_tops = new_array;
4332 mctx->asub_tops = new_asub_tops;
4334 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4335 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4336 return REG_ESPACE;
4337 mctx->sub_tops[mctx->nsub_tops]->node = node;
4338 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4339 return REG_NOERROR;
4342 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4343 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4345 static re_sub_match_last_t *
4346 internal_function
4347 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4349 re_sub_match_last_t *new_entry;
4350 if (BE (subtop->nlasts == subtop->alasts, 0))
4352 int new_alasts = 2 * subtop->alasts + 1;
4353 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4354 re_sub_match_last_t *,
4355 new_alasts);
4356 if (BE (new_array == NULL, 0))
4357 return NULL;
4358 subtop->lasts = new_array;
4359 subtop->alasts = new_alasts;
4361 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4362 if (BE (new_entry != NULL, 1))
4364 subtop->lasts[subtop->nlasts] = new_entry;
4365 new_entry->node = node;
4366 new_entry->str_idx = str_idx;
4367 ++subtop->nlasts;
4369 return new_entry;
4372 static void
4373 internal_function
4374 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4375 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4377 sctx->sifted_states = sifted_sts;
4378 sctx->limited_states = limited_sts;
4379 sctx->last_node = last_node;
4380 sctx->last_str_idx = last_str_idx;
4381 re_node_set_init_empty (&sctx->limits);