Support inline syscall with six arguments.
[glibc/pb-stable.git] / posix / regexec.c
blobf7e0d7f062be6da3e43e0f10b240243bbe594978
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #if defined HAVE_WCHAR_H || defined _LIBC
28 # include <wchar.h>
29 #endif /* HAVE_WCHAR_H || _LIBC */
30 #if defined HAVE_WCTYPE_H || defined _LIBC
31 # include <wctype.h>
32 #endif /* HAVE_WCTYPE_H || _LIBC */
34 #ifdef _LIBC
35 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
36 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
37 # include <locale/localeinfo.h>
38 # include <locale/elem-hash.h>
39 # include <locale/coll-lookup.h>
40 # endif
41 #endif
43 #include "regex.h"
44 #include "regex_internal.h"
46 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
47 re_string_t *input, int n);
48 static void match_ctx_free (re_match_context_t *cache);
49 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
50 int str_idx, int from, int to);
51 static void match_ctx_clear_flag (re_match_context_t *mctx);
52 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
53 re_dfastate_t **limited_sts, int last_node,
54 int last_str_idx, int check_subexp);
55 static reg_errcode_t re_search_internal (const regex_t *preg,
56 const char *string, int length,
57 int start, int range, int stop,
58 size_t nmatch, regmatch_t pmatch[],
59 int eflags);
60 static int re_search_2_stub (struct re_pattern_buffer *bufp,
61 const char *string1, int length1,
62 const char *string2, int length2,
63 int start, int range, struct re_registers *regs,
64 int stop, int ret_len);
65 static int re_search_stub (struct re_pattern_buffer *bufp,
66 const char *string, int length, int start,
67 int range, int stop, struct re_registers *regs,
68 int ret_len);
69 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
70 int nregs, int regs_allocated);
71 static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
72 const regex_t *preg,
73 const re_match_context_t *mctx,
74 int idx);
75 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
76 int fl_search, int fl_longest_match);
77 static int check_halt_node_context (const re_dfa_t *dfa, int node,
78 unsigned int context);
79 static int check_halt_state_context (const regex_t *preg,
80 const re_dfastate_t *state,
81 const re_match_context_t *mctx, int idx);
82 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
83 int cur_idx, int nmatch);
84 static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
85 const re_match_context_t *mctx,
86 int *pidx, int node, re_node_set *eps_via_nodes,
87 struct re_fail_stack_t *fs);
88 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
89 int str_idx, int *dests, int nregs,
90 regmatch_t *regs,
91 re_node_set *eps_via_nodes);
92 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
93 regmatch_t *regs, re_node_set *eps_via_nodes);
94 static reg_errcode_t set_regs (const regex_t *preg,
95 const re_match_context_t *mctx,
96 size_t nmatch, regmatch_t *pmatch,
97 int fl_backtrack);
98 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
100 #ifdef RE_ENABLE_I18N
101 static int sift_states_iter_mb (const regex_t *preg,
102 const re_match_context_t *mctx,
103 re_sift_context_t *sctx,
104 int node_idx, int str_idx, int max_str_idx);
105 #endif /* RE_ENABLE_I18N */
106 static reg_errcode_t sift_states_backward (const regex_t *preg,
107 re_match_context_t *mctx,
108 re_sift_context_t *sctx);
109 static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
110 re_match_context_t *mctx,
111 re_sift_context_t *sctx,
112 int str_idx,
113 re_node_set *dest_nodes);
114 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
115 re_node_set *dest_nodes,
116 const re_node_set *candidates);
117 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
118 re_node_set *dest_nodes,
119 const re_node_set *and_nodes);
120 static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
121 re_match_context_t *mctx, int dst_node,
122 int dst_idx, int src_node, int src_idx);
123 static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
124 int limit, re_node_set *eclosures,
125 int subexp_idx, int node, int str_idx);
126 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
127 re_node_set *dest_nodes,
128 const re_node_set *candidates,
129 re_node_set *limits,
130 struct re_backref_cache_entry *bkref_ents,
131 int str_idx);
132 static reg_errcode_t search_subexp (const regex_t *preg,
133 re_match_context_t *mctx,
134 re_sift_context_t *sctx, int str_idx,
135 re_node_set *dest_nodes);
136 static reg_errcode_t sift_states_bkref (const regex_t *preg,
137 re_match_context_t *mctx,
138 re_sift_context_t *sctx,
139 int str_idx, re_node_set *dest_nodes);
140 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
141 int next_state_log_idx);
142 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
143 re_dfastate_t **src, int num);
144 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
145 re_match_context_t *mctx,
146 re_dfastate_t *state, int fl_search);
147 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
148 re_dfastate_t *pstate,
149 int fl_search,
150 re_match_context_t *mctx);
151 #ifdef RE_ENABLE_I18N
152 static reg_errcode_t transit_state_mb (const regex_t *preg,
153 re_dfastate_t *pstate,
154 re_match_context_t *mctx);
155 #endif /* RE_ENABLE_I18N */
156 static reg_errcode_t transit_state_bkref (const regex_t *preg,
157 re_dfastate_t *pstate,
158 re_match_context_t *mctx);
159 static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
160 re_node_set *nodes,
161 re_dfastate_t **work_state_log,
162 re_match_context_t *mctx);
163 static re_dfastate_t **build_trtable (const regex_t *dfa,
164 const re_dfastate_t *state,
165 int fl_search);
166 #ifdef RE_ENABLE_I18N
167 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
168 const re_string_t *input, int idx);
169 # ifdef _LIBC
170 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len);
172 # endif /* _LIBC */
173 #endif /* RE_ENABLE_I18N */
174 static int group_nodes_into_DFAstates (const regex_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch);
178 static int check_node_accept (const regex_t *preg, const re_token_t *node,
179 const re_match_context_t *mctx, int idx);
180 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
182 /* Entry point for POSIX code. */
184 /* regexec searches for a given pattern, specified by PREG, in the
185 string STRING.
187 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
188 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
189 least NMATCH elements, and we set them to the offsets of the
190 corresponding matched substrings.
192 EFLAGS specifies `execution flags' which affect matching: if
193 REG_NOTBOL is set, then ^ does not match at the beginning of the
194 string; if REG_NOTEOL is set, then $ does not match at the end.
196 We return 0 if we find a match and REG_NOMATCH if not. */
199 regexec (preg, string, nmatch, pmatch, eflags)
200 const regex_t *__restrict preg;
201 const char *__restrict string;
202 size_t nmatch;
203 regmatch_t pmatch[];
204 int eflags;
206 reg_errcode_t err;
207 int length = strlen (string);
208 if (preg->no_sub)
209 err = re_search_internal (preg, string, length, 0, length, length, 0,
210 NULL, eflags);
211 else
212 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
213 pmatch, eflags);
214 return err != REG_NOERROR;
216 #ifdef _LIBC
217 weak_alias (__regexec, regexec)
218 #endif
220 /* Entry points for GNU code. */
222 /* re_match, re_search, re_match_2, re_search_2
224 The former two functions operate on STRING with length LENGTH,
225 while the later two operate on concatenation of STRING1 and STRING2
226 with lengths LENGTH1 and LENGTH2, respectively.
228 re_match() matches the compiled pattern in BUFP against the string,
229 starting at index START.
231 re_search() first tries matching at index START, then it tries to match
232 starting from index START + 1, and so on. The last start position tried
233 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
234 way as re_match().)
236 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
237 the first STOP characters of the concatenation of the strings should be
238 concerned.
240 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
241 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
242 computed relative to the concatenation, not relative to the individual
243 strings.)
245 On success, re_match* functions return the length of the match, re_search*
246 return the position of the start of the match. Return value -1 means no
247 match was found and -2 indicates an internal error. */
250 re_match (bufp, string, length, start, regs)
251 struct re_pattern_buffer *bufp;
252 const char *string;
253 int length, start;
254 struct re_registers *regs;
256 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
258 #ifdef _LIBC
259 weak_alias (__re_match, re_match)
260 #endif
263 re_search (bufp, string, length, start, range, regs)
264 struct re_pattern_buffer *bufp;
265 const char *string;
266 int length, start, range;
267 struct re_registers *regs;
269 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
271 #ifdef _LIBC
272 weak_alias (__re_search, re_search)
273 #endif
276 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
277 struct re_pattern_buffer *bufp;
278 const char *string1, *string2;
279 int length1, length2, start, stop;
280 struct re_registers *regs;
282 return re_search_2_stub (bufp, string1, length1, string2, length2,
283 start, 0, regs, stop, 1);
285 #ifdef _LIBC
286 weak_alias (__re_match_2, re_match_2)
287 #endif
290 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
291 struct re_pattern_buffer *bufp;
292 const char *string1, *string2;
293 int length1, length2, start, range, stop;
294 struct re_registers *regs;
296 return re_search_2_stub (bufp, string1, length1, string2, length2,
297 start, range, regs, stop, 0);
299 #ifdef _LIBC
300 weak_alias (__re_search_2, re_search_2)
301 #endif
303 static int
304 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
305 stop, ret_len)
306 struct re_pattern_buffer *bufp;
307 const char *string1, *string2;
308 int length1, length2, start, range, stop, ret_len;
309 struct re_registers *regs;
311 const char *str;
312 int rval;
313 int len = length1 + length2;
314 int free_str = 0;
316 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
317 return -2;
319 /* Concatenate the strings. */
320 if (length2 > 0)
321 if (length1 > 0)
323 char *s = re_malloc (char, len);
325 if (BE (s == NULL, 0))
326 return -2;
327 memcpy (s, string1, length1);
328 memcpy (s + length1, string2, length2);
329 str = s;
330 free_str = 1;
332 else
333 str = string2;
334 else
335 str = string1;
337 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
338 ret_len);
339 if (free_str)
340 re_free ((char *) str);
341 return rval;
344 /* The parameters have the same meaning as those of re_search.
345 Additional parameters:
346 If RET_LEN is nonzero the length of the match is returned (re_match style);
347 otherwise the position of the match is returned. */
349 static int
350 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
351 struct re_pattern_buffer *bufp;
352 const char *string;
353 int length, start, range, stop, ret_len;
354 struct re_registers *regs;
356 reg_errcode_t result;
357 regmatch_t *pmatch;
358 int nregs, rval;
359 int eflags = 0;
361 /* Check for out-of-range. */
362 if (BE (start < 0 || start > length, 0))
363 return -1;
364 if (BE (start + range > length, 0))
365 range = length - start;
366 else if (BE (start + range < 0, 0))
367 range = -start;
369 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
370 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
372 /* Compile fastmap if we haven't yet. */
373 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
374 re_compile_fastmap (bufp);
376 if (BE (bufp->no_sub, 0))
377 regs = NULL;
379 /* We need at least 1 register. */
380 if (regs == NULL)
381 nregs = 1;
382 else if (BE (bufp->regs_allocated == REGS_FIXED &&
383 regs->num_regs < bufp->re_nsub + 1, 0))
385 nregs = regs->num_regs;
386 if (BE (nregs < 1, 0))
388 /* Nothing can be copied to regs. */
389 regs = NULL;
390 nregs = 1;
393 else
394 nregs = bufp->re_nsub + 1;
395 pmatch = re_malloc (regmatch_t, nregs);
396 if (BE (pmatch == NULL, 0))
397 return -2;
399 result = re_search_internal (bufp, string, length, start, range, stop,
400 nregs, pmatch, eflags);
402 rval = 0;
404 /* I hope we needn't fill ther regs with -1's when no match was found. */
405 if (result != REG_NOERROR)
406 rval = -1;
407 else if (regs != NULL)
409 /* If caller wants register contents data back, copy them. */
410 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
411 bufp->regs_allocated);
412 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
413 rval = -2;
416 if (BE (rval == 0, 1))
418 if (ret_len)
420 assert (pmatch[0].rm_so == start);
421 rval = pmatch[0].rm_eo - start;
423 else
424 rval = pmatch[0].rm_so;
426 re_free (pmatch);
427 return rval;
430 static unsigned
431 re_copy_regs (regs, pmatch, nregs, regs_allocated)
432 struct re_registers *regs;
433 regmatch_t *pmatch;
434 int nregs, regs_allocated;
436 int rval = REGS_REALLOCATE;
437 int i;
438 int need_regs = nregs + 1;
439 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
440 uses. */
442 /* Have the register data arrays been allocated? */
443 if (regs_allocated == REGS_UNALLOCATED)
444 { /* No. So allocate them with malloc. */
445 regs->start = re_malloc (regoff_t, need_regs);
446 if (BE (regs->start == NULL, 0))
447 return REGS_UNALLOCATED;
448 regs->end = re_malloc (regoff_t, need_regs);
449 if (BE (regs->end == NULL, 0))
451 re_free (regs->start);
452 return REGS_UNALLOCATED;
454 regs->num_regs = need_regs;
456 else if (regs_allocated == REGS_REALLOCATE)
457 { /* Yes. If we need more elements than were already
458 allocated, reallocate them. If we need fewer, just
459 leave it alone. */
460 if (need_regs > regs->num_regs)
462 regs->start = re_realloc (regs->start, regoff_t, need_regs);
463 if (BE (regs->start == NULL, 0))
465 if (regs->end != NULL)
466 re_free (regs->end);
467 return REGS_UNALLOCATED;
469 regs->end = re_realloc (regs->end, regoff_t, need_regs);
470 if (BE (regs->end == NULL, 0))
472 re_free (regs->start);
473 return REGS_UNALLOCATED;
475 regs->num_regs = need_regs;
478 else
480 assert (regs_allocated == REGS_FIXED);
481 /* This function may not be called with REGS_FIXED and nregs too big. */
482 assert (regs->num_regs >= nregs);
483 rval = REGS_FIXED;
486 /* Copy the regs. */
487 for (i = 0; i < nregs; ++i)
489 regs->start[i] = pmatch[i].rm_so;
490 regs->end[i] = pmatch[i].rm_eo;
492 for ( ; i < regs->num_regs; ++i)
493 regs->start[i] = regs->end[i] = -1;
495 return rval;
498 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
499 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
500 this memory for recording register information. STARTS and ENDS
501 must be allocated using the malloc library routine, and must each
502 be at least NUM_REGS * sizeof (regoff_t) bytes long.
504 If NUM_REGS == 0, then subsequent matches should allocate their own
505 register data.
507 Unless this function is called, the first search or match using
508 PATTERN_BUFFER will allocate its own register data, without
509 freeing the old data. */
511 void
512 re_set_registers (bufp, regs, num_regs, starts, ends)
513 struct re_pattern_buffer *bufp;
514 struct re_registers *regs;
515 unsigned num_regs;
516 regoff_t *starts, *ends;
518 if (num_regs)
520 bufp->regs_allocated = REGS_REALLOCATE;
521 regs->num_regs = num_regs;
522 regs->start = starts;
523 regs->end = ends;
525 else
527 bufp->regs_allocated = REGS_UNALLOCATED;
528 regs->num_regs = 0;
529 regs->start = regs->end = (regoff_t *) 0;
532 #ifdef _LIBC
533 weak_alias (__re_set_registers, re_set_registers)
534 #endif
536 /* Entry points compatible with 4.2 BSD regex library. We don't define
537 them unless specifically requested. */
539 #if defined _REGEX_RE_COMP || defined _LIBC
541 # ifdef _LIBC
542 weak_function
543 # endif
544 re_exec (s)
545 const char *s;
547 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
549 #endif /* _REGEX_RE_COMP */
551 static re_node_set empty_set;
553 /* Internal entry point. */
555 /* Searches for a compiled pattern PREG in the string STRING, whose
556 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
557 mingings with regexec. START, and RANGE have the same meanings
558 with re_search.
559 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
560 otherwise return the error code.
561 Note: We assume front end functions already check ranges.
562 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
564 static reg_errcode_t
565 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
566 eflags)
567 const regex_t *preg;
568 const char *string;
569 int length, start, range, stop, eflags;
570 size_t nmatch;
571 regmatch_t pmatch[];
573 reg_errcode_t err;
574 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
575 re_string_t input;
576 int left_lim, right_lim, incr;
577 int fl_longest_match, match_first, match_last = -1;
578 int fast_translate, sb;
579 re_match_context_t mctx;
580 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
581 && range && !preg->can_be_null) ? preg->fastmap : NULL);
583 /* Check if the DFA haven't been compiled. */
584 if (BE (preg->used == 0 || dfa->init_state == NULL
585 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
586 || dfa->init_state_begbuf == NULL, 0))
587 return REG_NOMATCH;
589 re_node_set_init_empty (&empty_set);
590 memset (&mctx, '\0', sizeof (re_match_context_t));
592 /* We must check the longest matching, if nmatch > 0. */
593 fl_longest_match = (nmatch != 0);
595 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
596 preg->translate, preg->syntax & RE_ICASE);
597 if (BE (err != REG_NOERROR, 0))
598 goto free_return;
599 input.stop = stop;
601 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
602 if (BE (err != REG_NOERROR, 0))
603 goto free_return;
605 /* We will log all the DFA states through which the dfa pass,
606 if nmatch > 1, or this dfa has "multibyte node", which is a
607 back-reference or a node which can accept multibyte character or
608 multi character collating element. */
609 if (nmatch > 1 || dfa->has_mb_node)
611 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
612 if (BE (mctx.state_log == NULL, 0))
614 err = REG_ESPACE;
615 goto free_return;
618 else
619 mctx.state_log = NULL;
621 #ifdef DEBUG
622 /* We assume front-end functions already check them. */
623 assert (start + range >= 0 && start + range <= length);
624 #endif
626 match_first = start;
627 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
628 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
630 /* Check incrementally whether of not the input string match. */
631 incr = (range < 0) ? -1 : 1;
632 left_lim = (range < 0) ? start + range : start;
633 right_lim = (range < 0) ? start : start + range;
634 sb = MB_CUR_MAX == 1;
635 fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
637 for (;;)
639 /* At first get the current byte from input string. */
640 if (fastmap)
642 if (BE (fast_translate, 1))
644 unsigned RE_TRANSLATE_TYPE t
645 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
646 if (BE (range >= 0, 1))
648 if (BE (t != NULL, 0))
650 while (BE (match_first < right_lim, 1)
651 && !fastmap[t[(unsigned char) string[match_first]]])
652 ++match_first;
654 else
656 while (BE (match_first < right_lim, 1)
657 && !fastmap[(unsigned char) string[match_first]])
658 ++match_first;
660 if (BE (match_first == right_lim, 0))
662 int ch = match_first >= length
663 ? 0 : (unsigned char) string[match_first];
664 if (!fastmap[t ? t[ch] : ch])
665 break;
668 else
670 while (match_first >= left_lim)
672 int ch = match_first >= length
673 ? 0 : (unsigned char) string[match_first];
674 if (fastmap[t ? t[ch] : ch])
675 break;
676 --match_first;
678 if (match_first < left_lim)
679 break;
682 else
684 int ch;
688 /* In this case, we can't determine easily the current byte,
689 since it might be a component byte of a multibyte
690 character. Then we use the constructed buffer
691 instead. */
692 /* If MATCH_FIRST is out of the valid range, reconstruct the
693 buffers. */
694 if (input.raw_mbs_idx + input.valid_len <= match_first
695 || match_first < input.raw_mbs_idx)
697 err = re_string_reconstruct (&input, match_first, eflags,
698 preg->newline_anchor);
699 if (BE (err != REG_NOERROR, 0))
700 goto free_return;
702 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
703 Note that MATCH_FIRST must not be smaller than 0. */
704 ch = ((match_first >= length) ? 0
705 : re_string_byte_at (&input,
706 match_first - input.raw_mbs_idx));
707 if (fastmap[ch])
708 break;
709 match_first += incr;
711 while (match_first >= left_lim && match_first <= right_lim);
712 if (! fastmap[ch])
713 break;
717 /* Reconstruct the buffers so that the matcher can assume that
718 the matching starts from the begining of the buffer. */
719 err = re_string_reconstruct (&input, match_first, eflags,
720 preg->newline_anchor);
721 if (BE (err != REG_NOERROR, 0))
722 goto free_return;
723 #ifdef RE_ENABLE_I18N
724 /* Eliminate it when it is a component of a multibyte character
725 and isn't the head of a multibyte character. */
726 if (sb || re_string_first_byte (&input, 0))
727 #endif
729 /* It seems to be appropriate one, then use the matcher. */
730 /* We assume that the matching starts from 0. */
731 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
732 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
733 if (match_last != -1)
735 if (BE (match_last == -2, 0))
737 err = REG_ESPACE;
738 goto free_return;
740 else
741 break; /* We found a matching. */
745 /* Update counter. */
746 match_first += incr;
747 if (match_first < left_lim || right_lim < match_first)
748 break;
751 /* Set pmatch[] if we need. */
752 if (match_last != -1 && nmatch > 0)
754 int reg_idx;
756 /* Initialize registers. */
757 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
758 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
760 /* Set the points where matching start/end. */
761 pmatch[0].rm_so = 0;
762 mctx.match_last = pmatch[0].rm_eo = match_last;
764 if (!preg->no_sub && nmatch > 1)
766 /* We need the ranges of all the subexpressions. */
767 int halt_node;
768 re_dfastate_t **sifted_states;
769 re_dfastate_t **lim_states = NULL;
770 re_dfastate_t *pstate = mctx.state_log[match_last];
771 re_sift_context_t sctx;
772 #ifdef DEBUG
773 assert (mctx.state_log != NULL);
774 #endif
775 halt_node = check_halt_state_context (preg, pstate, &mctx,
776 match_last);
777 if (dfa->has_plural_match)
779 match_ctx_clear_flag (&mctx);
780 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
781 if (BE (sifted_states == NULL, 0))
783 err = REG_ESPACE;
784 goto free_return;
786 if (dfa->nbackref)
788 lim_states = calloc (sizeof (re_dfastate_t *),
789 match_last + 1);
790 if (BE (lim_states == NULL, 0))
792 re_free (sifted_states);
793 err = REG_ESPACE;
794 goto free_return;
797 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
798 mctx.match_last, 0);
799 err = sift_states_backward (preg, &mctx, &sctx);
800 re_node_set_free (&sctx.limits);
801 if (BE (err != REG_NOERROR, 0))
803 re_free (sifted_states);
804 re_free (lim_states);
805 goto free_return;
807 if (lim_states != NULL)
809 err = merge_state_array (dfa, sifted_states, lim_states,
810 match_last + 1);
811 re_free (lim_states);
812 if (BE (err != REG_NOERROR, 0))
814 re_free (sifted_states);
815 goto free_return;
818 re_free (mctx.state_log);
819 mctx.state_log = sifted_states;
821 mctx.last_node = halt_node;
822 err = set_regs (preg, &mctx, nmatch, pmatch,
823 dfa->has_plural_match && dfa->nbackref > 0);
824 if (BE (err != REG_NOERROR, 0))
825 goto free_return;
828 /* At last, add the offset to the each registers, since we slided
829 the buffers so that We can assume that the matching starts from 0. */
830 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
831 if (pmatch[reg_idx].rm_so != -1)
833 pmatch[reg_idx].rm_so += match_first;
834 pmatch[reg_idx].rm_eo += match_first;
837 err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
838 free_return:
839 re_free (mctx.state_log);
840 if (dfa->nbackref)
841 match_ctx_free (&mctx);
842 re_string_destruct (&input);
843 return err;
846 /* Acquire an initial state and return it.
847 We must select appropriate initial state depending on the context,
848 since initial states may have constraints like "\<", "^", etc.. */
850 static inline re_dfastate_t *
851 acquire_init_state_context (err, preg, mctx, idx)
852 reg_errcode_t *err;
853 const regex_t *preg;
854 const re_match_context_t *mctx;
855 int idx;
857 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
859 *err = REG_NOERROR;
860 if (dfa->init_state->has_constraint)
862 unsigned int context;
863 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
864 preg->newline_anchor);
865 if (IS_WORD_CONTEXT (context))
866 return dfa->init_state_word;
867 else if (IS_ORDINARY_CONTEXT (context))
868 return dfa->init_state;
869 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
870 return dfa->init_state_begbuf;
871 else if (IS_NEWLINE_CONTEXT (context))
872 return dfa->init_state_nl;
873 else if (IS_BEGBUF_CONTEXT (context))
875 /* It is relatively rare case, then calculate on demand. */
876 return re_acquire_state_context (err, dfa,
877 dfa->init_state->entrance_nodes,
878 context);
880 else
881 /* Must not happen? */
882 return dfa->init_state;
884 else
885 return dfa->init_state;
888 /* Check whether the regular expression match input string INPUT or not,
889 and return the index where the matching end, return -1 if not match,
890 or return -2 in case of an error.
891 FL_SEARCH means we must search where the matching starts,
892 FL_LONGEST_MATCH means we want the POSIX longest matching.
893 Note that the matcher assume that the maching starts from the current
894 index of the buffer. */
896 static int
897 check_matching (preg, mctx, fl_search, fl_longest_match)
898 const regex_t *preg;
899 re_match_context_t *mctx;
900 int fl_search, fl_longest_match;
902 reg_errcode_t err;
903 int match = 0;
904 int match_last = -1;
905 int cur_str_idx = re_string_cur_idx (mctx->input);
906 re_dfastate_t *cur_state;
908 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
909 /* An initial state must not be NULL(invalid state). */
910 if (BE (cur_state == NULL, 0))
911 return -2;
912 if (mctx->state_log != NULL)
913 mctx->state_log[cur_str_idx] = cur_state;
915 if (cur_state->has_backref)
917 int i;
918 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
919 for (i = 0; i < cur_state->nodes.nelem; ++i)
921 int node = cur_state->nodes.elems[i];
922 re_token_type_t type = dfa->nodes[node].type;
923 if (type == OP_BACK_REF)
925 int clexp_idx;
926 for (clexp_idx = 0; clexp_idx < cur_state->nodes.nelem;
927 ++clexp_idx)
929 re_token_t *clexp_node;
930 clexp_node = dfa->nodes + cur_state->nodes.elems[clexp_idx];
931 if (clexp_node->type == OP_CLOSE_SUBEXP
932 && clexp_node->opr.idx + 1== dfa->nodes[node].opr.idx)
934 err = match_ctx_add_entry (mctx, node, 0, 0, 0);
935 if (BE (err != REG_NOERROR, 0))
936 return -2;
937 break;
944 /* If the RE accepts NULL string. */
945 if (cur_state->halt)
947 if (!cur_state->has_constraint
948 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
950 if (!fl_longest_match)
951 return cur_str_idx;
952 else
954 match_last = cur_str_idx;
955 match = 1;
960 while (!re_string_eoi (mctx->input))
962 cur_state = transit_state (&err, preg, mctx, cur_state,
963 fl_search && !match);
964 if (cur_state == NULL) /* Reached at the invalid state or an error. */
966 cur_str_idx = re_string_cur_idx (mctx->input);
967 if (BE (err != REG_NOERROR, 0))
968 return -2;
969 if (fl_search && !match)
971 /* Restart from initial state, since we are searching
972 the point from where matching start. */
973 #ifdef RE_ENABLE_I18N
974 if (MB_CUR_MAX == 1
975 || re_string_first_byte (mctx->input, cur_str_idx))
976 #endif /* RE_ENABLE_I18N */
977 cur_state = acquire_init_state_context (&err, preg, mctx,
978 cur_str_idx);
979 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
980 return -2;
981 if (mctx->state_log != NULL)
982 mctx->state_log[cur_str_idx] = cur_state;
984 else if (!fl_longest_match && match)
985 break;
986 else /* (fl_longest_match && match) || (!fl_search && !match) */
988 if (mctx->state_log == NULL)
989 break;
990 else
992 int max = mctx->state_log_top;
993 for (; cur_str_idx <= max; ++cur_str_idx)
994 if (mctx->state_log[cur_str_idx] != NULL)
995 break;
996 if (cur_str_idx > max)
997 break;
1002 if (cur_state != NULL && cur_state->halt)
1004 /* Reached at a halt state.
1005 Check the halt state can satisfy the current context. */
1006 if (!cur_state->has_constraint
1007 || check_halt_state_context (preg, cur_state, mctx,
1008 re_string_cur_idx (mctx->input)))
1010 /* We found an appropriate halt state. */
1011 match_last = re_string_cur_idx (mctx->input);
1012 match = 1;
1013 if (!fl_longest_match)
1014 break;
1018 return match_last;
1021 /* Check NODE match the current context. */
1023 static int check_halt_node_context (dfa, node, context)
1024 const re_dfa_t *dfa;
1025 int node;
1026 unsigned int context;
1028 re_token_type_t type = dfa->nodes[node].type;
1029 unsigned int constraint = dfa->nodes[node].constraint;
1030 if (type != END_OF_RE)
1031 return 0;
1032 if (!constraint)
1033 return 1;
1034 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1035 return 0;
1036 return 1;
1039 /* Check the halt state STATE match the current context.
1040 Return 0 if not match, if the node, STATE has, is a halt node and
1041 match the context, return the node. */
1043 static int
1044 check_halt_state_context (preg, state, mctx, idx)
1045 const regex_t *preg;
1046 const re_dfastate_t *state;
1047 const re_match_context_t *mctx;
1048 int idx;
1050 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1051 int i;
1052 unsigned int context;
1053 #ifdef DEBUG
1054 assert (state->halt);
1055 #endif
1056 context = re_string_context_at (mctx->input, idx, mctx->eflags,
1057 preg->newline_anchor);
1058 for (i = 0; i < state->nodes.nelem; ++i)
1059 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1060 return state->nodes.elems[i];
1061 return 0;
1064 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1065 corresponding to the DFA).
1066 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1067 of errors. */
1069 static int
1070 proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
1071 const regex_t *preg;
1072 regmatch_t *regs;
1073 const re_match_context_t *mctx;
1074 int nregs, *pidx, node;
1075 re_node_set *eps_via_nodes;
1076 struct re_fail_stack_t *fs;
1078 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1079 int i, err, dest_node;
1080 dest_node = -1;
1081 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1083 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1084 int ndest, dest_nodes[2];
1085 err = re_node_set_insert (eps_via_nodes, node);
1086 if (BE (err < 0, 0))
1087 return -1;
1088 /* Pick up valid destinations. */
1089 for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1091 int candidate = dfa->edests[node].elems[i];
1092 if (!re_node_set_contains (cur_nodes, candidate))
1093 continue;
1094 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1095 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1096 ++ndest;
1098 if (ndest <= 1)
1099 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1100 /* In order to avoid infinite loop like "(a*)*". */
1101 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1102 return dest_nodes[1];
1103 if (fs != NULL)
1104 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
1105 return dest_nodes[0];
1107 else
1109 int naccepted = 0;
1110 re_token_type_t type = dfa->nodes[node].type;
1112 #ifdef RE_ENABLE_I18N
1113 if (ACCEPT_MB_NODE (type))
1114 naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
1115 else
1116 #endif /* RE_ENABLE_I18N */
1117 if (type == OP_BACK_REF)
1119 int subexp_idx = dfa->nodes[node].opr.idx;
1120 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1121 if (fs != NULL)
1123 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1124 return -1;
1125 else if (naccepted)
1127 char *buf = re_string_get_buffer (mctx->input);
1128 if (strncmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1129 naccepted) != 0)
1130 return -1;
1134 if (naccepted == 0)
1136 err = re_node_set_insert (eps_via_nodes, node);
1137 if (BE (err < 0, 0))
1138 return -2;
1139 dest_node = dfa->edests[node].elems[0];
1140 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1141 dest_node))
1142 return dest_node;
1146 if (naccepted != 0
1147 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1149 dest_node = dfa->nexts[node];
1150 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1151 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1152 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1153 dest_node)))
1154 return -1;
1155 re_node_set_empty (eps_via_nodes);
1156 return dest_node;
1159 return -1;
1162 static reg_errcode_t
1163 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1164 struct re_fail_stack_t *fs;
1165 int str_idx, *dests, nregs;
1166 regmatch_t *regs;
1167 re_node_set *eps_via_nodes;
1169 reg_errcode_t err;
1170 int num = fs->num++;
1171 if (fs->num == fs->alloc)
1173 struct re_fail_stack_ent_t *new_array;
1174 fs->alloc *= 2;
1175 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1176 * fs->alloc));
1177 if (new_array == NULL)
1178 return REG_ESPACE;
1179 fs->stack = new_array;
1181 fs->stack[num].idx = str_idx;
1182 fs->stack[num].node = dests[1];
1183 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1184 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1185 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1186 return err;
1189 static int
1190 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1191 struct re_fail_stack_t *fs;
1192 int *pidx, nregs;
1193 regmatch_t *regs;
1194 re_node_set *eps_via_nodes;
1196 int num = --fs->num;
1197 assert (num >= 0);
1198 *pidx = fs->stack[num].idx;
1199 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1200 re_node_set_free (eps_via_nodes);
1201 re_free (fs->stack[num].regs);
1202 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1203 return fs->stack[num].node;
1206 /* Set the positions where the subexpressions are starts/ends to registers
1207 PMATCH.
1208 Note: We assume that pmatch[0] is already set, and
1209 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1211 static reg_errcode_t
1212 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1213 const regex_t *preg;
1214 const re_match_context_t *mctx;
1215 size_t nmatch;
1216 regmatch_t *pmatch;
1217 int fl_backtrack;
1219 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1220 int idx, cur_node, real_nmatch;
1221 re_node_set eps_via_nodes;
1222 struct re_fail_stack_t *fs;
1223 struct re_fail_stack_t fs_body = {0, 2, NULL};
1224 #ifdef DEBUG
1225 assert (nmatch > 1);
1226 assert (mctx->state_log != NULL);
1227 #endif
1228 if (fl_backtrack)
1230 fs = &fs_body;
1231 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1233 else
1234 fs = NULL;
1235 cur_node = dfa->init_node;
1236 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1237 re_node_set_init_empty (&eps_via_nodes);
1238 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1240 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1241 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1243 int reg_idx;
1244 if (fs)
1246 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1247 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1248 break;
1249 if (reg_idx == nmatch)
1251 re_node_set_free (&eps_via_nodes);
1252 return free_fail_stack_return (fs);
1254 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1255 &eps_via_nodes);
1257 else
1259 re_node_set_free (&eps_via_nodes);
1260 return REG_NOERROR;
1264 /* Proceed to next node. */
1265 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
1266 &eps_via_nodes, fs);
1268 if (BE (cur_node < 0, 0))
1270 if (cur_node == -2)
1271 return REG_ESPACE;
1272 if (fs)
1273 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1274 &eps_via_nodes);
1275 else
1277 re_node_set_free (&eps_via_nodes);
1278 return REG_NOMATCH;
1282 re_node_set_free (&eps_via_nodes);
1283 return free_fail_stack_return (fs);
1286 static reg_errcode_t
1287 free_fail_stack_return (fs)
1288 struct re_fail_stack_t *fs;
1290 if (fs)
1292 int fs_idx;
1293 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1295 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1296 re_free (fs->stack[fs_idx].regs);
1298 re_free (fs->stack);
1300 return REG_NOERROR;
1303 static void
1304 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1305 re_dfa_t *dfa;
1306 regmatch_t *pmatch;
1307 int cur_node, cur_idx, nmatch;
1309 int type = dfa->nodes[cur_node].type;
1310 int reg_num;
1311 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1312 return;
1313 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1314 if (reg_num >= nmatch)
1315 return;
1316 if (type == OP_OPEN_SUBEXP)
1318 /* We are at the first node of this sub expression. */
1319 pmatch[reg_num].rm_so = cur_idx;
1320 pmatch[reg_num].rm_eo = -1;
1322 else if (type == OP_CLOSE_SUBEXP)
1323 /* We are at the first node of this sub expression. */
1324 pmatch[reg_num].rm_eo = cur_idx;
1327 #define NUMBER_OF_STATE 1
1329 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1330 and sift the nodes in each states according to the following rules.
1331 Updated state_log will be wrote to STATE_LOG.
1333 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1334 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1335 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1336 the LAST_NODE, we throw away the node `a'.
1337 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1338 string `s' and transit to `b':
1339 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1340 away the node `a'.
1341 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1342 throwed away, we throw away the node `a'.
1343 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1344 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1345 node `a'.
1346 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1347 we throw away the node `a'. */
1349 #define STATE_NODE_CONTAINS(state,node) \
1350 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1352 static reg_errcode_t
1353 sift_states_backward (preg, mctx, sctx)
1354 const regex_t *preg;
1355 re_match_context_t *mctx;
1356 re_sift_context_t *sctx;
1358 reg_errcode_t err;
1359 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1360 int null_cnt = 0;
1361 int str_idx = sctx->last_str_idx;
1362 re_node_set cur_dest;
1363 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1365 #ifdef DEBUG
1366 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1367 #endif
1368 cur_src = &mctx->state_log[str_idx]->nodes;
1370 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1371 transit to the last_node and the last_node itself. */
1372 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1373 if (BE (err != REG_NOERROR, 0))
1374 return err;
1375 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1376 if (BE (err != REG_NOERROR, 0))
1377 goto free_return;
1379 /* Then check each states in the state_log. */
1380 while (str_idx > 0)
1382 int i, ret;
1383 /* Update counters. */
1384 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1385 if (null_cnt > mctx->max_mb_elem_len)
1387 memset (sctx->sifted_states, '\0',
1388 sizeof (re_dfastate_t *) * str_idx);
1389 re_node_set_free (&cur_dest);
1390 return REG_NOERROR;
1392 re_node_set_empty (&cur_dest);
1393 --str_idx;
1394 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1395 : &mctx->state_log[str_idx]->nodes);
1397 /* Then build the next sifted state.
1398 We build the next sifted state on `cur_dest', and update
1399 `sifted_states[str_idx]' with `cur_dest'.
1400 Note:
1401 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1402 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1403 for (i = 0; i < cur_src->nelem; i++)
1405 int prev_node = cur_src->elems[i];
1406 int naccepted = 0;
1407 re_token_type_t type = dfa->nodes[prev_node].type;
1409 if (IS_EPSILON_NODE(type))
1410 continue;
1411 #ifdef RE_ENABLE_I18N
1412 /* If the node may accept `multi byte'. */
1413 if (ACCEPT_MB_NODE (type))
1414 naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1415 str_idx, sctx->last_str_idx);
1417 #endif /* RE_ENABLE_I18N */
1418 /* We don't check backreferences here.
1419 See update_cur_sifted_state(). */
1421 if (!naccepted
1422 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1423 str_idx)
1424 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1425 dfa->nexts[prev_node]))
1426 naccepted = 1;
1428 if (naccepted == 0)
1429 continue;
1431 if (sctx->limits.nelem)
1433 int to_idx = str_idx + naccepted;
1434 if (check_dst_limits (dfa, &sctx->limits, mctx,
1435 dfa->nexts[prev_node], to_idx,
1436 prev_node, str_idx))
1437 continue;
1439 ret = re_node_set_insert (&cur_dest, prev_node);
1440 if (BE (ret == -1, 0))
1442 err = REG_ESPACE;
1443 goto free_return;
1447 /* Add all the nodes which satisfy the following conditions:
1448 - It can epsilon transit to a node in CUR_DEST.
1449 - It is in CUR_SRC.
1450 And update state_log. */
1451 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1452 if (BE (err != REG_NOERROR, 0))
1453 goto free_return;
1455 err = REG_NOERROR;
1456 free_return:
1457 re_node_set_free (&cur_dest);
1458 return err;
1461 /* Helper functions. */
1463 static inline reg_errcode_t
1464 clean_state_log_if_need (mctx, next_state_log_idx)
1465 re_match_context_t *mctx;
1466 int next_state_log_idx;
1468 int top = mctx->state_log_top;
1470 if (next_state_log_idx >= mctx->input->bufs_len
1471 || (next_state_log_idx >= mctx->input->valid_len
1472 && mctx->input->valid_len < mctx->input->len))
1474 reg_errcode_t err;
1475 err = extend_buffers (mctx);
1476 if (BE (err != REG_NOERROR, 0))
1477 return err;
1480 if (top < next_state_log_idx)
1482 memset (mctx->state_log + top + 1, '\0',
1483 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1484 mctx->state_log_top = next_state_log_idx;
1486 return REG_NOERROR;
1489 static reg_errcode_t
1490 merge_state_array (dfa, dst, src, num)
1491 re_dfa_t *dfa;
1492 re_dfastate_t **dst;
1493 re_dfastate_t **src;
1494 int num;
1496 int st_idx;
1497 reg_errcode_t err;
1498 for (st_idx = 0; st_idx < num; ++st_idx)
1500 if (dst[st_idx] == NULL)
1501 dst[st_idx] = src[st_idx];
1502 else if (src[st_idx] != NULL)
1504 re_node_set merged_set;
1505 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1506 &src[st_idx]->nodes);
1507 if (BE (err != REG_NOERROR, 0))
1508 return err;
1509 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1510 re_node_set_free (&merged_set);
1511 if (BE (err != REG_NOERROR, 0))
1512 return err;
1515 return REG_NOERROR;
1518 static reg_errcode_t
1519 update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1520 const regex_t *preg;
1521 re_match_context_t *mctx;
1522 re_sift_context_t *sctx;
1523 int str_idx;
1524 re_node_set *dest_nodes;
1526 reg_errcode_t err;
1527 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1528 const re_node_set *candidates;
1529 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1530 : &mctx->state_log[str_idx]->nodes);
1532 /* At first, add the nodes which can epsilon transit to a node in
1533 DEST_NODE. */
1534 if (dest_nodes->nelem)
1536 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1537 if (BE (err != REG_NOERROR, 0))
1538 return err;
1541 /* Then, check the limitations in the current sift_context. */
1542 if (dest_nodes->nelem && sctx->limits.nelem)
1544 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1545 mctx->bkref_ents, str_idx);
1546 if (BE (err != REG_NOERROR, 0))
1547 return err;
1550 /* Update state_log. */
1551 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1552 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1553 return err;
1555 /* If we are searching for the subexpression candidates.
1556 Note that we were from transit_state_bkref_loop() in this case. */
1557 if (dest_nodes->nelem && sctx->check_subexp)
1559 err = search_subexp (preg, mctx, sctx, str_idx, dest_nodes);
1560 if (BE (err != REG_NOERROR, 0))
1561 return err;
1564 if ((mctx->state_log[str_idx] != NULL
1565 && mctx->state_log[str_idx]->has_backref))
1567 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1568 if (BE (err != REG_NOERROR, 0))
1569 return err;
1571 return REG_NOERROR;
1574 static reg_errcode_t
1575 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1576 re_dfa_t *dfa;
1577 re_node_set *dest_nodes;
1578 const re_node_set *candidates;
1580 reg_errcode_t err;
1581 int src_idx;
1582 re_node_set src_copy;
1584 err = re_node_set_init_copy (&src_copy, dest_nodes);
1585 if (BE (err != REG_NOERROR, 0))
1586 return err;
1587 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1589 err = re_node_set_add_intersect (dest_nodes, candidates,
1590 dfa->inveclosures
1591 + src_copy.elems[src_idx]);
1592 if (BE (err != REG_NOERROR, 0))
1594 re_node_set_free (&src_copy);
1595 return err;
1598 re_node_set_free (&src_copy);
1599 return REG_NOERROR;
1602 static reg_errcode_t
1603 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1604 re_dfa_t *dfa;
1605 int node;
1606 re_node_set *dest_nodes;
1607 const re_node_set *candidates;
1609 int ecl_idx;
1610 reg_errcode_t err;
1611 re_node_set *inv_eclosure = dfa->inveclosures + node;
1612 re_node_set except_nodes;
1613 re_node_set_init_empty (&except_nodes);
1614 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1616 int cur_node = inv_eclosure->elems[ecl_idx];
1617 if (cur_node == node)
1618 continue;
1619 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1621 int edst1 = dfa->edests[cur_node].elems[0];
1622 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1623 ? dfa->edests[cur_node].elems[1] : -1);
1624 if ((!re_node_set_contains (inv_eclosure, edst1)
1625 && re_node_set_contains (dest_nodes, edst1))
1626 || (edst2 > 0
1627 && !re_node_set_contains (inv_eclosure, edst2)
1628 && re_node_set_contains (dest_nodes, edst2)))
1630 err = re_node_set_add_intersect (&except_nodes, candidates,
1631 dfa->inveclosures + cur_node);
1632 if (BE (err != REG_NOERROR, 0))
1634 re_node_set_free (&except_nodes);
1635 return err;
1640 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1642 int cur_node = inv_eclosure->elems[ecl_idx];
1643 if (!re_node_set_contains (&except_nodes, cur_node))
1645 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1646 re_node_set_remove_at (dest_nodes, idx);
1649 re_node_set_free (&except_nodes);
1650 return REG_NOERROR;
1653 static int
1654 check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1655 re_dfa_t *dfa;
1656 re_node_set *limits;
1657 re_match_context_t *mctx;
1658 int dst_node, dst_idx, src_node, src_idx;
1660 int lim_idx, src_pos, dst_pos;
1662 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1664 int subexp_idx;
1665 struct re_backref_cache_entry *ent;
1666 ent = mctx->bkref_ents + limits->elems[lim_idx];
1667 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1669 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1670 dfa->eclosures + dst_node,
1671 subexp_idx, dst_node, dst_idx);
1672 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1673 dfa->eclosures + src_node,
1674 subexp_idx, src_node, src_idx);
1676 /* In case of:
1677 <src> <dst> ( <subexp> )
1678 ( <subexp> ) <src> <dst>
1679 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1680 if (src_pos == dst_pos)
1681 continue; /* This is unrelated limitation. */
1682 else
1683 return 1;
1685 return 0;
1688 static int
1689 check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
1690 str_idx)
1691 re_dfa_t *dfa;
1692 re_match_context_t *mctx;
1693 re_node_set *eclosures;
1694 int limit, subexp_idx, node, str_idx;
1696 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1697 int pos = (str_idx < lim->subexp_from ? -1
1698 : (lim->subexp_to < str_idx ? 1 : 0));
1699 if (pos == 0
1700 && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
1702 int node_idx;
1703 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1705 int node = eclosures->elems[node_idx];
1706 re_token_type_t type= dfa->nodes[node].type;
1707 if (type == OP_BACK_REF)
1709 int bi;
1710 for (bi = 0; bi < mctx->nbkref_ents; ++bi)
1712 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1713 if (ent->node == node && ent->subexp_from == ent->subexp_to
1714 && ent->str_idx == str_idx)
1716 int cpos, dst;
1717 dst = dfa->edests[node].elems[0];
1718 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1719 dfa->eclosures + dst,
1720 subexp_idx, dst,
1721 str_idx);
1722 if ((str_idx == lim->subexp_from && cpos == -1)
1723 || (str_idx == lim->subexp_to && cpos == 0))
1724 return cpos;
1728 if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1729 && str_idx == lim->subexp_from)
1731 pos = -1;
1732 break;
1734 if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1735 && str_idx == lim->subexp_to)
1736 break;
1738 if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
1739 pos = 1;
1741 return pos;
1744 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1745 which are against limitations from DEST_NODES. */
1747 static reg_errcode_t
1748 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1749 re_dfa_t *dfa;
1750 re_node_set *dest_nodes;
1751 const re_node_set *candidates;
1752 re_node_set *limits;
1753 struct re_backref_cache_entry *bkref_ents;
1754 int str_idx;
1756 reg_errcode_t err;
1757 int node_idx, lim_idx;
1759 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1761 int subexp_idx;
1762 struct re_backref_cache_entry *ent;
1763 ent = bkref_ents + limits->elems[lim_idx];
1765 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1766 continue; /* This is unrelated limitation. */
1768 subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1769 if (ent->subexp_to == str_idx)
1771 int ops_node = -1;
1772 int cls_node = -1;
1773 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1775 int node = dest_nodes->elems[node_idx];
1776 re_token_type_t type= dfa->nodes[node].type;
1777 if (type == OP_OPEN_SUBEXP
1778 && subexp_idx == dfa->nodes[node].opr.idx)
1779 ops_node = node;
1780 else if (type == OP_CLOSE_SUBEXP
1781 && subexp_idx == dfa->nodes[node].opr.idx)
1782 cls_node = node;
1785 /* Check the limitation of the open subexpression. */
1786 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1787 if (ops_node >= 0)
1789 err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes,
1790 candidates);
1791 if (BE (err != REG_NOERROR, 0))
1792 return err;
1794 /* Check the limitation of the close subexpression. */
1795 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1797 int node = dest_nodes->elems[node_idx];
1798 if (!re_node_set_contains (dfa->inveclosures + node, cls_node)
1799 && !re_node_set_contains (dfa->eclosures + node, cls_node))
1801 /* It is against this limitation.
1802 Remove it form the current sifted state. */
1803 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1804 candidates);
1805 if (BE (err != REG_NOERROR, 0))
1806 return err;
1807 --node_idx;
1811 else /* (ent->subexp_to != str_idx) */
1813 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1815 int node = dest_nodes->elems[node_idx];
1816 re_token_type_t type= dfa->nodes[node].type;
1817 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1819 if (subexp_idx != dfa->nodes[node].opr.idx)
1820 continue;
1821 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1822 || (type == OP_OPEN_SUBEXP))
1824 /* It is against this limitation.
1825 Remove it form the current sifted state. */
1826 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1827 candidates);
1828 if (BE (err != REG_NOERROR, 0))
1829 return err;
1835 return REG_NOERROR;
1838 /* Search for the top (in case of sctx->check_subexp < 0) or the
1839 bottom (in case of sctx->check_subexp > 0) of the subexpressions
1840 which the backreference sctx->cur_bkref can match. */
1842 static reg_errcode_t
1843 search_subexp (preg, mctx, sctx, str_idx, dest_nodes)
1844 const regex_t *preg;
1845 re_match_context_t *mctx;
1846 re_sift_context_t *sctx;
1847 int str_idx;
1848 re_node_set *dest_nodes;
1850 reg_errcode_t err;
1851 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1852 re_sift_context_t local_sctx;
1853 int node_idx, node;
1854 const re_node_set *candidates;
1855 re_dfastate_t **lim_states = NULL;
1856 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1857 : &mctx->state_log[str_idx]->nodes);
1858 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1860 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1862 re_token_type_t type;
1863 node = dest_nodes->elems[node_idx];
1864 type = dfa->nodes[node].type;
1866 if (type == OP_CLOSE_SUBEXP
1867 && sctx->check_subexp == dfa->nodes[node].opr.idx + 1)
1869 re_dfastate_t *cur_state;
1870 /* Found the bottom of the subexpression, then search for the
1871 top of it. */
1872 if (local_sctx.sifted_states == NULL)
1874 /* It hasn't been initialized yet, initialize it now. */
1875 local_sctx = *sctx;
1876 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
1877 if (BE (err != REG_NOERROR, 0))
1878 goto free_return;
1880 local_sctx.check_subexp = -sctx->check_subexp;
1881 local_sctx.limited_states = sctx->limited_states;
1882 local_sctx.last_node = node;
1883 local_sctx.last_str_idx = local_sctx.cls_subexp_idx = str_idx;
1884 cur_state = local_sctx.sifted_states[str_idx];
1885 err = sift_states_backward (preg, mctx, &local_sctx);
1886 local_sctx.sifted_states[str_idx] = cur_state;
1887 if (BE (err != REG_NOERROR, 0))
1888 goto free_return;
1889 /* There must not 2 same node in a node set. */
1890 break;
1892 else if (type == OP_OPEN_SUBEXP
1893 && -sctx->check_subexp == dfa->nodes[node].opr.idx + 1)
1895 /* Found the top of the subexpression, check that the
1896 backreference can match the input string. */
1897 char *buf;
1898 int dest_str_idx;
1899 int bkref_str_idx = re_string_cur_idx (mctx->input);
1900 int subexp_len = sctx->cls_subexp_idx - str_idx;
1901 if (subexp_len < 0 || bkref_str_idx + subexp_len > mctx->input->len)
1902 break;
1904 if (bkref_str_idx + subexp_len > mctx->input->valid_len
1905 && mctx->input->valid_len < mctx->input->len)
1907 reg_errcode_t err;
1908 err = extend_buffers (mctx);
1909 if (BE (err != REG_NOERROR, 0))
1910 goto free_return;
1912 buf = (char *) re_string_get_buffer (mctx->input);
1913 if (strncmp (buf + str_idx, buf + bkref_str_idx, subexp_len) != 0)
1914 break;
1916 if (sctx->limits.nelem && str_idx > 0)
1918 re_dfastate_t *cur_state = sctx->sifted_states[str_idx];
1919 if (lim_states == NULL)
1921 lim_states = re_malloc (re_dfastate_t *, str_idx + 1);
1922 if (BE (lim_states == NULL, 0))
1924 err = REG_ESPACE;
1925 goto free_return;
1928 if (local_sctx.sifted_states == NULL)
1930 /* It hasn't been initialized yet, initialize it now. */
1931 local_sctx = *sctx;
1932 err = re_node_set_init_copy (&local_sctx.limits,
1933 &sctx->limits);
1934 if (BE (err != REG_NOERROR, 0))
1935 goto free_return;
1937 local_sctx.check_subexp = 0;
1938 local_sctx.last_node = node;
1939 local_sctx.last_str_idx = str_idx;
1940 local_sctx.limited_states = lim_states;
1941 memset (lim_states, '\0',
1942 sizeof (re_dfastate_t*) * (str_idx + 1));
1943 err = sift_states_backward (preg, mctx, &local_sctx);
1944 if (BE (err != REG_NOERROR, 0))
1945 goto free_return;
1946 if (local_sctx.sifted_states[0] == NULL
1947 && local_sctx.limited_states[0] == NULL)
1949 sctx->sifted_states[str_idx] = cur_state;
1950 break;
1952 sctx->sifted_states[str_idx] = cur_state;
1954 /* Successfully matched, add a new cache entry. */
1955 dest_str_idx = bkref_str_idx + subexp_len;
1956 err = match_ctx_add_entry (mctx, sctx->cur_bkref, bkref_str_idx,
1957 str_idx, sctx->cls_subexp_idx);
1958 if (BE (err != REG_NOERROR, 0))
1959 goto free_return;
1960 err = clean_state_log_if_need (mctx, dest_str_idx);
1961 if (BE (err != REG_NOERROR, 0))
1962 goto free_return;
1963 break;
1967 /* Remove the top/bottom of the sub expression we processed. */
1968 if (node_idx < dest_nodes->nelem)
1970 err = sub_epsilon_src_nodes(dfa, node, dest_nodes, candidates);
1971 if (BE (err != REG_NOERROR, 0))
1972 goto free_return;
1973 /* Update state_log. */
1974 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1975 if (BE (err != REG_NOERROR, 0))
1976 goto free_return;
1978 err = REG_NOERROR;
1979 free_return:
1980 if (local_sctx.sifted_states != NULL)
1981 re_node_set_free (&local_sctx.limits);
1982 if (lim_states != NULL)
1983 re_free (lim_states);
1984 return err;
1987 static reg_errcode_t
1988 sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1989 const regex_t *preg;
1990 re_match_context_t *mctx;
1991 re_sift_context_t *sctx;
1992 int str_idx;
1993 re_node_set *dest_nodes;
1995 reg_errcode_t err;
1996 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1997 int node_idx, node;
1998 re_sift_context_t local_sctx;
1999 const re_node_set *candidates;
2000 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
2001 : &mctx->state_log[str_idx]->nodes);
2002 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2004 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2006 int cur_bkref_idx = re_string_cur_idx (mctx->input);
2007 re_token_type_t type;
2008 node = candidates->elems[node_idx];
2009 type = dfa->nodes[node].type;
2010 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
2011 continue;
2012 /* Avoid infinite loop for the REs like "()\1+". */
2013 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2014 continue;
2015 if (type == OP_BACK_REF)
2017 int enabled_idx;
2018 for (enabled_idx = 0; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2020 int disabled_idx, subexp_len, to_idx, dst_node;
2021 struct re_backref_cache_entry *entry;
2022 entry = mctx->bkref_ents + enabled_idx;
2023 subexp_len = entry->subexp_to - entry->subexp_from;
2024 to_idx = str_idx + subexp_len;
2025 dst_node = (subexp_len ? dfa->nexts[node]
2026 : dfa->edests[node].elems[0]);
2028 if (entry->node != node || entry->str_idx != str_idx
2029 || to_idx > sctx->last_str_idx
2030 || sctx->sifted_states[to_idx] == NULL)
2031 continue;
2032 if (!STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node))
2033 continue;
2035 if (check_dst_limits (dfa, &sctx->limits, mctx, node,
2036 str_idx, dst_node, to_idx))
2037 continue;
2038 if (sctx->check_subexp == dfa->nodes[node].opr.idx)
2040 char *buf;
2041 buf = (char *) re_string_get_buffer (mctx->input);
2042 if (strncmp (buf + entry->subexp_from,
2043 buf + cur_bkref_idx, subexp_len) != 0)
2044 continue;
2045 err = match_ctx_add_entry (mctx, sctx->cur_bkref,
2046 cur_bkref_idx, entry->subexp_from,
2047 entry->subexp_to);
2048 if (BE (err != REG_NOERROR, 0))
2049 goto free_return;
2050 err = clean_state_log_if_need (mctx, cur_bkref_idx
2051 + subexp_len);
2052 if (BE (err != REG_NOERROR, 0))
2053 goto free_return;
2055 else
2057 re_dfastate_t *cur_state;
2058 entry->flag = 0;
2059 for (disabled_idx = enabled_idx + 1;
2060 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2062 struct re_backref_cache_entry *entry2;
2063 entry2 = mctx->bkref_ents + disabled_idx;
2064 if (entry2->node != node || entry2->str_idx != str_idx)
2065 continue;
2066 entry2->flag = 1;
2069 if (local_sctx.sifted_states == NULL)
2071 local_sctx = *sctx;
2072 err = re_node_set_init_copy (&local_sctx.limits,
2073 &sctx->limits);
2074 if (BE (err != REG_NOERROR, 0))
2075 goto free_return;
2077 local_sctx.last_node = node;
2078 local_sctx.last_str_idx = str_idx;
2079 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2080 if (BE (err < 0, 0))
2082 err = REG_ESPACE;
2083 goto free_return;
2085 cur_state = local_sctx.sifted_states[str_idx];
2086 err = sift_states_backward (preg, mctx, &local_sctx);
2087 if (BE (err != REG_NOERROR, 0))
2088 goto free_return;
2089 if (sctx->limited_states != NULL)
2091 err = merge_state_array (dfa, sctx->limited_states,
2092 local_sctx.sifted_states,
2093 str_idx + 1);
2094 if (BE (err != REG_NOERROR, 0))
2095 goto free_return;
2097 local_sctx.sifted_states[str_idx] = cur_state;
2098 re_node_set_remove (&local_sctx.limits, enabled_idx);
2099 /* We must not use the variable entry here, since
2100 mctx->bkref_ents might be realloced. */
2101 mctx->bkref_ents[enabled_idx].flag = 1;
2104 for (enabled_idx = 0; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2106 struct re_backref_cache_entry *entry;
2107 entry = mctx->bkref_ents + enabled_idx;
2108 if (entry->node == node && entry->str_idx == str_idx)
2109 entry->flag = 0;
2113 err = REG_NOERROR;
2114 free_return:
2115 if (local_sctx.sifted_states != NULL)
2117 re_node_set_free (&local_sctx.limits);
2120 return err;
2124 #ifdef RE_ENABLE_I18N
2125 static int
2126 sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2127 const regex_t *preg;
2128 const re_match_context_t *mctx;
2129 re_sift_context_t *sctx;
2130 int node_idx, str_idx, max_str_idx;
2132 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2133 int naccepted;
2134 /* Check the node can accept `multi byte'. */
2135 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2136 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2137 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2138 dfa->nexts[node_idx]))
2139 /* The node can't accept the `multi byte', or the
2140 destination was already throwed away, then the node
2141 could't accept the current input `multi byte'. */
2142 naccepted = 0;
2143 /* Otherwise, it is sure that the node could accept
2144 `naccepted' bytes input. */
2145 return naccepted;
2147 #endif /* RE_ENABLE_I18N */
2150 /* Functions for state transition. */
2152 /* Return the next state to which the current state STATE will transit by
2153 accepting the current input byte, and update STATE_LOG if necessary.
2154 If STATE can accept a multibyte char/collating element/back reference
2155 update the destination of STATE_LOG. */
2157 static re_dfastate_t *
2158 transit_state (err, preg, mctx, state, fl_search)
2159 reg_errcode_t *err;
2160 const regex_t *preg;
2161 re_match_context_t *mctx;
2162 re_dfastate_t *state;
2163 int fl_search;
2165 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2166 re_dfastate_t **trtable, *next_state;
2167 unsigned char ch;
2169 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2170 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
2171 && mctx->input->valid_len < mctx->input->len))
2173 *err = extend_buffers (mctx);
2174 if (BE (*err != REG_NOERROR, 0))
2175 return NULL;
2178 *err = REG_NOERROR;
2179 if (state == NULL)
2181 next_state = state;
2182 re_string_skip_bytes (mctx->input, 1);
2184 else
2186 #ifdef RE_ENABLE_I18N
2187 /* If the current state can accept multibyte. */
2188 if (state->accept_mb)
2190 *err = transit_state_mb (preg, state, mctx);
2191 if (BE (*err != REG_NOERROR, 0))
2192 return NULL;
2194 #endif /* RE_ENABLE_I18N */
2196 /* Then decide the next state with the single byte. */
2197 if (1)
2199 /* Use transition table */
2200 ch = re_string_fetch_byte (mctx->input);
2201 trtable = fl_search ? state->trtable_search : state->trtable;
2202 if (trtable == NULL)
2204 trtable = build_trtable (preg, state, fl_search);
2205 if (fl_search)
2206 state->trtable_search = trtable;
2207 else
2208 state->trtable = trtable;
2210 next_state = trtable[ch];
2212 else
2214 /* don't use transition table */
2215 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
2216 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2217 return NULL;
2221 /* Update the state_log if we need. */
2222 if (mctx->state_log != NULL)
2224 int cur_idx = re_string_cur_idx (mctx->input);
2225 if (cur_idx > mctx->state_log_top)
2227 mctx->state_log[cur_idx] = next_state;
2228 mctx->state_log_top = cur_idx;
2230 else if (mctx->state_log[cur_idx] == 0)
2232 mctx->state_log[cur_idx] = next_state;
2234 else
2236 re_dfastate_t *pstate;
2237 unsigned int context;
2238 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2239 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2240 the destination of a multibyte char/collating element/
2241 back reference. Then the next state is the union set of
2242 these destinations and the results of the transition table. */
2243 pstate = mctx->state_log[cur_idx];
2244 log_nodes = pstate->entrance_nodes;
2245 if (next_state != NULL)
2247 table_nodes = next_state->entrance_nodes;
2248 *err = re_node_set_init_union (&next_nodes, table_nodes,
2249 log_nodes);
2250 if (BE (*err != REG_NOERROR, 0))
2251 return NULL;
2253 else
2254 next_nodes = *log_nodes;
2255 /* Note: We already add the nodes of the initial state,
2256 then we don't need to add them here. */
2258 context = re_string_context_at (mctx->input,
2259 re_string_cur_idx (mctx->input) - 1,
2260 mctx->eflags, preg->newline_anchor);
2261 next_state = mctx->state_log[cur_idx]
2262 = re_acquire_state_context (err, dfa, &next_nodes, context);
2263 /* We don't need to check errors here, since the return value of
2264 this function is next_state and ERR is already set. */
2266 if (table_nodes != NULL)
2267 re_node_set_free (&next_nodes);
2269 /* If the next state has back references. */
2270 if (next_state != NULL && next_state->has_backref)
2272 *err = transit_state_bkref (preg, next_state, mctx);
2273 if (BE (*err != REG_NOERROR, 0))
2274 return NULL;
2275 next_state = mctx->state_log[cur_idx];
2278 return next_state;
2281 /* Helper functions for transit_state. */
2283 /* Return the next state to which the current state STATE will transit by
2284 accepting the current input byte. */
2286 static re_dfastate_t *
2287 transit_state_sb (err, preg, state, fl_search, mctx)
2288 reg_errcode_t *err;
2289 const regex_t *preg;
2290 re_dfastate_t *state;
2291 int fl_search;
2292 re_match_context_t *mctx;
2294 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2295 re_node_set next_nodes;
2296 re_dfastate_t *next_state;
2297 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
2298 unsigned int context;
2300 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2301 if (BE (*err != REG_NOERROR, 0))
2302 return NULL;
2303 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2305 int cur_node = state->nodes.elems[node_cnt];
2306 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
2308 *err = re_node_set_merge (&next_nodes,
2309 dfa->eclosures + dfa->nexts[cur_node]);
2310 if (BE (*err != REG_NOERROR, 0))
2312 re_node_set_free (&next_nodes);
2313 return NULL;
2317 if (fl_search)
2319 #ifdef RE_ENABLE_I18N
2320 int not_initial = 0;
2321 if (MB_CUR_MAX > 1)
2322 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
2323 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
2325 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
2326 break;
2328 if (!not_initial)
2329 #endif
2331 *err = re_node_set_merge (&next_nodes,
2332 dfa->init_state->entrance_nodes);
2333 if (BE (*err != REG_NOERROR, 0))
2335 re_node_set_free (&next_nodes);
2336 return NULL;
2340 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
2341 preg->newline_anchor);
2342 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2343 /* We don't need to check errors here, since the return value of
2344 this function is next_state and ERR is already set. */
2346 re_node_set_free (&next_nodes);
2347 re_string_skip_bytes (mctx->input, 1);
2348 return next_state;
2351 #ifdef RE_ENABLE_I18N
2352 static reg_errcode_t
2353 transit_state_mb (preg, pstate, mctx)
2354 const regex_t *preg;
2355 re_dfastate_t *pstate;
2356 re_match_context_t *mctx;
2358 reg_errcode_t err;
2359 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2360 int i;
2362 for (i = 0; i < pstate->nodes.nelem; ++i)
2364 re_node_set dest_nodes, *new_nodes;
2365 int cur_node_idx = pstate->nodes.elems[i];
2366 int naccepted = 0, dest_idx;
2367 unsigned int context;
2368 re_dfastate_t *dest_state;
2370 if (dfa->nodes[cur_node_idx].constraint)
2372 context = re_string_context_at (mctx->input,
2373 re_string_cur_idx (mctx->input),
2374 mctx->eflags, preg->newline_anchor);
2375 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2376 context))
2377 continue;
2380 /* How many bytes the node can accepts? */
2381 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2382 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2383 re_string_cur_idx (mctx->input));
2384 if (naccepted == 0)
2385 continue;
2387 /* The node can accepts `naccepted' bytes. */
2388 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
2389 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2390 : mctx->max_mb_elem_len);
2391 err = clean_state_log_if_need (mctx, dest_idx);
2392 if (BE (err != REG_NOERROR, 0))
2393 return err;
2394 #ifdef DEBUG
2395 assert (dfa->nexts[cur_node_idx] != -1);
2396 #endif
2397 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2398 then we use pstate->nodes.elems[i] instead. */
2399 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2401 dest_state = mctx->state_log[dest_idx];
2402 if (dest_state == NULL)
2403 dest_nodes = *new_nodes;
2404 else
2406 err = re_node_set_init_union (&dest_nodes,
2407 dest_state->entrance_nodes, new_nodes);
2408 if (BE (err != REG_NOERROR, 0))
2409 return err;
2411 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
2412 preg->newline_anchor);
2413 mctx->state_log[dest_idx]
2414 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2415 if (dest_state != NULL)
2416 re_node_set_free (&dest_nodes);
2417 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2418 return err;
2420 return REG_NOERROR;
2422 #endif /* RE_ENABLE_I18N */
2424 static reg_errcode_t
2425 transit_state_bkref (preg, pstate, mctx)
2426 const regex_t *preg;
2427 re_dfastate_t *pstate;
2428 re_match_context_t *mctx;
2430 reg_errcode_t err;
2431 re_dfastate_t **work_state_log;
2433 work_state_log = re_malloc (re_dfastate_t *,
2434 re_string_cur_idx (mctx->input) + 1);
2435 if (BE (work_state_log == NULL, 0))
2436 return REG_ESPACE;
2438 err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx);
2439 re_free (work_state_log);
2440 return err;
2443 /* Caller must allocate `work_state_log'. */
2445 static reg_errcode_t
2446 transit_state_bkref_loop (preg, nodes, work_state_log, mctx)
2447 const regex_t *preg;
2448 re_node_set *nodes;
2449 re_dfastate_t **work_state_log;
2450 re_match_context_t *mctx;
2452 reg_errcode_t err;
2453 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2454 int i;
2455 regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1);
2456 int cur_str_idx = re_string_cur_idx (mctx->input);
2457 if (BE (cur_regs == NULL, 0))
2458 return REG_ESPACE;
2460 for (i = 0; i < nodes->nelem; ++i)
2462 int dest_str_idx, subexp_idx, prev_nelem, bkc_idx;
2463 int node_idx = nodes->elems[i];
2464 unsigned int context;
2465 re_token_t *node = dfa->nodes + node_idx;
2466 re_node_set *new_dest_nodes;
2467 re_sift_context_t sctx;
2469 /* Check whether `node' is a backreference or not. */
2470 if (node->type == OP_BACK_REF)
2471 subexp_idx = node->opr.idx;
2472 else
2473 continue;
2475 if (node->constraint)
2477 context = re_string_context_at (mctx->input, cur_str_idx,
2478 mctx->eflags, preg->newline_anchor);
2479 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2480 continue;
2483 /* `node' is a backreference.
2484 Check the substring which the substring matched. */
2485 sift_ctx_init (&sctx, work_state_log, NULL, node_idx, cur_str_idx,
2486 subexp_idx);
2487 sctx.cur_bkref = node_idx;
2488 match_ctx_clear_flag (mctx);
2489 err = sift_states_backward (preg, mctx, &sctx);
2490 if (BE (err != REG_NOERROR, 0))
2491 goto free_return;
2493 /* And add the epsilon closures (which is `new_dest_nodes') of
2494 the backreference to appropriate state_log. */
2495 #ifdef DEBUG
2496 assert (dfa->nexts[node_idx] != -1);
2497 #endif
2498 for (bkc_idx = 0; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2500 int subexp_len;
2501 re_dfastate_t *dest_state;
2502 struct re_backref_cache_entry *bkref_ent;
2503 bkref_ent = mctx->bkref_ents + bkc_idx;
2504 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2505 continue;
2506 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2507 new_dest_nodes = (subexp_len == 0
2508 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2509 : dfa->eclosures + dfa->nexts[node_idx]);
2510 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2511 - bkref_ent->subexp_from);
2512 context = (IS_WORD_CHAR (re_string_byte_at (mctx->input,
2513 dest_str_idx - 1))
2514 ? CONTEXT_WORD : 0);
2515 dest_state = mctx->state_log[dest_str_idx];
2516 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2517 : mctx->state_log[cur_str_idx]->nodes.nelem);
2518 /* Add `new_dest_node' to state_log. */
2519 if (dest_state == NULL)
2521 mctx->state_log[dest_str_idx]
2522 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2523 context);
2524 if (BE (mctx->state_log[dest_str_idx] == NULL
2525 && err != REG_NOERROR, 0))
2526 goto free_return;
2528 else
2530 re_node_set dest_nodes;
2531 err = re_node_set_init_union (&dest_nodes,
2532 dest_state->entrance_nodes,
2533 new_dest_nodes);
2534 if (BE (err != REG_NOERROR, 0))
2536 re_node_set_free (&dest_nodes);
2537 goto free_return;
2539 mctx->state_log[dest_str_idx]
2540 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2541 re_node_set_free (&dest_nodes);
2542 if (BE (mctx->state_log[dest_str_idx] == NULL
2543 && err != REG_NOERROR, 0))
2544 goto free_return;
2546 /* We need to check recursively if the backreference can epsilon
2547 transit. */
2548 if (subexp_len == 0
2549 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2551 err = transit_state_bkref_loop (preg, new_dest_nodes,
2552 work_state_log, mctx);
2553 if (BE (err != REG_NOERROR, 0))
2554 goto free_return;
2558 err = REG_NOERROR;
2559 free_return:
2560 re_free (cur_regs);
2561 return err;
2564 /* Build transition table for the state.
2565 Return the new table if succeeded, otherwise return NULL. */
2567 static re_dfastate_t **
2568 build_trtable (preg, state, fl_search)
2569 const regex_t *preg;
2570 const re_dfastate_t *state;
2571 int fl_search;
2573 reg_errcode_t err;
2574 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2575 int i, j, k, ch;
2576 int dests_node_malloced = 0, dest_states_malloced = 0;
2577 int ndests; /* Number of the destination states from `state'. */
2578 re_dfastate_t **trtable;
2579 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
2580 re_node_set follows, *dests_node;
2581 bitset *dests_ch;
2582 bitset acceptable;
2584 /* We build DFA states which corresponds to the destination nodes
2585 from `state'. `dests_node[i]' represents the nodes which i-th
2586 destination state contains, and `dests_ch[i]' represents the
2587 characters which i-th destination state accepts. */
2588 #ifdef _LIBC
2589 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
2590 dests_node = (re_node_set *)
2591 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
2592 else
2593 #endif
2595 dests_node = (re_node_set *)
2596 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
2597 if (BE (dests_node == NULL, 0))
2598 return NULL;
2599 dests_node_malloced = 1;
2601 dests_ch = (bitset *) (dests_node + SBC_MAX);
2603 /* Initialize transiton table. */
2604 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
2605 if (BE (trtable == NULL, 0))
2607 if (dests_node_malloced)
2608 free (dests_node);
2609 return NULL;
2612 /* At first, group all nodes belonging to `state' into several
2613 destinations. */
2614 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
2615 if (BE (ndests <= 0, 0))
2617 if (dests_node_malloced)
2618 free (dests_node);
2619 /* Return NULL in case of an error, trtable otherwise. */
2620 if (ndests == 0)
2621 return trtable;
2622 free (trtable);
2623 return NULL;
2626 err = re_node_set_alloc (&follows, ndests + 1);
2627 if (BE (err != REG_NOERROR, 0))
2628 goto out_free;
2630 #ifdef _LIBC
2631 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
2632 + ndests * 3 * sizeof (re_dfastate_t *)))
2633 dest_states = (re_dfastate_t **)
2634 alloca (ndests * 3 * sizeof (re_dfastate_t *));
2635 else
2636 #endif
2638 dest_states = (re_dfastate_t **)
2639 malloc (ndests * 3 * sizeof (re_dfastate_t *));
2640 if (BE (dest_states == NULL, 0))
2642 out_free:
2643 if (dest_states_malloced)
2644 free (dest_states);
2645 re_node_set_free (&follows);
2646 for (i = 0; i < ndests; ++i)
2647 re_node_set_free (dests_node + i);
2648 free (trtable);
2649 if (dests_node_malloced)
2650 free (dests_node);
2651 return NULL;
2653 dest_states_malloced = 1;
2655 dest_states_word = dest_states + ndests;
2656 dest_states_nl = dest_states_word + ndests;
2657 bitset_empty (acceptable);
2659 /* Then build the states for all destinations. */
2660 for (i = 0; i < ndests; ++i)
2662 int next_node;
2663 re_node_set_empty (&follows);
2664 /* Merge the follows of this destination states. */
2665 for (j = 0; j < dests_node[i].nelem; ++j)
2667 next_node = dfa->nexts[dests_node[i].elems[j]];
2668 if (next_node != -1)
2670 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
2671 if (BE (err != REG_NOERROR, 0))
2672 goto out_free;
2675 /* If search flag is set, merge the initial state. */
2676 if (fl_search)
2678 #ifdef RE_ENABLE_I18N
2679 int not_initial = 0;
2680 for (j = 0; j < follows.nelem; ++j)
2681 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
2683 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
2684 break;
2686 if (!not_initial)
2687 #endif
2689 err = re_node_set_merge (&follows,
2690 dfa->init_state->entrance_nodes);
2691 if (BE (err != REG_NOERROR, 0))
2692 goto out_free;
2695 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
2696 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
2697 goto out_free;
2698 /* If the new state has context constraint,
2699 build appropriate states for these contexts. */
2700 if (dest_states[i]->has_constraint)
2702 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
2703 CONTEXT_WORD);
2704 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
2705 goto out_free;
2706 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
2707 CONTEXT_NEWLINE);
2708 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
2709 goto out_free;
2711 else
2713 dest_states_word[i] = dest_states[i];
2714 dest_states_nl[i] = dest_states[i];
2716 bitset_merge (acceptable, dests_ch[i]);
2719 /* Update the transition table. */
2720 /* For all characters ch...: */
2721 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
2722 for (j = 0; j < UINT_BITS; ++j, ++ch)
2723 if ((acceptable[i] >> j) & 1)
2725 /* The current state accepts the character ch. */
2726 if (IS_WORD_CHAR (ch))
2728 for (k = 0; k < ndests; ++k)
2729 if ((dests_ch[k][i] >> j) & 1)
2731 /* k-th destination accepts the word character ch. */
2732 trtable[ch] = dest_states_word[k];
2733 /* There must be only one destination which accepts
2734 character ch. See group_nodes_into_DFAstates. */
2735 break;
2738 else /* not WORD_CHAR */
2740 for (k = 0; k < ndests; ++k)
2741 if ((dests_ch[k][i] >> j) & 1)
2743 /* k-th destination accepts the non-word character ch. */
2744 trtable[ch] = dest_states[k];
2745 /* There must be only one destination which accepts
2746 character ch. See group_nodes_into_DFAstates. */
2747 break;
2751 /* new line */
2752 if (bitset_contain (acceptable, NEWLINE_CHAR))
2754 /* The current state accepts newline character. */
2755 for (k = 0; k < ndests; ++k)
2756 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
2758 /* k-th destination accepts newline character. */
2759 trtable[NEWLINE_CHAR] = dest_states_nl[k];
2760 /* There must be only one destination which accepts
2761 newline. See group_nodes_into_DFAstates. */
2762 break;
2766 if (dest_states_malloced)
2767 free (dest_states);
2769 re_node_set_free (&follows);
2770 for (i = 0; i < ndests; ++i)
2771 re_node_set_free (dests_node + i);
2773 if (dests_node_malloced)
2774 free (dests_node);
2776 return trtable;
2779 /* Group all nodes belonging to STATE into several destinations.
2780 Then for all destinations, set the nodes belonging to the destination
2781 to DESTS_NODE[i] and set the characters accepted by the destination
2782 to DEST_CH[i]. This function return the number of destinations. */
2784 static int
2785 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
2786 const regex_t *preg;
2787 const re_dfastate_t *state;
2788 re_node_set *dests_node;
2789 bitset *dests_ch;
2791 reg_errcode_t err;
2792 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2793 int i, j, k;
2794 int ndests; /* Number of the destinations from `state'. */
2795 bitset accepts; /* Characters a node can accept. */
2796 const re_node_set *cur_nodes = &state->nodes;
2797 bitset_empty (accepts);
2798 ndests = 0;
2800 /* For all the nodes belonging to `state', */
2801 for (i = 0; i < cur_nodes->nelem; ++i)
2803 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
2804 re_token_type_t type = node->type;
2805 unsigned int constraint = node->constraint;
2807 /* Enumerate all single byte character this node can accept. */
2808 if (type == CHARACTER)
2809 bitset_set (accepts, node->opr.c);
2810 else if (type == SIMPLE_BRACKET)
2812 bitset_merge (accepts, node->opr.sbcset);
2814 else if (type == OP_PERIOD)
2816 bitset_set_all (accepts);
2817 if (!(preg->syntax & RE_DOT_NEWLINE))
2818 bitset_clear (accepts, '\n');
2819 if (preg->syntax & RE_DOT_NOT_NULL)
2820 bitset_clear (accepts, '\0');
2822 else
2823 continue;
2825 /* Check the `accepts' and sift the characters which are not
2826 match it the context. */
2827 if (constraint)
2829 if (constraint & NEXT_WORD_CONSTRAINT)
2830 for (j = 0; j < BITSET_UINTS; ++j)
2831 accepts[j] &= dfa->word_char[j];
2832 if (constraint & NEXT_NOTWORD_CONSTRAINT)
2833 for (j = 0; j < BITSET_UINTS; ++j)
2834 accepts[j] &= ~dfa->word_char[j];
2835 if (constraint & NEXT_NEWLINE_CONSTRAINT)
2837 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
2838 bitset_empty (accepts);
2839 if (accepts_newline)
2840 bitset_set (accepts, NEWLINE_CHAR);
2841 else
2842 continue;
2846 /* Then divide `accepts' into DFA states, or create a new
2847 state. */
2848 for (j = 0; j < ndests; ++j)
2850 bitset intersec; /* Intersection sets, see below. */
2851 bitset remains;
2852 /* Flags, see below. */
2853 int has_intersec, not_subset, not_consumed;
2855 /* Optimization, skip if this state doesn't accept the character. */
2856 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
2857 continue;
2859 /* Enumerate the intersection set of this state and `accepts'. */
2860 has_intersec = 0;
2861 for (k = 0; k < BITSET_UINTS; ++k)
2862 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
2863 /* And skip if the intersection set is empty. */
2864 if (!has_intersec)
2865 continue;
2867 /* Then check if this state is a subset of `accepts'. */
2868 not_subset = not_consumed = 0;
2869 for (k = 0; k < BITSET_UINTS; ++k)
2871 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
2872 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
2875 /* If this state isn't a subset of `accepts', create a
2876 new group state, which has the `remains'. */
2877 if (not_subset)
2879 bitset_copy (dests_ch[ndests], remains);
2880 bitset_copy (dests_ch[j], intersec);
2881 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
2882 if (BE (err != REG_NOERROR, 0))
2883 goto error_return;
2884 ++ndests;
2887 /* Put the position in the current group. */
2888 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
2889 if (BE (err < 0, 0))
2890 goto error_return;
2892 /* If all characters are consumed, go to next node. */
2893 if (!not_consumed)
2894 break;
2896 /* Some characters remain, create a new group. */
2897 if (j == ndests)
2899 bitset_copy (dests_ch[ndests], accepts);
2900 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
2901 if (BE (err != REG_NOERROR, 0))
2902 goto error_return;
2903 ++ndests;
2904 bitset_empty (accepts);
2907 return ndests;
2908 error_return:
2909 for (j = 0; j < ndests; ++j)
2910 re_node_set_free (dests_node + j);
2911 return -1;
2914 #ifdef RE_ENABLE_I18N
2915 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2916 Return the number of the bytes the node accepts.
2917 STR_IDX is the current index of the input string.
2919 This function handles the nodes which can accept one character, or
2920 one collating element like '.', '[a-z]', opposite to the other nodes
2921 can only accept one byte. */
2923 static int
2924 check_node_accept_bytes (preg, node_idx, input, str_idx)
2925 const regex_t *preg;
2926 int node_idx, str_idx;
2927 const re_string_t *input;
2929 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2930 const re_token_t *node = dfa->nodes + node_idx;
2931 int elem_len = re_string_elem_size_at (input, str_idx);
2932 int char_len = re_string_char_size_at (input, str_idx);
2933 int i;
2934 # ifdef _LIBC
2935 int j;
2936 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2937 # endif /* _LIBC */
2938 if (elem_len <= 1 && char_len <= 1)
2939 return 0;
2940 if (node->type == OP_PERIOD)
2942 /* '.' accepts any one character except the following two cases. */
2943 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
2944 re_string_byte_at (input, str_idx) == '\n') ||
2945 ((preg->syntax & RE_DOT_NOT_NULL) &&
2946 re_string_byte_at (input, str_idx) == '\0'))
2947 return 0;
2948 return char_len;
2950 else if (node->type == COMPLEX_BRACKET)
2952 const re_charset_t *cset = node->opr.mbcset;
2953 # ifdef _LIBC
2954 const unsigned char *pin = re_string_get_buffer (input) + str_idx;
2955 # endif /* _LIBC */
2956 int match_len = 0;
2957 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
2958 ? re_string_wchar_at (input, str_idx) : 0);
2960 /* match with multibyte character? */
2961 for (i = 0; i < cset->nmbchars; ++i)
2962 if (wc == cset->mbchars[i])
2964 match_len = char_len;
2965 goto check_node_accept_bytes_match;
2967 /* match with character_class? */
2968 for (i = 0; i < cset->nchar_classes; ++i)
2970 wctype_t wt = cset->char_classes[i];
2971 if (__iswctype (wc, wt))
2973 match_len = char_len;
2974 goto check_node_accept_bytes_match;
2978 # ifdef _LIBC
2979 if (nrules != 0)
2981 unsigned int in_collseq = 0;
2982 const int32_t *table, *indirect;
2983 const unsigned char *weights, *extra;
2984 const char *collseqwc;
2985 int32_t idx;
2986 /* This #include defines a local function! */
2987 # include <locale/weight.h>
2989 /* match with collating_symbol? */
2990 if (cset->ncoll_syms)
2991 extra = (const unsigned char *)
2992 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2993 for (i = 0; i < cset->ncoll_syms; ++i)
2995 const unsigned char *coll_sym = extra + cset->coll_syms[i];
2996 /* Compare the length of input collating element and
2997 the length of current collating element. */
2998 if (*coll_sym != elem_len)
2999 continue;
3000 /* Compare each bytes. */
3001 for (j = 0; j < *coll_sym; j++)
3002 if (pin[j] != coll_sym[1 + j])
3003 break;
3004 if (j == *coll_sym)
3006 /* Match if every bytes is equal. */
3007 match_len = j;
3008 goto check_node_accept_bytes_match;
3012 if (cset->nranges)
3014 if (elem_len <= char_len)
3016 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3017 in_collseq = collseq_table_lookup (collseqwc, wc);
3019 else
3020 in_collseq = find_collation_sequence_value (pin, elem_len);
3022 /* match with range expression? */
3023 for (i = 0; i < cset->nranges; ++i)
3024 if (cset->range_starts[i] <= in_collseq
3025 && in_collseq <= cset->range_ends[i])
3027 match_len = elem_len;
3028 goto check_node_accept_bytes_match;
3031 /* match with equivalence_class? */
3032 if (cset->nequiv_classes)
3034 const unsigned char *cp = pin;
3035 table = (const int32_t *)
3036 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3037 weights = (const unsigned char *)
3038 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3039 extra = (const unsigned char *)
3040 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3041 indirect = (const int32_t *)
3042 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3043 idx = findidx (&cp);
3044 if (idx > 0)
3045 for (i = 0; i < cset->nequiv_classes; ++i)
3047 int32_t equiv_class_idx = cset->equiv_classes[i];
3048 size_t weight_len = weights[idx];
3049 if (weight_len == weights[equiv_class_idx])
3051 int cnt = 0;
3052 while (cnt <= weight_len
3053 && (weights[equiv_class_idx + 1 + cnt]
3054 == weights[idx + 1 + cnt]))
3055 ++cnt;
3056 if (cnt > weight_len)
3058 match_len = elem_len;
3059 goto check_node_accept_bytes_match;
3065 else
3066 # endif /* _LIBC */
3068 /* match with range expression? */
3069 #if __GNUC__ >= 2
3070 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3071 #else
3072 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3073 cmp_buf[2] = wc;
3074 #endif
3075 for (i = 0; i < cset->nranges; ++i)
3077 cmp_buf[0] = cset->range_starts[i];
3078 cmp_buf[4] = cset->range_ends[i];
3079 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3080 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3082 match_len = char_len;
3083 goto check_node_accept_bytes_match;
3087 check_node_accept_bytes_match:
3088 if (!cset->non_match)
3089 return match_len;
3090 else
3092 if (match_len > 0)
3093 return 0;
3094 else
3095 return (elem_len > char_len) ? elem_len : char_len;
3098 return 0;
3101 # ifdef _LIBC
3102 static unsigned int
3103 find_collation_sequence_value (mbs, mbs_len)
3104 const unsigned char *mbs;
3105 size_t mbs_len;
3107 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3108 if (nrules == 0)
3110 if (mbs_len == 1)
3112 /* No valid character. Match it as a single byte character. */
3113 const unsigned char *collseq = (const unsigned char *)
3114 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3115 return collseq[mbs[0]];
3117 return UINT_MAX;
3119 else
3121 int32_t idx;
3122 const unsigned char *extra = (const unsigned char *)
3123 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3125 for (idx = 0; ;)
3127 int mbs_cnt, found = 0;
3128 int32_t elem_mbs_len;
3129 /* Skip the name of collating element name. */
3130 idx = idx + extra[idx] + 1;
3131 elem_mbs_len = extra[idx++];
3132 if (mbs_len == elem_mbs_len)
3134 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3135 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3136 break;
3137 if (mbs_cnt == elem_mbs_len)
3138 /* Found the entry. */
3139 found = 1;
3141 /* Skip the byte sequence of the collating element. */
3142 idx += elem_mbs_len;
3143 /* Adjust for the alignment. */
3144 idx = (idx + 3) & ~3;
3145 /* Skip the collation sequence value. */
3146 idx += sizeof (uint32_t);
3147 /* Skip the wide char sequence of the collating element. */
3148 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3149 /* If we found the entry, return the sequence value. */
3150 if (found)
3151 return *(uint32_t *) (extra + idx);
3152 /* Skip the collation sequence value. */
3153 idx += sizeof (uint32_t);
3157 # endif /* _LIBC */
3158 #endif /* RE_ENABLE_I18N */
3160 /* Check whether the node accepts the byte which is IDX-th
3161 byte of the INPUT. */
3163 static int
3164 check_node_accept (preg, node, mctx, idx)
3165 const regex_t *preg;
3166 const re_token_t *node;
3167 const re_match_context_t *mctx;
3168 int idx;
3170 unsigned char ch;
3171 if (node->constraint)
3173 /* The node has constraints. Check whether the current context
3174 satisfies the constraints. */
3175 unsigned int context = re_string_context_at (mctx->input, idx,
3176 mctx->eflags,
3177 preg->newline_anchor);
3178 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3179 return 0;
3181 ch = re_string_byte_at (mctx->input, idx);
3182 if (node->type == CHARACTER)
3183 return node->opr.c == ch;
3184 else if (node->type == SIMPLE_BRACKET)
3185 return bitset_contain (node->opr.sbcset, ch);
3186 else if (node->type == OP_PERIOD)
3187 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3188 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3189 else
3190 return 0;
3193 /* Extend the buffers, if the buffers have run out. */
3195 static reg_errcode_t
3196 extend_buffers (mctx)
3197 re_match_context_t *mctx;
3199 reg_errcode_t ret;
3200 re_string_t *pstr = mctx->input;
3202 /* Double the lengthes of the buffers. */
3203 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3204 if (BE (ret != REG_NOERROR, 0))
3205 return ret;
3207 if (mctx->state_log != NULL)
3209 /* And double the length of state_log. */
3210 re_dfastate_t **new_array;
3211 new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3212 pstr->bufs_len * 2);
3213 if (BE (new_array == NULL, 0))
3214 return REG_ESPACE;
3215 mctx->state_log = new_array;
3218 /* Then reconstruct the buffers. */
3219 if (pstr->icase)
3221 #ifdef RE_ENABLE_I18N
3222 if (MB_CUR_MAX > 1)
3223 build_wcs_upper_buffer (pstr);
3224 else
3225 #endif /* RE_ENABLE_I18N */
3226 build_upper_buffer (pstr);
3228 else
3230 #ifdef RE_ENABLE_I18N
3231 if (MB_CUR_MAX > 1)
3232 build_wcs_buffer (pstr);
3233 else
3234 #endif /* RE_ENABLE_I18N */
3236 if (pstr->trans != NULL)
3237 re_string_translate_buffer (pstr);
3238 else
3239 pstr->valid_len = pstr->bufs_len;
3242 return REG_NOERROR;
3246 /* Functions for matching context. */
3248 static reg_errcode_t
3249 match_ctx_init (mctx, eflags, input, n)
3250 re_match_context_t *mctx;
3251 int eflags, n;
3252 re_string_t *input;
3254 mctx->eflags = eflags;
3255 mctx->input = input;
3256 mctx->match_last = -1;
3257 if (n > 0)
3259 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
3260 if (BE (mctx->bkref_ents == NULL, 0))
3261 return REG_ESPACE;
3263 else
3264 mctx->bkref_ents = NULL;
3265 mctx->nbkref_ents = 0;
3266 mctx->abkref_ents = n;
3267 mctx->max_mb_elem_len = 0;
3268 return REG_NOERROR;
3271 static void
3272 match_ctx_free (mctx)
3273 re_match_context_t *mctx;
3275 re_free (mctx->bkref_ents);
3278 /* Add a new backreference entry to the cache. */
3280 static reg_errcode_t
3281 match_ctx_add_entry (mctx, node, str_idx, from, to)
3282 re_match_context_t *mctx;
3283 int node, str_idx, from, to;
3285 if (mctx->nbkref_ents >= mctx->abkref_ents)
3287 struct re_backref_cache_entry* new_entry;
3288 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
3289 mctx->abkref_ents * 2);
3290 if (BE (new_entry == NULL, 0))
3292 re_free (mctx->bkref_ents);
3293 return REG_ESPACE;
3295 mctx->bkref_ents = new_entry;
3296 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
3297 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3298 mctx->abkref_ents *= 2;
3300 mctx->bkref_ents[mctx->nbkref_ents].node = node;
3301 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
3302 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
3303 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
3304 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
3305 if (mctx->max_mb_elem_len < to - from)
3306 mctx->max_mb_elem_len = to - from;
3307 return REG_NOERROR;
3310 static void
3311 match_ctx_clear_flag (mctx)
3312 re_match_context_t *mctx;
3314 int i;
3315 for (i = 0; i < mctx->nbkref_ents; ++i)
3317 mctx->bkref_ents[i].flag = 0;
3321 static void
3322 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
3323 check_subexp)
3324 re_sift_context_t *sctx;
3325 re_dfastate_t **sifted_sts, **limited_sts;
3326 int last_node, last_str_idx, check_subexp;
3328 sctx->sifted_states = sifted_sts;
3329 sctx->limited_states = limited_sts;
3330 sctx->last_node = last_node;
3331 sctx->last_str_idx = last_str_idx;
3332 sctx->check_subexp = check_subexp;
3333 sctx->cur_bkref = -1;
3334 sctx->cls_subexp_idx = -1;
3335 re_node_set_init_empty (&sctx->limits);