Update.
[glibc.git] / posix / regexec.c
blob8988c6633c799b496dac7b8580ddaab9cc58e5c9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #if defined HAVE_WCHAR_H || defined _LIBC
28 # include <wchar.h>
29 #endif /* HAVE_WCHAR_H || _LIBC */
30 #if defined HAVE_WCTYPE_H || defined _LIBC
31 # include <wctype.h>
32 #endif /* HAVE_WCTYPE_H || _LIBC */
34 #ifdef _LIBC
35 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
36 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
37 # include <locale/localeinfo.h>
38 # include <locale/elem-hash.h>
39 # include <locale/coll-lookup.h>
40 # endif
41 #endif
43 #include "regex.h"
44 #include "regex_internal.h"
46 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
47 re_string_t *input, int n);
48 static void match_ctx_free (re_match_context_t *cache);
49 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
50 int str_idx, int from, int to);
51 static void match_ctx_clear_flag (re_match_context_t *mctx);
52 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
53 re_dfastate_t **limited_sts, int last_node,
54 int last_str_idx, int check_subexp);
55 static reg_errcode_t re_search_internal (const regex_t *preg,
56 const char *string, int length,
57 int start, int range, int stop,
58 size_t nmatch, regmatch_t pmatch[],
59 int eflags);
60 static int re_search_2_stub (struct re_pattern_buffer *bufp,
61 const char *string1, int length1,
62 const char *string2, int length2,
63 int start, int range, struct re_registers *regs,
64 int stop, int ret_len);
65 static int re_search_stub (struct re_pattern_buffer *bufp,
66 const char *string, int length, int start,
67 int range, int stop, struct re_registers *regs,
68 int ret_len);
69 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
70 int nregs, int regs_allocated);
71 static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
72 const regex_t *preg,
73 const re_match_context_t *mctx,
74 int idx);
75 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
76 int fl_search, int fl_longest_match);
77 static int check_halt_node_context (const re_dfa_t *dfa, int node,
78 unsigned int context);
79 static int check_halt_state_context (const regex_t *preg,
80 const re_dfastate_t *state,
81 const re_match_context_t *mctx, int idx);
82 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
83 int cur_idx, int nmatch);
84 static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
85 const re_match_context_t *mctx,
86 int *pidx, int node, re_node_set *eps_via_nodes,
87 struct re_fail_stack_t *fs);
88 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
89 int str_idx, int *dests, int nregs,
90 regmatch_t *regs,
91 re_node_set *eps_via_nodes);
92 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
93 regmatch_t *regs, re_node_set *eps_via_nodes);
94 static reg_errcode_t set_regs (const regex_t *preg,
95 const re_match_context_t *mctx,
96 size_t nmatch, regmatch_t *pmatch,
97 int fl_backtrack);
98 #ifdef RE_ENABLE_I18N
99 static int sift_states_iter_mb (const regex_t *preg,
100 const re_match_context_t *mctx,
101 re_sift_context_t *sctx,
102 int node_idx, int str_idx, int max_str_idx);
103 #endif /* RE_ENABLE_I18N */
104 static reg_errcode_t sift_states_backward (const regex_t *preg,
105 re_match_context_t *mctx,
106 re_sift_context_t *sctx);
107 static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
108 re_match_context_t *mctx,
109 re_sift_context_t *sctx,
110 int str_idx,
111 re_node_set *dest_nodes);
112 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates);
115 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
116 re_node_set *dest_nodes,
117 const re_node_set *and_nodes);
118 static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
119 re_match_context_t *mctx, int dst_node,
120 int dst_idx, int src_node, int src_idx);
121 static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
122 int limit, re_node_set *eclosures,
123 int subexp_idx, int node, int str_idx);
124 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
125 re_node_set *dest_nodes,
126 const re_node_set *candidates,
127 re_node_set *limits,
128 struct re_backref_cache_entry *bkref_ents,
129 int str_idx);
130 static reg_errcode_t search_subexp (const regex_t *preg,
131 re_match_context_t *mctx,
132 re_sift_context_t *sctx, int str_idx,
133 re_node_set *dest_nodes);
134 static reg_errcode_t sift_states_bkref (const regex_t *preg,
135 re_match_context_t *mctx,
136 re_sift_context_t *sctx,
137 int str_idx, re_node_set *dest_nodes);
138 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
139 int next_state_log_idx);
140 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
141 re_dfastate_t **src, int num);
142 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
143 re_match_context_t *mctx,
144 re_dfastate_t *state, int fl_search);
145 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
146 re_dfastate_t *pstate,
147 int fl_search,
148 re_match_context_t *mctx);
149 #ifdef RE_ENABLE_I18N
150 static reg_errcode_t transit_state_mb (const regex_t *preg,
151 re_dfastate_t *pstate,
152 re_match_context_t *mctx);
153 #endif /* RE_ENABLE_I18N */
154 static reg_errcode_t transit_state_bkref (const regex_t *preg,
155 re_dfastate_t *pstate,
156 re_match_context_t *mctx);
157 static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
158 re_node_set *nodes,
159 re_dfastate_t **work_state_log,
160 re_match_context_t *mctx);
161 static re_dfastate_t **build_trtable (const regex_t *dfa,
162 const re_dfastate_t *state,
163 int fl_search);
164 #ifdef RE_ENABLE_I18N
165 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
166 const re_string_t *input, int idx);
167 # ifdef _LIBC
168 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
169 size_t name_len);
170 # endif /* _LIBC */
171 #endif /* RE_ENABLE_I18N */
172 static int group_nodes_into_DFAstates (const regex_t *dfa,
173 const re_dfastate_t *state,
174 re_node_set *states_node,
175 bitset *states_ch);
176 static int check_node_accept (const regex_t *preg, const re_token_t *node,
177 const re_match_context_t *mctx, int idx);
178 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
180 /* Entry point for POSIX code. */
182 /* regexec searches for a given pattern, specified by PREG, in the
183 string STRING.
185 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
186 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
187 least NMATCH elements, and we set them to the offsets of the
188 corresponding matched substrings.
190 EFLAGS specifies `execution flags' which affect matching: if
191 REG_NOTBOL is set, then ^ does not match at the beginning of the
192 string; if REG_NOTEOL is set, then $ does not match at the end.
194 We return 0 if we find a match and REG_NOMATCH if not. */
197 regexec (preg, string, nmatch, pmatch, eflags)
198 const regex_t *__restrict preg;
199 const char *__restrict string;
200 size_t nmatch;
201 regmatch_t pmatch[];
202 int eflags;
204 reg_errcode_t err;
205 int length = strlen (string);
206 if (preg->no_sub)
207 err = re_search_internal (preg, string, length, 0, length, length, 0,
208 NULL, eflags);
209 else
210 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
211 pmatch, eflags);
212 return err != REG_NOERROR;
214 #ifdef _LIBC
215 weak_alias (__regexec, regexec)
216 #endif
218 /* Entry points for GNU code. */
220 /* re_match, re_search, re_match_2, re_search_2
222 The former two functions operate on STRING with length LENGTH,
223 while the later two operate on concatenation of STRING1 and STRING2
224 with lengths LENGTH1 and LENGTH2, respectively.
226 re_match() matches the compiled pattern in BUFP against the string,
227 starting at index START.
229 re_search() first tries matching at index START, then it tries to match
230 starting from index START + 1, and so on. The last start position tried
231 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
232 way as re_match().)
234 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
235 the first STOP characters of the concatenation of the strings should be
236 concerned.
238 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
239 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
240 computed relative to the concatenation, not relative to the individual
241 strings.)
243 On success, re_match* functions return the length of the match, re_search*
244 return the position of the start of the match. Return value -1 means no
245 match was found and -2 indicates an internal error. */
248 re_match (bufp, string, length, start, regs)
249 struct re_pattern_buffer *bufp;
250 const char *string;
251 int length, start;
252 struct re_registers *regs;
254 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
256 #ifdef _LIBC
257 weak_alias (__re_match, re_match)
258 #endif
261 re_search (bufp, string, length, start, range, regs)
262 struct re_pattern_buffer *bufp;
263 const char *string;
264 int length, start, range;
265 struct re_registers *regs;
267 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
269 #ifdef _LIBC
270 weak_alias (__re_search, re_search)
271 #endif
274 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
275 struct re_pattern_buffer *bufp;
276 const char *string1, *string2;
277 int length1, length2, start, stop;
278 struct re_registers *regs;
280 return re_search_2_stub (bufp, string1, length1, string2, length2,
281 start, 0, regs, stop, 1);
283 #ifdef _LIBC
284 weak_alias (__re_match_2, re_match_2)
285 #endif
288 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
289 struct re_pattern_buffer *bufp;
290 const char *string1, *string2;
291 int length1, length2, start, range, stop;
292 struct re_registers *regs;
294 return re_search_2_stub (bufp, string1, length1, string2, length2,
295 start, range, regs, stop, 0);
297 #ifdef _LIBC
298 weak_alias (__re_search_2, re_search_2)
299 #endif
301 static int
302 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
303 stop, ret_len)
304 struct re_pattern_buffer *bufp;
305 const char *string1, *string2;
306 int length1, length2, start, range, stop, ret_len;
307 struct re_registers *regs;
309 const char *str;
310 int rval;
311 int len = length1 + length2;
312 int free_str = 0;
314 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
315 return -2;
317 /* Concatenate the strings. */
318 if (length2 > 0)
319 if (length1 > 0)
321 char *s = re_malloc (char, len);
323 if (BE (s == NULL, 0))
324 return -2;
325 memcpy (s, string1, length1);
326 memcpy (s + length1, string2, length2);
327 str = s;
328 free_str = 1;
330 else
331 str = string2;
332 else
333 str = string1;
335 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
336 ret_len);
337 if (free_str)
338 re_free ((char *) str);
339 return rval;
342 /* The parameters have the same meaning as those of re_search.
343 Additional parameters:
344 If RET_LEN is nonzero the length of the match is returned (re_match style);
345 otherwise the position of the match is returned. */
347 static int
348 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
349 struct re_pattern_buffer *bufp;
350 const char *string;
351 int length, start, range, stop, ret_len;
352 struct re_registers *regs;
354 reg_errcode_t result;
355 regmatch_t *pmatch;
356 int nregs, rval;
357 int eflags = 0;
359 /* Check for out-of-range. */
360 if (BE (start < 0 || start > length || range < 0, 0))
361 return -1;
362 if (BE (start + range > length, 0))
363 range = length - start;
365 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
366 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
368 /* Compile fastmap if we haven't yet. */
369 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
370 re_compile_fastmap (bufp);
372 if (BE (bufp->no_sub, 0))
373 regs = NULL;
375 /* We need at least 1 register. */
376 if (regs == NULL)
377 nregs = 1;
378 else if (BE (bufp->regs_allocated == REGS_FIXED &&
379 regs->num_regs < bufp->re_nsub + 1, 0))
381 nregs = regs->num_regs;
382 if (BE (nregs < 1, 0))
384 /* Nothing can be copied to regs. */
385 regs = NULL;
386 nregs = 1;
389 else
390 nregs = bufp->re_nsub + 1;
391 pmatch = re_malloc (regmatch_t, nregs);
392 if (BE (pmatch == NULL, 0))
393 return -2;
395 result = re_search_internal (bufp, string, length, start, range, stop,
396 nregs, pmatch, eflags);
398 rval = 0;
400 /* I hope we needn't fill ther regs with -1's when no match was found. */
401 if (result != REG_NOERROR)
402 rval = -1;
403 else if (regs != NULL)
405 /* If caller wants register contents data back, copy them. */
406 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
407 bufp->regs_allocated);
408 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
409 rval = -2;
412 if (BE (rval == 0, 1))
414 if (ret_len)
416 assert (pmatch[0].rm_so == start);
417 rval = pmatch[0].rm_eo - start;
419 else
420 rval = pmatch[0].rm_so;
422 re_free (pmatch);
423 return rval;
426 static unsigned
427 re_copy_regs (regs, pmatch, nregs, regs_allocated)
428 struct re_registers *regs;
429 regmatch_t *pmatch;
430 int nregs, regs_allocated;
432 int rval = REGS_REALLOCATE;
433 int i;
434 int need_regs = nregs + 1;
435 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
436 uses. */
438 /* Have the register data arrays been allocated? */
439 if (regs_allocated == REGS_UNALLOCATED)
440 { /* No. So allocate them with malloc. */
441 regs->start = re_malloc (regoff_t, need_regs);
442 if (BE (regs->start == NULL, 0))
443 return REGS_UNALLOCATED;
444 regs->end = re_malloc (regoff_t, need_regs);
445 if (BE (regs->end == NULL, 0))
447 re_free (regs->start);
448 return REGS_UNALLOCATED;
450 regs->num_regs = need_regs;
452 else if (regs_allocated == REGS_REALLOCATE)
453 { /* Yes. If we need more elements than were already
454 allocated, reallocate them. If we need fewer, just
455 leave it alone. */
456 if (need_regs > regs->num_regs)
458 regs->start = re_realloc (regs->start, regoff_t, need_regs);
459 if (BE (regs->start == NULL, 0))
461 if (regs->end != NULL)
462 re_free (regs->end);
463 return REGS_UNALLOCATED;
465 regs->end = re_realloc (regs->end, regoff_t, need_regs);
466 if (BE (regs->end == NULL, 0))
468 re_free (regs->start);
469 return REGS_UNALLOCATED;
471 regs->num_regs = need_regs;
474 else
476 assert (regs_allocated == REGS_FIXED);
477 /* This function may not be called with REGS_FIXED and nregs too big. */
478 assert (regs->num_regs >= nregs);
479 rval = REGS_FIXED;
482 /* Copy the regs. */
483 for (i = 0; i < nregs; ++i)
485 regs->start[i] = pmatch[i].rm_so;
486 regs->end[i] = pmatch[i].rm_eo;
488 for ( ; i < regs->num_regs; ++i)
489 regs->start[i] = regs->end[i] = -1;
491 return rval;
494 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
495 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
496 this memory for recording register information. STARTS and ENDS
497 must be allocated using the malloc library routine, and must each
498 be at least NUM_REGS * sizeof (regoff_t) bytes long.
500 If NUM_REGS == 0, then subsequent matches should allocate their own
501 register data.
503 Unless this function is called, the first search or match using
504 PATTERN_BUFFER will allocate its own register data, without
505 freeing the old data. */
507 void
508 re_set_registers (bufp, regs, num_regs, starts, ends)
509 struct re_pattern_buffer *bufp;
510 struct re_registers *regs;
511 unsigned num_regs;
512 regoff_t *starts, *ends;
514 if (num_regs)
516 bufp->regs_allocated = REGS_REALLOCATE;
517 regs->num_regs = num_regs;
518 regs->start = starts;
519 regs->end = ends;
521 else
523 bufp->regs_allocated = REGS_UNALLOCATED;
524 regs->num_regs = 0;
525 regs->start = regs->end = (regoff_t *) 0;
528 #ifdef _LIBC
529 weak_alias (__re_set_registers, re_set_registers)
530 #endif
532 /* Entry points compatible with 4.2 BSD regex library. We don't define
533 them unless specifically requested. */
535 #if defined _REGEX_RE_COMP || defined _LIBC
537 # ifdef _LIBC
538 weak_function
539 # endif
540 re_exec (s)
541 const char *s;
543 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
545 #endif /* _REGEX_RE_COMP */
547 static re_node_set empty_set;
549 /* Internal entry point. */
551 /* Searches for a compiled pattern PREG in the string STRING, whose
552 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
553 mingings with regexec. START, and RANGE have the same meanings
554 with re_search.
555 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
556 otherwise return the error code.
557 Note: We assume front end functions already check ranges.
558 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
560 static reg_errcode_t
561 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
562 eflags)
563 const regex_t *preg;
564 const char *string;
565 int length, start, range, stop, eflags;
566 size_t nmatch;
567 regmatch_t pmatch[];
569 reg_errcode_t err;
570 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
571 re_string_t input;
572 int left_lim, right_lim, incr;
573 int fl_longest_match, match_first, match_last = -1;
574 re_match_context_t mctx;
575 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate)
576 ? preg->fastmap : NULL);
578 /* Check if the DFA haven't been compiled. */
579 if (BE (preg->used == 0 || dfa->init_state == NULL
580 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
581 || dfa->init_state_begbuf == NULL, 0))
582 return REG_NOMATCH;
584 re_node_set_init_empty (&empty_set);
586 /* We must check the longest matching, if nmatch > 0. */
587 fl_longest_match = (nmatch != 0);
589 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
590 preg->translate, preg->syntax & RE_ICASE);
591 if (BE (err != REG_NOERROR, 0))
592 return err;
593 input.stop = stop;
595 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
596 if (BE (err != REG_NOERROR, 0))
597 return err;
599 /* We will log all the DFA states through which the dfa pass,
600 if nmatch > 1, or this dfa has "multibyte node", which is a
601 back-reference or a node which can accept multibyte character or
602 multi character collating element. */
603 if (nmatch > 1 || dfa->has_mb_node)
605 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
606 if (BE (mctx.state_log == NULL, 0))
607 return REG_ESPACE;
609 else
610 mctx.state_log = NULL;
612 #ifdef DEBUG
613 /* We assume front-end functions already check them. */
614 assert (start + range >= 0 && start + range <= length);
615 #endif
617 match_first = start;
618 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
619 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
621 /* Check incrementally whether of not the input string match. */
622 incr = (range < 0) ? -1 : 1;
623 left_lim = (range < 0) ? start + range : start;
624 right_lim = (range < 0) ? start : start + range;
626 for (;;)
628 /* At first get the current byte from input string. */
629 int ch;
630 if (MB_CUR_MAX > 1 && (preg->syntax & RE_ICASE || preg->translate))
632 /* In this case, we can't determin easily the current byte,
633 since it might be a component byte of a multibyte character.
634 Then we use the constructed buffer instead. */
635 /* If MATCH_FIRST is out of the valid range, reconstruct the
636 buffers. */
637 if (input.raw_mbs_idx + input.valid_len <= match_first)
638 re_string_reconstruct (&input, match_first, eflags,
639 preg->newline_anchor);
640 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
641 Note that MATCH_FIRST must not be smaller than 0. */
642 ch = ((match_first >= length) ? 0
643 : re_string_byte_at (&input, match_first - input.raw_mbs_idx));
645 else
647 /* We apply translate/conversion manually, since it is trivial
648 in this case. */
649 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
650 Note that MATCH_FIRST must not be smaller than 0. */
651 ch = (match_first < length) ? (unsigned char)string[match_first] : 0;
652 /* Apply translation if we need. */
653 ch = preg->translate ? preg->translate[ch] : ch;
654 /* In case of case insensitive mode, convert to upper case. */
655 ch = ((preg->syntax & RE_ICASE) && islower (ch)) ? toupper (ch) : ch;
658 /* Eliminate inappropriate one by fastmap. */
659 if (preg->can_be_null || fastmap == NULL || fastmap[ch])
661 /* Reconstruct the buffers so that the matcher can assume that
662 the matching starts from the begining of the buffer. */
663 re_string_reconstruct (&input, match_first, eflags,
664 preg->newline_anchor);
665 #ifdef RE_ENABLE_I18N
666 /* Eliminate it when it is a component of a multibyte character
667 and isn't the head of a multibyte character. */
668 if (MB_CUR_MAX == 1 || re_string_first_byte (&input, 0))
669 #endif
671 /* It seems to be appropriate one, then use the matcher. */
672 /* We assume that the matching starts from 0. */
673 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
674 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
675 if (match_last != -1)
677 if (BE (match_last == -2, 0))
678 return REG_ESPACE;
679 else
680 break; /* We found a matching. */
684 /* Update counter. */
685 match_first += incr;
686 if (match_first < left_lim || right_lim < match_first)
687 break;
690 /* Set pmatch[] if we need. */
691 if (match_last != -1 && nmatch > 0)
693 int reg_idx;
695 /* Initialize registers. */
696 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
697 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
699 /* Set the points where matching start/end. */
700 pmatch[0].rm_so = 0;
701 mctx.match_last = pmatch[0].rm_eo = match_last;
703 if (!preg->no_sub && nmatch > 1)
705 /* We need the ranges of all the subexpressions. */
706 int halt_node;
707 re_dfastate_t **sifted_states;
708 re_dfastate_t **lim_states = NULL;
709 re_dfastate_t *pstate = mctx.state_log[match_last];
710 re_sift_context_t sctx;
711 #ifdef DEBUG
712 assert (mctx.state_log != NULL);
713 #endif
714 halt_node = check_halt_state_context (preg, pstate, &mctx,
715 match_last);
716 if (dfa->has_plural_match)
718 match_ctx_clear_flag (&mctx);
719 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
720 if (BE (sifted_states == NULL, 0))
721 return REG_ESPACE;
722 if (dfa->nbackref)
724 lim_states = calloc (sizeof (re_dfastate_t *),
725 match_last + 1);
726 if (BE (lim_states == NULL, 0))
727 return REG_ESPACE;
729 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
730 mctx.match_last, 0);
731 err = sift_states_backward (preg, &mctx, &sctx);
732 if (BE (err != REG_NOERROR, 0))
733 return err;
734 if (lim_states != NULL)
736 err = merge_state_array (dfa, sifted_states, lim_states,
737 match_last + 1);
738 if (BE (err != REG_NOERROR, 0))
739 return err;
740 re_free (lim_states);
742 re_node_set_free (&sctx.limits);
743 re_free (mctx.state_log);
744 mctx.state_log = sifted_states;
746 mctx.last_node = halt_node;
747 err = set_regs (preg, &mctx, nmatch, pmatch,
748 dfa->has_plural_match && dfa->nbackref > 0);
749 if (BE (err != REG_NOERROR, 0))
750 return err;
753 /* At last, add the offset to the each registers, since we slided
754 the buffers so that We can assume that the matching starts from 0. */
755 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
756 if (pmatch[reg_idx].rm_so != -1)
758 pmatch[reg_idx].rm_so += match_first;
759 pmatch[reg_idx].rm_eo += match_first;
763 re_free (mctx.state_log);
764 if (dfa->nbackref)
765 match_ctx_free (&mctx);
766 re_string_destruct (&input);
768 return (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
771 /* Acquire an initial state and return it.
772 We must select appropriate initial state depending on the context,
773 since initial states may have constraints like "\<", "^", etc.. */
775 static inline re_dfastate_t *
776 acquire_init_state_context (err, preg, mctx, idx)
777 reg_errcode_t *err;
778 const regex_t *preg;
779 const re_match_context_t *mctx;
780 int idx;
782 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
784 *err = REG_NOERROR;
785 if (dfa->init_state->has_constraint)
787 unsigned int context;
788 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
789 preg->newline_anchor);
790 if (IS_WORD_CONTEXT (context))
791 return dfa->init_state_word;
792 else if (IS_ORDINARY_CONTEXT (context))
793 return dfa->init_state;
794 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
795 return dfa->init_state_begbuf;
796 else if (IS_NEWLINE_CONTEXT (context))
797 return dfa->init_state_nl;
798 else if (IS_BEGBUF_CONTEXT (context))
800 /* It is relatively rare case, then calculate on demand. */
801 return re_acquire_state_context (err, dfa,
802 dfa->init_state->entrance_nodes,
803 context);
805 else
806 /* Must not happen? */
807 return dfa->init_state;
809 else
810 return dfa->init_state;
813 /* Check whether the regular expression match input string INPUT or not,
814 and return the index where the matching end, return -1 if not match,
815 or return -2 in case of an error.
816 FL_SEARCH means we must search where the matching starts,
817 FL_LONGEST_MATCH means we want the POSIX longest matching.
818 Note that the matcher assume that the maching starts from the current
819 index of the buffer. */
821 static int
822 check_matching (preg, mctx, fl_search, fl_longest_match)
823 const regex_t *preg;
824 re_match_context_t *mctx;
825 int fl_search, fl_longest_match;
827 reg_errcode_t err;
828 int match = 0;
829 int match_last = -1;
830 int cur_str_idx = re_string_cur_idx (mctx->input);
831 re_dfastate_t *cur_state;
833 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
834 /* An initial state must not be NULL(invalid state). */
835 if (BE (cur_state == NULL, 0))
836 return -2;
837 if (mctx->state_log != NULL)
838 mctx->state_log[cur_str_idx] = cur_state;
840 if (cur_state->has_backref)
842 int i;
843 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
844 for (i = 0; i < cur_state->nodes.nelem; ++i)
846 re_token_type_t type;
847 int node = cur_state->nodes.elems[i];
848 int entity = (dfa->nodes[node].type != OP_CONTEXT_NODE ? node
849 : dfa->nodes[node].opr.ctx_info->entity);
850 type = dfa->nodes[entity].type;
851 if (type == OP_BACK_REF)
853 int clexp_idx;
854 for (clexp_idx = 0; clexp_idx < cur_state->nodes.nelem;
855 ++clexp_idx)
857 re_token_t *clexp_node;
858 clexp_node = dfa->nodes + cur_state->nodes.elems[clexp_idx];
859 if (clexp_node->type == OP_CLOSE_SUBEXP
860 && clexp_node->opr.idx + 1== dfa->nodes[entity].opr.idx)
862 err = match_ctx_add_entry (mctx, node, 0, 0, 0);
863 if (BE (err != REG_NOERROR, 0))
864 return -2;
865 break;
872 /* If the RE accepts NULL string. */
873 if (cur_state->halt)
875 if (!cur_state->has_constraint
876 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
878 if (!fl_longest_match)
879 return cur_str_idx;
880 else
882 match_last = cur_str_idx;
883 match = 1;
888 while (!re_string_eoi (mctx->input))
890 cur_state = transit_state (&err, preg, mctx, cur_state,
891 fl_search && !match);
892 if (cur_state == NULL) /* Reached at the invalid state or an error. */
894 cur_str_idx = re_string_cur_idx (mctx->input);
895 if (BE (err != REG_NOERROR, 0))
896 return -2;
897 if (fl_search && !match)
899 /* Restart from initial state, since we are searching
900 the point from where matching start. */
901 #ifdef RE_ENABLE_I18N
902 if (MB_CUR_MAX == 1
903 || re_string_first_byte (mctx->input, cur_str_idx))
904 #endif /* RE_ENABLE_I18N */
905 cur_state = acquire_init_state_context (&err, preg, mctx,
906 cur_str_idx);
907 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
908 return -2;
909 if (mctx->state_log != NULL)
910 mctx->state_log[cur_str_idx] = cur_state;
912 else if (!fl_longest_match && match)
913 break;
914 else /* (fl_longest_match && match) || (!fl_search && !match) */
916 if (mctx->state_log == NULL)
917 break;
918 else
920 int max = mctx->state_log_top;
921 for (; cur_str_idx <= max; ++cur_str_idx)
922 if (mctx->state_log[cur_str_idx] != NULL)
923 break;
924 if (cur_str_idx > max)
925 break;
930 if (cur_state != NULL && cur_state->halt)
932 /* Reached at a halt state.
933 Check the halt state can satisfy the current context. */
934 if (!cur_state->has_constraint
935 || check_halt_state_context (preg, cur_state, mctx,
936 re_string_cur_idx (mctx->input)))
938 /* We found an appropriate halt state. */
939 match_last = re_string_cur_idx (mctx->input);
940 match = 1;
941 if (!fl_longest_match)
942 break;
946 return match_last;
949 /* Check NODE match the current context. */
951 static int check_halt_node_context (dfa, node, context)
952 const re_dfa_t *dfa;
953 int node;
954 unsigned int context;
956 int entity;
957 re_token_type_t type = dfa->nodes[node].type;
958 if (type == END_OF_RE)
959 return 1;
960 if (type != OP_CONTEXT_NODE)
961 return 0;
962 entity = dfa->nodes[node].opr.ctx_info->entity;
963 if (dfa->nodes[entity].type != END_OF_RE
964 || NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, context))
965 return 0;
966 return 1;
969 /* Check the halt state STATE match the current context.
970 Return 0 if not match, if the node, STATE has, is a halt node and
971 match the context, return the node. */
973 static int
974 check_halt_state_context (preg, state, mctx, idx)
975 const regex_t *preg;
976 const re_dfastate_t *state;
977 const re_match_context_t *mctx;
978 int idx;
980 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
981 int i;
982 unsigned int context;
983 #ifdef DEBUG
984 assert (state->halt);
985 #endif
986 context = re_string_context_at (mctx->input, idx, mctx->eflags,
987 preg->newline_anchor);
988 for (i = 0; i < state->nodes.nelem; ++i)
989 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
990 return state->nodes.elems[i];
991 return 0;
994 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
995 corresponding to the DFA).
996 Return the destination node, and update EPS_VIA_NODES, return -1 in case
997 of errors. */
999 static int
1000 proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
1001 const regex_t *preg;
1002 regmatch_t *regs;
1003 const re_match_context_t *mctx;
1004 int nregs, *pidx, node;
1005 re_node_set *eps_via_nodes;
1006 struct re_fail_stack_t *fs;
1008 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1009 int i, err, dest_node, cur_entity;
1010 dest_node = -1;
1011 cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE)
1012 ? dfa->nodes[node].opr.ctx_info->entity : node);
1013 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1015 int ndest, dest_nodes[2], dest_entities[2];
1016 err = re_node_set_insert (eps_via_nodes, node);
1017 if (BE (err < 0, 0))
1018 return -1;
1019 /* Pick up valid destinations. */
1020 for (ndest = 0, i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
1022 int candidate = mctx->state_log[*pidx]->nodes.elems[i];
1023 int entity;
1024 entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE)
1025 ? dfa->nodes[candidate].opr.ctx_info->entity : candidate);
1026 if (!re_node_set_contains (dfa->edests + node, entity))
1027 continue;
1028 dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1029 dest_entities[0] = (ndest == 0) ? entity : dest_entities[0];
1030 dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1031 dest_entities[1] = (ndest == 1) ? entity : dest_entities[1];
1032 ++ndest;
1034 if (ndest <= 1)
1035 return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1036 if (dest_entities[0] > dest_entities[1])
1038 int swap_work = dest_nodes[0];
1039 dest_nodes[0] = dest_nodes[1];
1040 dest_nodes[1] = swap_work;
1042 /* In order to avoid infinite loop like "(a*)*". */
1043 if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1044 return dest_nodes[1];
1045 if (fs != NULL)
1046 push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
1047 return dest_nodes[0];
1049 else
1051 int naccepted = 0, entity = node;
1052 re_token_type_t type = dfa->nodes[node].type;
1053 if (type == OP_CONTEXT_NODE)
1055 entity = dfa->nodes[node].opr.ctx_info->entity;
1056 type = dfa->nodes[entity].type;
1059 #ifdef RE_ENABLE_I18N
1060 if (ACCEPT_MB_NODE (type))
1061 naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx);
1062 else
1063 #endif /* RE_ENABLE_I18N */
1064 if (type == OP_BACK_REF)
1066 int subexp_idx = dfa->nodes[entity].opr.idx;
1067 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1068 if (fs != NULL)
1070 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1071 return -1;
1072 else if (naccepted)
1074 char *buf = re_string_get_buffer (mctx->input);
1075 if (strncmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1076 naccepted) != 0)
1077 return -1;
1081 if (naccepted == 0)
1083 err = re_node_set_insert (eps_via_nodes, node);
1084 if (BE (err < 0, 0))
1085 return -2;
1086 dest_node = dfa->nexts[node];
1087 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1088 dest_node))
1089 return dest_node;
1090 for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
1092 dest_node = mctx->state_log[*pidx]->nodes.elems[i];
1093 if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE
1094 && (dfa->nexts[node]
1095 == dfa->nodes[dest_node].opr.ctx_info->entity)))
1096 return dest_node;
1101 if (naccepted != 0
1102 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1104 dest_node = dfa->nexts[node];
1105 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1106 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1107 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1108 dest_node)))
1109 return -1;
1110 re_node_set_empty (eps_via_nodes);
1111 return dest_node;
1114 return -1;
1117 static reg_errcode_t
1118 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1119 struct re_fail_stack_t *fs;
1120 int str_idx, *dests, nregs;
1121 regmatch_t *regs;
1122 re_node_set *eps_via_nodes;
1124 reg_errcode_t err;
1125 int num = fs->num++;
1126 if (fs->num == fs->alloc)
1128 fs->alloc *= 2;
1129 fs->stack = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1130 * fs->alloc));
1131 if (fs->stack == NULL)
1132 return REG_ESPACE;
1134 fs->stack[num].idx = str_idx;
1135 fs->stack[num].node = dests[1];
1136 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1137 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1138 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1139 return err;
1142 static int
1143 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1144 struct re_fail_stack_t *fs;
1145 int *pidx, nregs;
1146 regmatch_t *regs;
1147 re_node_set *eps_via_nodes;
1149 int num = --fs->num;
1150 assert (num >= 0);
1151 *pidx = fs->stack[num].idx;
1152 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1153 re_node_set_free (eps_via_nodes);
1154 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1155 return fs->stack[num].node;
1158 /* Set the positions where the subexpressions are starts/ends to registers
1159 PMATCH.
1160 Note: We assume that pmatch[0] is already set, and
1161 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1163 static reg_errcode_t
1164 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1165 const regex_t *preg;
1166 const re_match_context_t *mctx;
1167 size_t nmatch;
1168 regmatch_t *pmatch;
1169 int fl_backtrack;
1171 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1172 int idx, cur_node, real_nmatch;
1173 re_node_set eps_via_nodes;
1174 struct re_fail_stack_t *fs;
1175 struct re_fail_stack_t fs_body = {0, 2, NULL};
1176 #ifdef DEBUG
1177 assert (nmatch > 1);
1178 assert (mctx->state_log != NULL);
1179 #endif
1180 if (fl_backtrack)
1182 fs = &fs_body;
1183 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1185 else
1186 fs = NULL;
1187 cur_node = dfa->init_node;
1188 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1189 re_node_set_init_empty (&eps_via_nodes);
1190 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1192 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1193 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1195 int reg_idx;
1196 if (fs)
1198 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1199 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1200 break;
1201 if (reg_idx == nmatch)
1202 return REG_NOERROR;
1203 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1204 &eps_via_nodes);
1206 else
1207 return REG_NOERROR;
1210 /* Proceed to next node. */
1211 cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
1212 &eps_via_nodes, fs);
1214 if (BE (cur_node < 0, 0))
1216 if (cur_node == -2)
1217 return REG_ESPACE;
1218 if (fs)
1219 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1220 &eps_via_nodes);
1221 else
1222 return REG_NOMATCH;
1225 re_node_set_free (&eps_via_nodes);
1226 return REG_NOERROR;
1229 static void
1230 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1231 re_dfa_t *dfa;
1232 regmatch_t *pmatch;
1233 int cur_node, cur_idx, nmatch;
1235 int type = dfa->nodes[cur_node].type;
1236 int reg_num;
1237 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1238 return;
1239 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1240 if (reg_num >= nmatch)
1241 return;
1242 if (type == OP_OPEN_SUBEXP)
1244 /* We are at the first node of this sub expression. */
1245 pmatch[reg_num].rm_so = cur_idx;
1246 pmatch[reg_num].rm_eo = -1;
1248 else if (type == OP_CLOSE_SUBEXP)
1249 /* We are at the first node of this sub expression. */
1250 pmatch[reg_num].rm_eo = cur_idx;
1253 #define NUMBER_OF_STATE 1
1255 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1256 and sift the nodes in each states according to the following rules.
1257 Updated state_log will be wrote to STATE_LOG.
1259 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1260 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1261 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1262 the LAST_NODE, we throw away the node `a'.
1263 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1264 string `s' and transit to `b':
1265 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1266 away the node `a'.
1267 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1268 throwed away, we throw away the node `a'.
1269 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1270 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1271 node `a'.
1272 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1273 we throw away the node `a'. */
1275 #define STATE_NODE_CONTAINS(state,node) \
1276 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1278 static reg_errcode_t
1279 sift_states_backward (preg, mctx, sctx)
1280 const regex_t *preg;
1281 re_match_context_t *mctx;
1282 re_sift_context_t *sctx;
1284 reg_errcode_t err;
1285 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1286 int null_cnt = 0;
1287 int str_idx = sctx->last_str_idx;
1288 re_node_set cur_dest;
1289 re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1291 #ifdef DEBUG
1292 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1293 #endif
1294 cur_src = &mctx->state_log[str_idx]->nodes;
1296 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1297 transit to the last_node and the last_node itself. */
1298 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1299 if (BE (err != REG_NOERROR, 0))
1300 return err;
1301 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1302 if (BE (err != REG_NOERROR, 0))
1303 return err;
1305 /* Then check each states in the state_log. */
1306 while (str_idx > 0)
1308 int i, ret;
1309 /* Update counters. */
1310 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1311 if (null_cnt > mctx->max_mb_elem_len)
1313 memset (sctx->sifted_states, '\0',
1314 sizeof (re_dfastate_t *) * str_idx);
1315 return REG_NOERROR;
1317 re_node_set_empty (&cur_dest);
1318 --str_idx;
1319 cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1320 : &mctx->state_log[str_idx]->nodes);
1322 /* Then build the next sifted state.
1323 We build the next sifted state on `cur_dest', and update
1324 `sifted_states[str_idx]' with `cur_dest'.
1325 Note:
1326 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1327 `cur_src' points the node_set of the old `state_log[str_idx]'. */
1328 for (i = 0; i < cur_src->nelem; i++)
1330 int prev_node = cur_src->elems[i];
1331 int entity = prev_node;
1332 int naccepted = 0;
1333 re_token_type_t type = dfa->nodes[prev_node].type;
1335 if (IS_EPSILON_NODE(type))
1336 continue;
1337 if (type == OP_CONTEXT_NODE)
1339 entity = dfa->nodes[prev_node].opr.ctx_info->entity;
1340 type = dfa->nodes[entity].type;
1342 #ifdef RE_ENABLE_I18N
1343 /* If the node may accept `multi byte'. */
1344 if (ACCEPT_MB_NODE (type))
1345 naccepted = sift_states_iter_mb (preg, mctx, sctx, entity, str_idx,
1346 sctx->last_str_idx);
1348 #endif /* RE_ENABLE_I18N */
1349 /* We don't check backreferences here.
1350 See update_cur_sifted_state(). */
1352 if (!naccepted
1353 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1354 str_idx)
1355 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1356 dfa->nexts[prev_node]))
1357 naccepted = 1;
1359 if (naccepted == 0)
1360 continue;
1362 if (sctx->limits.nelem)
1364 int to_idx = str_idx + naccepted;
1365 if (check_dst_limits (dfa, &sctx->limits, mctx,
1366 dfa->nexts[prev_node], to_idx,
1367 prev_node, str_idx))
1368 continue;
1370 ret = re_node_set_insert (&cur_dest, prev_node);
1371 if (BE (ret == -1, 0))
1372 return err;
1375 /* Add all the nodes which satisfy the following conditions:
1376 - It can epsilon transit to a node in CUR_DEST.
1377 - It is in CUR_SRC.
1378 And update state_log. */
1379 err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1380 if (BE (err != REG_NOERROR, 0))
1381 return err;
1384 re_node_set_free (&cur_dest);
1385 return REG_NOERROR;
1388 /* Helper functions. */
1390 static inline reg_errcode_t
1391 clean_state_log_if_need (mctx, next_state_log_idx)
1392 re_match_context_t *mctx;
1393 int next_state_log_idx;
1395 int top = mctx->state_log_top;
1397 if (next_state_log_idx >= mctx->input->bufs_len
1398 || (next_state_log_idx >= mctx->input->valid_len
1399 && mctx->input->valid_len < mctx->input->len))
1401 reg_errcode_t err;
1402 err = extend_buffers (mctx);
1403 if (BE (err != REG_NOERROR, 0))
1404 return err;
1407 if (top < next_state_log_idx)
1409 memset (mctx->state_log + top + 1, '\0',
1410 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1411 mctx->state_log_top = next_state_log_idx;
1413 return REG_NOERROR;
1416 static reg_errcode_t merge_state_array (dfa, dst, src, num)
1417 re_dfa_t *dfa;
1418 re_dfastate_t **dst;
1419 re_dfastate_t **src;
1420 int num;
1422 int st_idx;
1423 reg_errcode_t err;
1424 for (st_idx = 0; st_idx < num; ++st_idx)
1426 if (dst[st_idx] == NULL)
1427 dst[st_idx] = src[st_idx];
1428 else if (src[st_idx] != NULL)
1430 re_node_set merged_set;
1431 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1432 &src[st_idx]->nodes);
1433 if (BE (err != REG_NOERROR, 0))
1434 return err;
1435 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1436 if (BE (err != REG_NOERROR, 0))
1437 return err;
1438 re_node_set_free (&merged_set);
1441 return REG_NOERROR;
1444 static reg_errcode_t
1445 update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1446 const regex_t *preg;
1447 re_match_context_t *mctx;
1448 re_sift_context_t *sctx;
1449 int str_idx;
1450 re_node_set *dest_nodes;
1452 reg_errcode_t err;
1453 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1454 const re_node_set *candidates;
1455 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1456 : &mctx->state_log[str_idx]->nodes);
1458 /* At first, add the nodes which can epsilon transit to a node in
1459 DEST_NODE. */
1460 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1461 if (BE (err != REG_NOERROR, 0))
1462 return err;
1464 /* Then, check the limitations in the current sift_context. */
1465 if (sctx->limits.nelem)
1467 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1468 mctx->bkref_ents, str_idx);
1469 if (BE (err != REG_NOERROR, 0))
1470 return err;
1473 /* Update state_log. */
1474 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1475 if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1476 return err;
1478 /* If we are searching for the subexpression candidates.
1479 Note that we were from transit_state_bkref_loop() in this case. */
1480 if (sctx->check_subexp)
1482 err = search_subexp (preg, mctx, sctx, str_idx, dest_nodes);
1483 if (BE (err != REG_NOERROR, 0))
1484 return err;
1487 if ((mctx->state_log[str_idx] != NULL
1488 && mctx->state_log[str_idx]->has_backref))
1490 err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1491 if (BE (err != REG_NOERROR, 0))
1492 return err;
1494 return REG_NOERROR;
1497 static reg_errcode_t
1498 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1499 re_dfa_t *dfa;
1500 re_node_set *dest_nodes;
1501 const re_node_set *candidates;
1503 reg_errcode_t err;
1504 int src_idx;
1505 re_node_set src_copy;
1507 err = re_node_set_init_copy (&src_copy, dest_nodes);
1508 if (BE (err != REG_NOERROR, 0))
1509 return err;
1510 for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1512 err = re_node_set_add_intersect (dest_nodes, candidates,
1513 dfa->inveclosures
1514 + src_copy.elems[src_idx]);
1515 if (BE (err != REG_NOERROR, 0))
1516 return err;
1518 re_node_set_free (&src_copy);
1519 return REG_NOERROR;
1522 static reg_errcode_t
1523 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1524 re_dfa_t *dfa;
1525 int node;
1526 re_node_set *dest_nodes;
1527 const re_node_set *candidates;
1529 int ecl_idx;
1530 reg_errcode_t err;
1531 re_node_set *inv_eclosure = dfa->inveclosures + node;
1532 re_node_set except_nodes;
1533 re_node_set_init_empty (&except_nodes);
1534 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1536 int cur_node = inv_eclosure->elems[ecl_idx];
1537 if (cur_node == node)
1538 continue;
1539 if (dfa->edests[cur_node].nelem)
1541 int edst1 = dfa->edests[cur_node].elems[0];
1542 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1543 ? dfa->edests[cur_node].elems[1] : -1);
1544 if ((!re_node_set_contains (inv_eclosure, edst1)
1545 && re_node_set_contains (dest_nodes, edst1))
1546 || (edst2 > 0
1547 && !re_node_set_contains (inv_eclosure, edst2)
1548 && re_node_set_contains (dest_nodes, edst2)))
1550 err = re_node_set_add_intersect (&except_nodes, candidates,
1551 dfa->inveclosures + cur_node);
1552 if (BE (err != REG_NOERROR, 0))
1553 return err;
1557 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1559 int cur_node = inv_eclosure->elems[ecl_idx];
1560 if (!re_node_set_contains (&except_nodes, cur_node))
1562 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1563 re_node_set_remove_at (dest_nodes, idx);
1566 re_node_set_free (&except_nodes);
1567 return REG_NOERROR;
1570 static int
1571 check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1572 re_dfa_t *dfa;
1573 re_node_set *limits;
1574 re_match_context_t *mctx;
1575 int dst_node, dst_idx, src_node, src_idx;
1577 int lim_idx, src_pos, dst_pos;
1579 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1581 int bkref, subexp_idx/*, node_idx, cls_node*/;
1582 struct re_backref_cache_entry *ent;
1583 ent = mctx->bkref_ents + limits->elems[lim_idx];
1584 bkref = (dfa->nodes[ent->node].type == OP_CONTEXT_NODE
1585 ? dfa->nodes[ent->node].opr.ctx_info->entity : ent->node);
1586 subexp_idx = dfa->nodes[bkref].opr.idx - 1;
1588 dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1589 dfa->eclosures + dst_node,
1590 subexp_idx, dst_node, dst_idx);
1591 src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1592 dfa->eclosures + src_node,
1593 subexp_idx, src_node, src_idx);
1595 /* In case of:
1596 <src> <dst> ( <subexp> )
1597 ( <subexp> ) <src> <dst>
1598 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1599 if (src_pos == dst_pos)
1600 continue; /* This is unrelated limitation. */
1601 else
1602 return 1;
1604 return 0;
1607 static int
1608 check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
1609 str_idx)
1610 re_dfa_t *dfa;
1611 re_match_context_t *mctx;
1612 re_node_set *eclosures;
1613 int limit, subexp_idx, node, str_idx;
1615 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1616 int pos = (str_idx < lim->subexp_from ? -1
1617 : (lim->subexp_to < str_idx ? 1 : 0));
1618 if (pos == 0
1619 && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
1621 int node_idx;
1622 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1624 int node = eclosures->elems[node_idx];
1625 int entity = node;
1626 re_token_type_t type= dfa->nodes[node].type;
1627 if (type == OP_CONTEXT_NODE)
1629 entity = dfa->nodes[node].opr.ctx_info->entity;
1630 type = dfa->nodes[entity].type;
1632 if (type == OP_BACK_REF)
1634 int bi;
1635 for (bi = 0; bi < mctx->nbkref_ents; ++bi)
1637 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1638 if (ent->node == node && ent->subexp_from == ent->subexp_to
1639 && ent->str_idx == str_idx)
1641 int cpos, dst;
1642 dst = dfa->nexts[node];
1643 cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1644 dfa->eclosures + dst,
1645 subexp_idx, dst,
1646 str_idx);
1647 if ((str_idx == lim->subexp_from && cpos == -1)
1648 || (str_idx == lim->subexp_to && cpos == 0))
1649 return cpos;
1653 if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1654 && str_idx == lim->subexp_from)
1656 pos = -1;
1657 break;
1659 if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1660 && str_idx == lim->subexp_to)
1661 break;
1663 if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
1664 pos = 1;
1666 return pos;
1669 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1670 which are against limitations from DEST_NODES. */
1672 static reg_errcode_t
1673 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1674 re_dfa_t *dfa;
1675 re_node_set *dest_nodes;
1676 const re_node_set *candidates;
1677 re_node_set *limits;
1678 struct re_backref_cache_entry *bkref_ents;
1679 int str_idx;
1681 reg_errcode_t err;
1682 int node_idx, lim_idx;
1684 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1686 int bkref, subexp_idx;
1687 struct re_backref_cache_entry *ent;
1688 ent = bkref_ents + limits->elems[lim_idx];
1690 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1691 continue; /* This is unrelated limitation. */
1693 bkref = (dfa->nodes[ent->node].type == OP_CONTEXT_NODE
1694 ? dfa->nodes[ent->node].opr.ctx_info->entity : ent->node);
1695 subexp_idx = dfa->nodes[bkref].opr.idx - 1;
1697 if (ent->subexp_to == str_idx)
1699 int ops_node = -1;
1700 int cls_node = -1;
1701 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1703 int node = dest_nodes->elems[node_idx];
1704 re_token_type_t type= dfa->nodes[node].type;
1705 if (type == OP_OPEN_SUBEXP
1706 && subexp_idx == dfa->nodes[node].opr.idx)
1707 ops_node = node;
1708 else if (type == OP_CLOSE_SUBEXP
1709 && subexp_idx == dfa->nodes[node].opr.idx)
1710 cls_node = node;
1713 /* Check the limitation of the open subexpression. */
1714 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1715 if (ops_node >= 0)
1717 err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes,
1718 candidates);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1722 /* Check the limitation of the close subexpression. */
1723 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1725 int node = dest_nodes->elems[node_idx];
1726 if (!re_node_set_contains (dfa->inveclosures + node, cls_node)
1727 && !re_node_set_contains (dfa->eclosures + node, cls_node))
1729 /* It is against this limitation.
1730 Remove it form the current sifted state. */
1731 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1732 candidates);
1733 if (BE (err != REG_NOERROR, 0))
1734 return err;
1735 --node_idx;
1739 else /* (ent->subexp_to != str_idx) */
1741 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1743 int node = dest_nodes->elems[node_idx];
1744 re_token_type_t type= dfa->nodes[node].type;
1745 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1747 if (subexp_idx != dfa->nodes[node].opr.idx)
1748 continue;
1749 if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1750 || (type == OP_OPEN_SUBEXP))
1752 /* It is against this limitation.
1753 Remove it form the current sifted state. */
1754 err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1755 candidates);
1756 if (BE (err != REG_NOERROR, 0))
1757 return err;
1763 return REG_NOERROR;
1766 /* Search for the top (in case of sctx->check_subexp < 0) or the
1767 bottom (in case of sctx->check_subexp > 0) of the subexpressions
1768 which the backreference sctx->cur_bkref can match. */
1770 static reg_errcode_t
1771 search_subexp (preg, mctx, sctx, str_idx, dest_nodes)
1772 const regex_t *preg;
1773 re_match_context_t *mctx;
1774 re_sift_context_t *sctx;
1775 int str_idx;
1776 re_node_set *dest_nodes;
1778 reg_errcode_t err;
1779 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1780 re_sift_context_t local_sctx;
1781 int node_idx, node;
1782 const re_node_set *candidates;
1783 re_dfastate_t **lim_states = NULL;
1784 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1785 : &mctx->state_log[str_idx]->nodes);
1786 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1788 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1790 re_token_type_t type;
1791 int entity;
1792 node = dest_nodes->elems[node_idx];
1793 type = dfa->nodes[node].type;
1794 entity = (type != OP_CONTEXT_NODE ? node
1795 : dfa->nodes[node].opr.ctx_info->entity);
1796 type = (type != OP_CONTEXT_NODE ? type : dfa->nodes[entity].type);
1798 if (type == OP_CLOSE_SUBEXP
1799 && sctx->check_subexp == dfa->nodes[node].opr.idx + 1)
1801 re_dfastate_t *cur_state;
1802 /* Found the bottom of the subexpression, then search for the
1803 top of it. */
1804 if (local_sctx.sifted_states == NULL)
1806 /* It hasn't been initialized yet, initialize it now. */
1807 local_sctx = *sctx;
1808 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
1809 if (BE (err != REG_NOERROR, 0))
1810 return err;
1812 local_sctx.check_subexp = -sctx->check_subexp;
1813 local_sctx.limited_states = sctx->limited_states;
1814 local_sctx.last_node = node;
1815 local_sctx.last_str_idx = local_sctx.cls_subexp_idx = str_idx;
1816 cur_state = local_sctx.sifted_states[str_idx];
1817 err = sift_states_backward (preg, mctx, &local_sctx);
1818 local_sctx.sifted_states[str_idx] = cur_state;
1819 if (BE (err != REG_NOERROR, 0))
1820 return err;
1821 /* There must not 2 same node in a node set. */
1822 break;
1824 else if (type == OP_OPEN_SUBEXP
1825 && -sctx->check_subexp == dfa->nodes[node].opr.idx + 1)
1827 /* Found the top of the subexpression, check that the
1828 backreference can match the input string. */
1829 char *buf;
1830 int dest_str_idx;
1831 int bkref_str_idx = re_string_cur_idx (mctx->input);
1832 int subexp_len = sctx->cls_subexp_idx - str_idx;
1833 if (subexp_len < 0 || bkref_str_idx + subexp_len > mctx->input->len)
1834 break;
1836 if (bkref_str_idx + subexp_len > mctx->input->valid_len
1837 && mctx->input->valid_len < mctx->input->len)
1839 reg_errcode_t err;
1840 err = extend_buffers (mctx);
1841 if (BE (err != REG_NOERROR, 0))
1842 return err;
1844 buf = (char *) re_string_get_buffer (mctx->input);
1845 if (strncmp (buf + str_idx, buf + bkref_str_idx, subexp_len) != 0)
1846 break;
1848 if (sctx->limits.nelem && str_idx > 0)
1850 re_dfastate_t *cur_state = sctx->sifted_states[str_idx];
1851 if (lim_states == NULL)
1853 lim_states = re_malloc (re_dfastate_t *, str_idx + 1);
1855 if (local_sctx.sifted_states == NULL)
1857 /* It hasn't been initialized yet, initialize it now. */
1858 local_sctx = *sctx;
1859 if (BE (lim_states == NULL, 0))
1860 return REG_ESPACE;
1861 err = re_node_set_init_copy (&local_sctx.limits,
1862 &sctx->limits);
1863 if (BE (err != REG_NOERROR, 0))
1864 return err;
1866 local_sctx.check_subexp = 0;
1867 local_sctx.last_node = node;
1868 local_sctx.last_str_idx = str_idx;
1869 local_sctx.limited_states = lim_states;
1870 memset (lim_states, '\0',
1871 sizeof (re_dfastate_t*) * (str_idx + 1));
1872 err = sift_states_backward (preg, mctx, &local_sctx);
1873 if (BE (err != REG_NOERROR, 0))
1874 return err;
1875 if (local_sctx.sifted_states[0] == NULL
1876 && local_sctx.limited_states[0] == NULL)
1878 sctx->sifted_states[str_idx] = cur_state;
1879 break;
1881 sctx->sifted_states[str_idx] = cur_state;
1883 /* Successfully matched, add a new cache entry. */
1884 dest_str_idx = bkref_str_idx + subexp_len;
1885 err = match_ctx_add_entry (mctx, sctx->cur_bkref, bkref_str_idx,
1886 str_idx, sctx->cls_subexp_idx);
1887 if (BE (err != REG_NOERROR, 0))
1888 return err;
1889 err = clean_state_log_if_need (mctx, dest_str_idx);
1890 if (BE (err != REG_NOERROR, 0))
1891 return err;
1892 break;
1896 /* Remove the top/bottom of the sub expression we processed. */
1897 if (node_idx < dest_nodes->nelem)
1899 err = sub_epsilon_src_nodes(dfa, node, dest_nodes, candidates);
1900 if (BE (err != REG_NOERROR, 0))
1901 return err;
1902 /* Update state_log. */
1903 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1904 if (BE (err != REG_NOERROR, 0))
1905 return err;
1908 if (local_sctx.sifted_states != NULL)
1909 re_node_set_free (&local_sctx.limits);
1910 if (lim_states != NULL)
1911 re_free (lim_states);
1912 return REG_NOERROR;
1915 static reg_errcode_t
1916 sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1917 const regex_t *preg;
1918 re_match_context_t *mctx;
1919 re_sift_context_t *sctx;
1920 int str_idx;
1921 re_node_set *dest_nodes;
1923 reg_errcode_t err;
1924 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1925 int node_idx, node;
1926 re_sift_context_t local_sctx;
1927 const re_node_set *candidates;
1928 candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1929 : &mctx->state_log[str_idx]->nodes);
1930 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1932 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1934 int entity;
1935 int cur_bkref_idx = re_string_cur_idx (mctx->input);
1936 re_token_type_t type;
1937 node = candidates->elems[node_idx];
1938 type = dfa->nodes[node].type;
1939 entity = (type != OP_CONTEXT_NODE ? node
1940 : dfa->nodes[node].opr.ctx_info->entity);
1941 type = (type != OP_CONTEXT_NODE ? type : dfa->nodes[entity].type);
1942 if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
1943 continue;
1944 /* Avoid infinite loop for the REs like "()\1+". */
1945 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
1946 continue;
1947 if (type == OP_BACK_REF)
1949 int enabled_idx;
1950 for (enabled_idx = 0; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
1952 int disabled_idx, subexp_len, to_idx;
1953 struct re_backref_cache_entry *entry;
1954 entry = mctx->bkref_ents + enabled_idx;
1955 subexp_len = entry->subexp_to - entry->subexp_from;
1956 to_idx = str_idx + subexp_len;
1958 if (entry->node != node || entry->str_idx != str_idx
1959 || to_idx > sctx->last_str_idx
1960 || sctx->sifted_states[to_idx] == NULL)
1961 continue;
1962 if (!STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1963 dfa->nexts[node]))
1965 int dst_idx;
1966 re_node_set *dsts = &sctx->sifted_states[to_idx]->nodes;
1967 for (dst_idx = 0; dst_idx < dsts->nelem; ++dst_idx)
1969 int dst_node = dsts->elems[dst_idx];
1970 if (dfa->nodes[dst_node].type == OP_CONTEXT_NODE
1971 && (dfa->nodes[dst_node].opr.ctx_info->entity
1972 == dfa->nexts[node]))
1973 break;
1975 if (dst_idx == dsts->nelem)
1976 continue;
1979 if (check_dst_limits (dfa, &sctx->limits, mctx, node,
1980 str_idx, dfa->nexts[node], to_idx))
1981 continue;
1982 if (sctx->check_subexp == dfa->nodes[entity].opr.idx)
1984 char *buf;
1985 buf = (char *) re_string_get_buffer (mctx->input);
1986 if (strncmp (buf + entry->subexp_from,
1987 buf + cur_bkref_idx, subexp_len) != 0)
1988 continue;
1989 err = match_ctx_add_entry (mctx, sctx->cur_bkref,
1990 cur_bkref_idx, entry->subexp_from,
1991 entry->subexp_to);
1992 if (BE (err != REG_NOERROR, 0))
1993 return err;
1994 err = clean_state_log_if_need (mctx, cur_bkref_idx
1995 + subexp_len);
1996 if (BE (err != REG_NOERROR, 0))
1997 return err;
1999 else
2001 re_dfastate_t *cur_state;
2002 entry->flag = 0;
2003 for (disabled_idx = enabled_idx + 1;
2004 disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2006 struct re_backref_cache_entry *entry2;
2007 entry2 = mctx->bkref_ents + disabled_idx;
2008 if (entry2->node != node || entry2->str_idx != str_idx)
2009 continue;
2010 entry2->flag = 1;
2013 if (local_sctx.sifted_states == NULL)
2015 local_sctx = *sctx;
2016 err = re_node_set_init_copy (&local_sctx.limits,
2017 &sctx->limits);
2018 if (BE (err != REG_NOERROR, 0))
2019 return err;
2021 local_sctx.last_node = node;
2022 local_sctx.last_str_idx = str_idx;
2023 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2024 if (BE (err < 0, 0))
2025 return REG_ESPACE;
2026 cur_state = local_sctx.sifted_states[str_idx];
2027 err = sift_states_backward (preg, mctx, &local_sctx);
2028 if (BE (err != REG_NOERROR, 0))
2029 return err;
2030 if (sctx->limited_states != NULL)
2032 err = merge_state_array (dfa, sctx->limited_states,
2033 local_sctx.sifted_states,
2034 str_idx + 1);
2035 if (BE (err != REG_NOERROR, 0))
2036 return err;
2038 local_sctx.sifted_states[str_idx] = cur_state;
2039 re_node_set_remove_at (&local_sctx.limits,
2040 local_sctx.limits.nelem - 1);
2041 entry->flag = 1;
2044 for (enabled_idx = 0; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2046 struct re_backref_cache_entry *entry;
2047 entry = mctx->bkref_ents + enabled_idx;
2048 if (entry->node == node && entry->str_idx == str_idx)
2049 entry->flag = 0;
2053 if (local_sctx.sifted_states != NULL)
2055 re_node_set_free (&local_sctx.limits);
2058 return REG_NOERROR;
2062 #ifdef RE_ENABLE_I18N
2063 static int
2064 sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2065 const regex_t *preg;
2066 const re_match_context_t *mctx;
2067 re_sift_context_t *sctx;
2068 int node_idx, str_idx, max_str_idx;
2070 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2071 int naccepted;
2072 /* Check the node can accept `multi byte'. */
2073 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2074 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2075 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2076 dfa->nexts[node_idx]))
2077 /* The node can't accept the `multi byte', or the
2078 destination was already throwed away, then the node
2079 could't accept the current input `multi byte'. */
2080 naccepted = 0;
2081 /* Otherwise, it is sure that the node could accept
2082 `naccepted' bytes input. */
2083 return naccepted;
2085 #endif /* RE_ENABLE_I18N */
2088 /* Functions for state transition. */
2090 /* Return the next state to which the current state STATE will transit by
2091 accepting the current input byte, and update STATE_LOG if necessary.
2092 If STATE can accept a multibyte char/collating element/back reference
2093 update the destination of STATE_LOG. */
2095 static re_dfastate_t *
2096 transit_state (err, preg, mctx, state, fl_search)
2097 reg_errcode_t *err;
2098 const regex_t *preg;
2099 re_match_context_t *mctx;
2100 re_dfastate_t *state;
2101 int fl_search;
2103 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104 re_dfastate_t **trtable, *next_state;
2105 unsigned char ch;
2107 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2108 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
2109 && mctx->input->valid_len < mctx->input->len))
2111 *err = extend_buffers (mctx);
2112 if (BE (*err != REG_NOERROR, 0))
2113 return NULL;
2116 *err = REG_NOERROR;
2117 if (state == NULL)
2119 next_state = state;
2120 re_string_skip_bytes (mctx->input, 1);
2122 else
2124 #ifdef RE_ENABLE_I18N
2125 /* If the current state can accept multibyte. */
2126 if (state->accept_mb)
2128 *err = transit_state_mb (preg, state, mctx);
2129 if (BE (*err != REG_NOERROR, 0))
2130 return NULL;
2132 #endif /* RE_ENABLE_I18N */
2134 /* Then decide the next state with the single byte. */
2135 if (1)
2137 /* Use transition table */
2138 ch = re_string_fetch_byte (mctx->input);
2139 trtable = fl_search ? state->trtable_search : state->trtable;
2140 if (trtable == NULL)
2142 trtable = build_trtable (preg, state, fl_search);
2143 if (fl_search)
2144 state->trtable_search = trtable;
2145 else
2146 state->trtable = trtable;
2148 next_state = trtable[ch];
2150 else
2152 /* don't use transition table */
2153 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
2154 if (BE (next_state == NULL && err != REG_NOERROR, 0))
2155 return NULL;
2159 /* Update the state_log if we need. */
2160 if (mctx->state_log != NULL)
2162 int cur_idx = re_string_cur_idx (mctx->input);
2163 if (cur_idx > mctx->state_log_top)
2165 mctx->state_log[cur_idx] = next_state;
2166 mctx->state_log_top = cur_idx;
2168 else if (mctx->state_log[cur_idx] == 0)
2170 mctx->state_log[cur_idx] = next_state;
2172 else
2174 re_dfastate_t *pstate;
2175 unsigned int context;
2176 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2177 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2178 the destination of a multibyte char/collating element/
2179 back reference. Then the next state is the union set of
2180 these destinations and the results of the transition table. */
2181 pstate = mctx->state_log[cur_idx];
2182 log_nodes = pstate->entrance_nodes;
2183 if (next_state != NULL)
2185 table_nodes = next_state->entrance_nodes;
2186 *err = re_node_set_init_union (&next_nodes, table_nodes,
2187 log_nodes);
2188 if (BE (*err != REG_NOERROR, 0))
2189 return NULL;
2191 else
2192 next_nodes = *log_nodes;
2193 /* Note: We already add the nodes of the initial state,
2194 then we don't need to add them here. */
2196 context = re_string_context_at (mctx->input,
2197 re_string_cur_idx (mctx->input) - 1,
2198 mctx->eflags, preg->newline_anchor);
2199 next_state = mctx->state_log[cur_idx]
2200 = re_acquire_state_context (err, dfa, &next_nodes, context);
2201 /* We don't need to check errors here, since the return value of
2202 this function is next_state and ERR is already set. */
2204 if (table_nodes != NULL)
2205 re_node_set_free (&next_nodes);
2207 /* If the next state has back references. */
2208 if (next_state != NULL && next_state->has_backref)
2210 *err = transit_state_bkref (preg, next_state, mctx);
2211 if (BE (*err != REG_NOERROR, 0))
2212 return NULL;
2213 next_state = mctx->state_log[cur_idx];
2216 return next_state;
2219 /* Helper functions for transit_state. */
2221 /* Return the next state to which the current state STATE will transit by
2222 accepting the current input byte. */
2224 static re_dfastate_t *
2225 transit_state_sb (err, preg, state, fl_search, mctx)
2226 reg_errcode_t *err;
2227 const regex_t *preg;
2228 re_dfastate_t *state;
2229 int fl_search;
2230 re_match_context_t *mctx;
2232 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2233 re_node_set next_nodes;
2234 re_dfastate_t *next_state;
2235 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
2236 unsigned int context;
2238 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2239 if (BE (*err != REG_NOERROR, 0))
2240 return NULL;
2241 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2243 int cur_node = state->nodes.elems[node_cnt];
2244 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
2246 *err = re_node_set_merge (&next_nodes,
2247 dfa->eclosures + dfa->nexts[cur_node]);
2248 if (BE (*err != REG_NOERROR, 0))
2249 return NULL;
2252 if (fl_search)
2254 #ifdef RE_ENABLE_I18N
2255 int not_initial = 0;
2256 if (MB_CUR_MAX > 1)
2257 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
2258 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
2260 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
2261 break;
2263 if (!not_initial)
2264 #endif
2266 *err = re_node_set_merge (&next_nodes,
2267 dfa->init_state->entrance_nodes);
2268 if (BE (*err != REG_NOERROR, 0))
2269 return NULL;
2272 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
2273 preg->newline_anchor);
2274 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2275 /* We don't need to check errors here, since the return value of
2276 this function is next_state and ERR is already set. */
2278 re_node_set_free (&next_nodes);
2279 re_string_skip_bytes (mctx->input, 1);
2280 return next_state;
2283 #ifdef RE_ENABLE_I18N
2284 static reg_errcode_t
2285 transit_state_mb (preg, pstate, mctx)
2286 const regex_t *preg;
2287 re_dfastate_t *pstate;
2288 re_match_context_t *mctx;
2290 reg_errcode_t err;
2291 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2292 int i;
2294 for (i = 0; i < pstate->nodes.nelem; ++i)
2296 re_node_set dest_nodes, *new_nodes;
2297 int cur_node_idx = pstate->nodes.elems[i];
2298 int naccepted = 0, dest_idx;
2299 unsigned int context;
2300 re_dfastate_t *dest_state;
2302 if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE)
2304 context = re_string_context_at (mctx->input,
2305 re_string_cur_idx (mctx->input),
2306 mctx->eflags, preg->newline_anchor);
2307 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2308 context))
2309 continue;
2310 cur_node_idx = dfa->nodes[cur_node_idx].opr.ctx_info->entity;
2313 /* How many bytes the node can accepts? */
2314 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2315 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2316 re_string_cur_idx (mctx->input));
2317 if (naccepted == 0)
2318 continue;
2320 /* The node can accepts `naccepted' bytes. */
2321 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
2322 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2323 : mctx->max_mb_elem_len);
2324 err = clean_state_log_if_need (mctx, dest_idx);
2325 if (BE (err != REG_NOERROR, 0))
2326 return err;
2327 #ifdef DEBUG
2328 assert (dfa->nexts[cur_node_idx] != -1);
2329 #endif
2330 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2331 then we use pstate->nodes.elems[i] instead. */
2332 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2334 dest_state = mctx->state_log[dest_idx];
2335 if (dest_state == NULL)
2336 dest_nodes = *new_nodes;
2337 else
2339 err = re_node_set_init_union (&dest_nodes,
2340 dest_state->entrance_nodes, new_nodes);
2341 if (BE (err != REG_NOERROR, 0))
2342 return err;
2344 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
2345 preg->newline_anchor);
2346 mctx->state_log[dest_idx]
2347 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2348 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2349 return err;
2350 if (dest_state != NULL)
2351 re_node_set_free (&dest_nodes);
2353 return REG_NOERROR;
2355 #endif /* RE_ENABLE_I18N */
2357 static reg_errcode_t
2358 transit_state_bkref (preg, pstate, mctx)
2359 const regex_t *preg;
2360 re_dfastate_t *pstate;
2361 re_match_context_t *mctx;
2363 reg_errcode_t err;
2364 re_dfastate_t **work_state_log;
2366 work_state_log = re_malloc (re_dfastate_t *,
2367 re_string_cur_idx (mctx->input) + 1);
2368 if (BE (work_state_log == NULL, 0))
2369 return REG_ESPACE;
2371 err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx);
2372 re_free (work_state_log);
2373 return err;
2376 /* Caller must allocate `work_state_log'. */
2378 static reg_errcode_t
2379 transit_state_bkref_loop (preg, nodes, work_state_log, mctx)
2380 const regex_t *preg;
2381 re_node_set *nodes;
2382 re_dfastate_t **work_state_log;
2383 re_match_context_t *mctx;
2385 reg_errcode_t err;
2386 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2387 int i;
2388 regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1);
2389 int cur_str_idx = re_string_cur_idx (mctx->input);
2390 if (BE (cur_regs == NULL, 0))
2391 return REG_ESPACE;
2393 for (i = 0; i < nodes->nelem; ++i)
2395 int dest_str_idx, subexp_idx, prev_nelem, bkc_idx;
2396 int node_idx = nodes->elems[i];
2397 unsigned int context;
2398 re_token_t *node = dfa->nodes + node_idx;
2399 re_node_set *new_dest_nodes;
2400 re_sift_context_t sctx;
2402 /* Check whether `node' is a backreference or not. */
2403 if (node->type == OP_BACK_REF)
2404 subexp_idx = node->opr.idx;
2405 else if (node->type == OP_CONTEXT_NODE &&
2406 dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
2408 context = re_string_context_at (mctx->input, cur_str_idx,
2409 mctx->eflags, preg->newline_anchor);
2410 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2411 continue;
2412 subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx;
2414 else
2415 continue;
2417 /* `node' is a backreference.
2418 Check the substring which the substring matched. */
2419 sift_ctx_init (&sctx, work_state_log, NULL, node_idx, cur_str_idx,
2420 subexp_idx);
2421 sctx.cur_bkref = node_idx;
2422 match_ctx_clear_flag (mctx);
2423 err = sift_states_backward (preg, mctx, &sctx);
2424 if (BE (err != REG_NOERROR, 0))
2425 return err;
2427 /* And add the epsilon closures (which is `new_dest_nodes') of
2428 the backreference to appropriate state_log. */
2429 #ifdef DEBUG
2430 assert (dfa->nexts[node_idx] != -1);
2431 #endif
2432 for (bkc_idx = 0; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2434 int subexp_len;
2435 re_dfastate_t *dest_state;
2436 struct re_backref_cache_entry *bkref_ent;
2437 bkref_ent = mctx->bkref_ents + bkc_idx;
2438 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2439 continue;
2440 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2441 new_dest_nodes = ((node->type == OP_CONTEXT_NODE && subexp_len == 0)
2442 ? dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure
2443 : dfa->eclosures + dfa->nexts[node_idx]);
2444 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2445 - bkref_ent->subexp_from);
2446 context = (IS_WORD_CHAR (re_string_byte_at (mctx->input,
2447 dest_str_idx - 1))
2448 ? CONTEXT_WORD : 0);
2449 dest_state = mctx->state_log[dest_str_idx];
2450 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2451 : mctx->state_log[cur_str_idx]->nodes.nelem);
2452 /* Add `new_dest_node' to state_log. */
2453 if (dest_state == NULL)
2455 mctx->state_log[dest_str_idx]
2456 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2457 context);
2458 if (BE (mctx->state_log[dest_str_idx] == NULL
2459 && err != REG_NOERROR, 0))
2460 return err;
2462 else
2464 re_node_set dest_nodes;
2465 err = re_node_set_init_union (&dest_nodes,
2466 dest_state->entrance_nodes,
2467 new_dest_nodes);
2468 if (BE (err != REG_NOERROR, 0))
2469 return err;
2470 mctx->state_log[dest_str_idx]
2471 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2472 if (BE (mctx->state_log[dest_str_idx] == NULL
2473 && err != REG_NOERROR, 0))
2474 return err;
2475 re_node_set_free (&dest_nodes);
2477 /* We need to check recursively if the backreference can epsilon
2478 transit. */
2479 if (subexp_len == 0
2480 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2482 err = transit_state_bkref_loop (preg, new_dest_nodes,
2483 work_state_log, mctx);
2484 if (BE (err != REG_NOERROR, 0))
2485 return err;
2489 re_free (cur_regs);
2490 return REG_NOERROR;
2493 /* Build transition table for the state.
2494 Return the new table if succeeded, otherwise return NULL. */
2496 static re_dfastate_t **
2497 build_trtable (preg, state, fl_search)
2498 const regex_t *preg;
2499 const re_dfastate_t *state;
2500 int fl_search;
2502 reg_errcode_t err;
2503 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2504 int i, j, k, ch;
2505 int ndests; /* Number of the destination states from `state'. */
2506 re_dfastate_t **trtable, **dest_states, **dest_states_word, **dest_states_nl;
2507 re_node_set follows, *dests_node;
2508 bitset *dests_ch;
2509 bitset acceptable;
2511 /* We build DFA states which corresponds to the destination nodes
2512 from `state'. `dests_node[i]' represents the nodes which i-th
2513 destination state contains, and `dests_ch[i]' represents the
2514 characters which i-th destination state accepts. */
2515 dests_node = re_malloc (re_node_set, SBC_MAX);
2516 dests_ch = re_malloc (bitset, SBC_MAX);
2518 /* Initialize transiton table. */
2519 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
2520 if (BE (dests_node == NULL || dests_ch == NULL || trtable == NULL, 0))
2521 return NULL;
2523 /* At first, group all nodes belonging to `state' into several
2524 destinations. */
2525 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
2526 if (BE (ndests <= 0, 0))
2528 re_free (dests_node);
2529 re_free (dests_ch);
2530 /* Return NULL in case of an error, trtable otherwise. */
2531 return (ndests < 0) ? NULL : trtable;
2534 dest_states = re_malloc (re_dfastate_t *, ndests);
2535 dest_states_word = re_malloc (re_dfastate_t *, ndests);
2536 dest_states_nl = re_malloc (re_dfastate_t *, ndests);
2537 bitset_empty (acceptable);
2539 err = re_node_set_alloc (&follows, ndests + 1);
2540 if (BE (dest_states == NULL || dest_states_word == NULL
2541 || dest_states_nl == NULL || err != REG_NOERROR, 0))
2542 return NULL;
2544 /* Then build the states for all destinations. */
2545 for (i = 0; i < ndests; ++i)
2547 int next_node;
2548 re_node_set_empty (&follows);
2549 /* Merge the follows of this destination states. */
2550 for (j = 0; j < dests_node[i].nelem; ++j)
2552 next_node = dfa->nexts[dests_node[i].elems[j]];
2553 if (next_node != -1)
2555 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
2556 if (BE (err != REG_NOERROR, 0))
2557 return NULL;
2560 /* If search flag is set, merge the initial state. */
2561 if (fl_search)
2563 #ifdef RE_ENABLE_I18N
2564 int not_initial = 0;
2565 for (j = 0; j < follows.nelem; ++j)
2566 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
2568 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
2569 break;
2571 if (!not_initial)
2572 #endif
2574 err = re_node_set_merge (&follows,
2575 dfa->init_state->entrance_nodes);
2576 if (BE (err != REG_NOERROR, 0))
2577 return NULL;
2580 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
2581 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
2582 return NULL;
2583 /* If the new state has context constraint,
2584 build appropriate states for these contexts. */
2585 if (dest_states[i]->has_constraint)
2587 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
2588 CONTEXT_WORD);
2589 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
2590 return NULL;
2591 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
2592 CONTEXT_NEWLINE);
2593 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
2594 return NULL;
2596 else
2598 dest_states_word[i] = dest_states[i];
2599 dest_states_nl[i] = dest_states[i];
2601 bitset_merge (acceptable, dests_ch[i]);
2604 /* Update the transition table. */
2605 /* For all characters ch...: */
2606 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
2607 for (j = 0; j < UINT_BITS; ++j, ++ch)
2608 if ((acceptable[i] >> j) & 1)
2610 /* The current state accepts the character ch. */
2611 if (IS_WORD_CHAR (ch))
2613 for (k = 0; k < ndests; ++k)
2614 if ((dests_ch[k][i] >> j) & 1)
2616 /* k-th destination accepts the word character ch. */
2617 trtable[ch] = dest_states_word[k];
2618 /* There must be only one destination which accepts
2619 character ch. See group_nodes_into_DFAstates. */
2620 break;
2623 else /* not WORD_CHAR */
2625 for (k = 0; k < ndests; ++k)
2626 if ((dests_ch[k][i] >> j) & 1)
2628 /* k-th destination accepts the non-word character ch. */
2629 trtable[ch] = dest_states[k];
2630 /* There must be only one destination which accepts
2631 character ch. See group_nodes_into_DFAstates. */
2632 break;
2636 /* new line */
2637 if (bitset_contain (acceptable, NEWLINE_CHAR))
2639 /* The current state accepts newline character. */
2640 for (k = 0; k < ndests; ++k)
2641 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
2643 /* k-th destination accepts newline character. */
2644 trtable[NEWLINE_CHAR] = dest_states_nl[k];
2645 /* There must be only one destination which accepts
2646 newline. See group_nodes_into_DFAstates. */
2647 break;
2651 re_free (dest_states_nl);
2652 re_free (dest_states_word);
2653 re_free (dest_states);
2655 re_node_set_free (&follows);
2656 for (i = 0; i < ndests; ++i)
2657 re_node_set_free (dests_node + i);
2659 re_free (dests_ch);
2660 re_free (dests_node);
2662 return trtable;
2665 /* Group all nodes belonging to STATE into several destinations.
2666 Then for all destinations, set the nodes belonging to the destination
2667 to DESTS_NODE[i] and set the characters accepted by the destination
2668 to DEST_CH[i]. This function return the number of destinations. */
2670 static int
2671 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
2672 const regex_t *preg;
2673 const re_dfastate_t *state;
2674 re_node_set *dests_node;
2675 bitset *dests_ch;
2677 reg_errcode_t err;
2678 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2679 int i, j, k;
2680 int ndests; /* Number of the destinations from `state'. */
2681 bitset accepts; /* Characters a node can accept. */
2682 const re_node_set *cur_nodes = &state->nodes;
2683 bitset_empty (accepts);
2684 ndests = 0;
2686 /* For all the nodes belonging to `state', */
2687 for (i = 0; i < cur_nodes->nelem; ++i)
2689 unsigned int constraint = 0;
2690 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
2691 re_token_type_t type = node->type;
2693 if (type == OP_CONTEXT_NODE)
2695 constraint = node->constraint;
2696 node = dfa->nodes + node->opr.ctx_info->entity;
2697 type = node->type;
2700 /* Enumerate all single byte character this node can accept. */
2701 if (type == CHARACTER)
2702 bitset_set (accepts, node->opr.c);
2703 else if (type == SIMPLE_BRACKET)
2705 bitset_merge (accepts, node->opr.sbcset);
2707 else if (type == OP_PERIOD)
2709 bitset_set_all (accepts);
2710 if (!(preg->syntax & RE_DOT_NEWLINE))
2711 bitset_clear (accepts, '\n');
2712 if (preg->syntax & RE_DOT_NOT_NULL)
2713 bitset_clear (accepts, '\0');
2715 else
2716 continue;
2718 /* Check the `accepts' and sift the characters which are not
2719 match it the context. */
2720 if (constraint)
2722 if (constraint & NEXT_WORD_CONSTRAINT)
2723 for (j = 0; j < BITSET_UINTS; ++j)
2724 accepts[j] &= dfa->word_char[j];
2725 else if (constraint & NEXT_NOTWORD_CONSTRAINT)
2726 for (j = 0; j < BITSET_UINTS; ++j)
2727 accepts[j] &= ~dfa->word_char[j];
2728 else if (constraint & NEXT_NEWLINE_CONSTRAINT)
2730 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
2731 bitset_empty (accepts);
2732 if (accepts_newline)
2733 bitset_set (accepts, NEWLINE_CHAR);
2734 else
2735 continue;
2739 /* Then divide `accepts' into DFA states, or create a new
2740 state. */
2741 for (j = 0; j < ndests; ++j)
2743 bitset intersec; /* Intersection sets, see below. */
2744 bitset remains;
2745 /* Flags, see below. */
2746 int has_intersec, not_subset, not_consumed;
2748 /* Optimization, skip if this state doesn't accept the character. */
2749 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
2750 continue;
2752 /* Enumerate the intersection set of this state and `accepts'. */
2753 has_intersec = 0;
2754 for (k = 0; k < BITSET_UINTS; ++k)
2755 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
2756 /* And skip if the intersection set is empty. */
2757 if (!has_intersec)
2758 continue;
2760 /* Then check if this state is a subset of `accepts'. */
2761 not_subset = not_consumed = 0;
2762 for (k = 0; k < BITSET_UINTS; ++k)
2764 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
2765 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
2768 /* If this state isn't a subset of `accepts', create a
2769 new group state, which has the `remains'. */
2770 if (not_subset)
2772 bitset_copy (dests_ch[ndests], remains);
2773 bitset_copy (dests_ch[j], intersec);
2774 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
2775 if (BE (err != REG_NOERROR, 0))
2776 return -1;
2777 ++ndests;
2780 /* Put the position in the current group. */
2781 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
2782 if (BE (err < 0, 0))
2783 return -1;
2785 /* If all characters are consumed, go to next node. */
2786 if (!not_consumed)
2787 break;
2789 /* Some characters remain, create a new group. */
2790 if (j == ndests)
2792 bitset_copy (dests_ch[ndests], accepts);
2793 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
2794 if (BE (err != REG_NOERROR, 0))
2795 return -1;
2796 ++ndests;
2797 bitset_empty (accepts);
2800 return ndests;
2803 #ifdef RE_ENABLE_I18N
2804 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2805 Return the number of the bytes the node accepts.
2806 STR_IDX is the current index of the input string.
2808 This function handles the nodes which can accept one character, or
2809 one collating element like '.', '[a-z]', opposite to the other nodes
2810 can only accept one byte. */
2812 static int
2813 check_node_accept_bytes (preg, node_idx, input, str_idx)
2814 const regex_t *preg;
2815 int node_idx, str_idx;
2816 const re_string_t *input;
2818 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2819 const re_token_t *node = dfa->nodes + node_idx;
2820 int elem_len = re_string_elem_size_at (input, str_idx);
2821 int char_len = re_string_char_size_at (input, str_idx);
2822 int i;
2823 # ifdef _LIBC
2824 int j;
2825 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2826 # endif /* _LIBC */
2827 if (elem_len <= 1 && char_len <= 1)
2828 return 0;
2829 if (node->type == OP_PERIOD)
2831 /* '.' accepts any one character except the following two cases. */
2832 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
2833 re_string_byte_at (input, str_idx) == '\n') ||
2834 ((preg->syntax & RE_DOT_NOT_NULL) &&
2835 re_string_byte_at (input, str_idx) == '\0'))
2836 return 0;
2837 return char_len;
2839 else if (node->type == COMPLEX_BRACKET)
2841 const re_charset_t *cset = node->opr.mbcset;
2842 # ifdef _LIBC
2843 const unsigned char *pin = re_string_get_buffer (input) + str_idx;
2844 # endif /* _LIBC */
2845 int match_len = 0;
2846 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
2847 ? re_string_wchar_at (input, str_idx) : 0);
2849 /* match with multibyte character? */
2850 for (i = 0; i < cset->nmbchars; ++i)
2851 if (wc == cset->mbchars[i])
2853 match_len = char_len;
2854 goto check_node_accept_bytes_match;
2856 /* match with character_class? */
2857 for (i = 0; i < cset->nchar_classes; ++i)
2859 wctype_t wt = cset->char_classes[i];
2860 if (__iswctype (wc, wt))
2862 match_len = char_len;
2863 goto check_node_accept_bytes_match;
2867 # ifdef _LIBC
2868 if (nrules != 0)
2870 unsigned int in_collseq = 0;
2871 const int32_t *table, *indirect;
2872 const unsigned char *weights, *extra;
2873 const char *collseqwc;
2874 int32_t idx;
2875 /* This #include defines a local function! */
2876 # include <locale/weight.h>
2878 /* match with collating_symbol? */
2879 if (cset->ncoll_syms)
2880 extra = (const unsigned char *)
2881 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2882 for (i = 0; i < cset->ncoll_syms; ++i)
2884 const unsigned char *coll_sym = extra + cset->coll_syms[i];
2885 /* Compare the length of input collating element and
2886 the length of current collating element. */
2887 if (*coll_sym != elem_len)
2888 continue;
2889 /* Compare each bytes. */
2890 for (j = 0; j < *coll_sym; j++)
2891 if (pin[j] != coll_sym[1 + j])
2892 break;
2893 if (j == *coll_sym)
2895 /* Match if every bytes is equal. */
2896 match_len = j;
2897 goto check_node_accept_bytes_match;
2901 if (cset->nranges)
2903 if (elem_len <= char_len)
2905 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2906 in_collseq = collseq_table_lookup (collseqwc, wc);
2908 else
2909 in_collseq = find_collation_sequence_value (pin, elem_len);
2911 /* match with range expression? */
2912 for (i = 0; i < cset->nranges; ++i)
2913 if (cset->range_starts[i] <= in_collseq
2914 && in_collseq <= cset->range_ends[i])
2916 match_len = elem_len;
2917 goto check_node_accept_bytes_match;
2920 /* match with equivalence_class? */
2921 if (cset->nequiv_classes)
2923 const unsigned char *cp = pin;
2924 table = (const int32_t *)
2925 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2926 weights = (const unsigned char *)
2927 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
2928 extra = (const unsigned char *)
2929 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
2930 indirect = (const int32_t *)
2931 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
2932 idx = findidx (&cp);
2933 if (idx > 0)
2934 for (i = 0; i < cset->nequiv_classes; ++i)
2936 int32_t equiv_class_idx = cset->equiv_classes[i];
2937 size_t weight_len = weights[idx];
2938 if (weight_len == weights[equiv_class_idx])
2940 int cnt = 0;
2941 while (cnt <= weight_len
2942 && (weights[equiv_class_idx + 1 + cnt]
2943 == weights[idx + 1 + cnt]))
2944 ++cnt;
2945 if (cnt > weight_len)
2947 match_len = elem_len;
2948 goto check_node_accept_bytes_match;
2954 else
2955 # endif /* _LIBC */
2957 /* match with range expression? */
2958 #if __GNUC__ >= 2
2959 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
2960 #else
2961 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2962 cmp_buf[2] = wc;
2963 #endif
2964 for (i = 0; i < cset->nranges; ++i)
2966 cmp_buf[0] = cset->range_starts[i];
2967 cmp_buf[4] = cset->range_ends[i];
2968 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2969 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2971 match_len = char_len;
2972 goto check_node_accept_bytes_match;
2976 check_node_accept_bytes_match:
2977 if (!cset->non_match)
2978 return match_len;
2979 else
2981 if (match_len > 0)
2982 return 0;
2983 else
2984 return (elem_len > char_len) ? elem_len : char_len;
2987 return 0;
2990 # ifdef _LIBC
2991 static unsigned int
2992 find_collation_sequence_value (mbs, mbs_len)
2993 const unsigned char *mbs;
2994 size_t mbs_len;
2996 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2997 if (nrules == 0)
2999 if (mbs_len == 1)
3001 /* No valid character. Match it as a single byte character. */
3002 const unsigned char *collseq = (const unsigned char *)
3003 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3004 return collseq[mbs[0]];
3006 return UINT_MAX;
3008 else
3010 int32_t idx;
3011 const unsigned char *extra = (const unsigned char *)
3012 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3014 for (idx = 0; ;)
3016 int mbs_cnt, found = 0;
3017 int32_t elem_mbs_len;
3018 /* Skip the name of collating element name. */
3019 idx = idx + extra[idx] + 1;
3020 elem_mbs_len = extra[idx++];
3021 if (mbs_len == elem_mbs_len)
3023 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3024 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3025 break;
3026 if (mbs_cnt == elem_mbs_len)
3027 /* Found the entry. */
3028 found = 1;
3030 /* Skip the byte sequence of the collating element. */
3031 idx += elem_mbs_len;
3032 /* Adjust for the alignment. */
3033 idx = (idx + 3) & ~3;
3034 /* Skip the collation sequence value. */
3035 idx += sizeof (uint32_t);
3036 /* Skip the wide char sequence of the collating element. */
3037 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3038 /* If we found the entry, return the sequence value. */
3039 if (found)
3040 return *(uint32_t *) (extra + idx);
3041 /* Skip the collation sequence value. */
3042 idx += sizeof (uint32_t);
3046 # endif /* _LIBC */
3047 #endif /* RE_ENABLE_I18N */
3049 /* Check whether the node accepts the byte which is IDX-th
3050 byte of the INPUT. */
3052 static int
3053 check_node_accept (preg, node, mctx, idx)
3054 const regex_t *preg;
3055 const re_token_t *node;
3056 const re_match_context_t *mctx;
3057 int idx;
3059 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3060 const re_token_t *cur_node;
3061 unsigned char ch;
3062 if (node->type == OP_CONTEXT_NODE)
3064 /* The node has constraints. Check whether the current context
3065 satisfies the constraints. */
3066 unsigned int context = re_string_context_at (mctx->input, idx,
3067 mctx->eflags,
3068 preg->newline_anchor);
3069 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3070 return 0;
3071 cur_node = dfa->nodes + node->opr.ctx_info->entity;
3073 else
3074 cur_node = node;
3076 ch = re_string_byte_at (mctx->input, idx);
3077 if (cur_node->type == CHARACTER)
3078 return cur_node->opr.c == ch;
3079 else if (cur_node->type == SIMPLE_BRACKET)
3080 return bitset_contain (cur_node->opr.sbcset, ch);
3081 else if (cur_node->type == OP_PERIOD)
3082 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3083 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3084 else
3085 return 0;
3088 /* Extend the buffers, if the buffers have run out. */
3090 static reg_errcode_t
3091 extend_buffers (mctx)
3092 re_match_context_t *mctx;
3094 reg_errcode_t ret;
3095 re_string_t *pstr = mctx->input;
3097 /* Double the lengthes of the buffers. */
3098 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3099 if (BE (ret != REG_NOERROR, 0))
3100 return ret;
3102 if (mctx->state_log != NULL)
3104 /* And double the length of state_log. */
3105 mctx->state_log = re_realloc (mctx->state_log, re_dfastate_t *,
3106 pstr->bufs_len * 2);
3107 if (BE (mctx->state_log == NULL, 0))
3108 return REG_ESPACE;
3111 /* Then reconstruct the buffers. */
3112 if (pstr->icase)
3114 #ifdef RE_ENABLE_I18N
3115 if (MB_CUR_MAX > 1)
3116 build_wcs_upper_buffer (pstr);
3117 else
3118 #endif /* RE_ENABLE_I18N */
3119 build_upper_buffer (pstr);
3121 else
3123 #ifdef RE_ENABLE_I18N
3124 if (MB_CUR_MAX > 1)
3125 build_wcs_buffer (pstr);
3126 else
3127 #endif /* RE_ENABLE_I18N */
3129 if (pstr->trans != NULL)
3130 re_string_translate_buffer (pstr);
3131 else
3132 pstr->valid_len = pstr->bufs_len;
3135 return REG_NOERROR;
3139 /* Functions for matching context. */
3141 static reg_errcode_t
3142 match_ctx_init (mctx, eflags, input, n)
3143 re_match_context_t *mctx;
3144 int eflags, n;
3145 re_string_t *input;
3147 mctx->eflags = eflags;
3148 mctx->input = input;
3149 mctx->match_last = -1;
3150 if (n > 0)
3152 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
3153 if (BE (mctx->bkref_ents == NULL, 0))
3154 return REG_ESPACE;
3156 else
3157 mctx->bkref_ents = NULL;
3158 mctx->nbkref_ents = 0;
3159 mctx->abkref_ents = n;
3160 mctx->max_mb_elem_len = 0;
3161 return REG_NOERROR;
3164 static void
3165 match_ctx_free (mctx)
3166 re_match_context_t *mctx;
3168 re_free (mctx->bkref_ents);
3171 /* Add a new backreference entry to the cache. */
3173 static reg_errcode_t
3174 match_ctx_add_entry (mctx, node, str_idx, from, to)
3175 re_match_context_t *mctx;
3176 int node, str_idx, from, to;
3178 if (mctx->nbkref_ents >= mctx->abkref_ents)
3180 mctx->bkref_ents = re_realloc (mctx->bkref_ents,
3181 struct re_backref_cache_entry,
3182 mctx->abkref_ents * 2);
3183 if (BE (mctx->bkref_ents == NULL, 0))
3184 return REG_ESPACE;
3185 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
3186 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3187 mctx->abkref_ents *= 2;
3189 mctx->bkref_ents[mctx->nbkref_ents].node = node;
3190 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
3191 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
3192 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
3193 mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
3194 if (mctx->max_mb_elem_len < to - from)
3195 mctx->max_mb_elem_len = to - from;
3196 return REG_NOERROR;
3199 static void
3200 match_ctx_clear_flag (mctx)
3201 re_match_context_t *mctx;
3203 int i;
3204 for (i = 0; i < mctx->nbkref_ents; ++i)
3206 mctx->bkref_ents[i].flag = 0;
3210 static void
3211 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
3212 check_subexp)
3213 re_sift_context_t *sctx;
3214 re_dfastate_t **sifted_sts, **limited_sts;
3215 int last_node, last_str_idx, check_subexp;
3217 sctx->sifted_states = sifted_sts;
3218 sctx->limited_states = limited_sts;
3219 sctx->last_node = last_node;
3220 sctx->last_str_idx = last_str_idx;
3221 sctx->check_subexp = check_subexp;
3222 re_node_set_init_empty (&sctx->limits);