* sysdeps/m68k/fpu/bits/mathinline.h (isgreater, isgreaterequal)
[glibc.git] / posix / regexec.c
blob142127883d7761ac7c78e9fb921832c85f88e721
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #if defined HAVE_WCHAR_H || defined _LIBC
28 # include <wchar.h>
29 #endif /* HAVE_WCHAR_H || _LIBC */
30 #if defined HAVE_WCTYPE_H || defined _LIBC
31 # include <wctype.h>
32 #endif /* HAVE_WCTYPE_H || _LIBC */
34 #ifdef _LIBC
35 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
36 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
37 # include <locale/localeinfo.h>
38 # include <locale/elem-hash.h>
39 # include <locale/coll-lookup.h>
40 # endif
41 #endif
43 #include "regex.h"
44 #include "regex_internal.h"
46 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
47 re_string_t *input, int n);
48 static void match_ctx_free (re_match_context_t *cache);
49 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
50 int from, int to);
51 static reg_errcode_t re_search_internal (const regex_t *preg,
52 const char *string, int length,
53 int start, int range, int stop,
54 size_t nmatch, regmatch_t pmatch[],
55 int eflags);
56 static int re_search_2_stub (struct re_pattern_buffer *bufp,
57 const char *string1, int length1,
58 const char *string2, int length2,
59 int start, int range, struct re_registers *regs,
60 int stop, int ret_len);
61 static int re_search_stub (struct re_pattern_buffer *bufp,
62 const char *string, int length, int start,
63 int range, int stop, struct re_registers *regs,
64 int ret_len);
65 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
66 int nregs, int regs_allocated);
67 static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
68 const regex_t *preg,
69 const re_match_context_t *mctx,
70 int idx);
71 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
72 int fl_search, int fl_longest_match);
73 static int check_halt_node_context (const re_dfa_t *dfa, int node,
74 unsigned int context);
75 static int check_halt_state_context (const regex_t *preg,
76 const re_dfastate_t *state,
77 const re_match_context_t *mctx, int idx);
78 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
79 int cur_idx, int nmatch);
80 static int proceed_next_node (const regex_t *preg,
81 const re_match_context_t *mctx,
82 int *pidx, int node, re_node_set *eps_via_nodes);
83 static reg_errcode_t set_regs (const regex_t *preg,
84 const re_match_context_t *mctx,
85 size_t nmatch, regmatch_t *pmatch, int last);
86 #ifdef RE_ENABLE_I18N
87 static int sift_states_iter_mb (const regex_t *preg,
88 const re_match_context_t *mctx,
89 int node_idx, int str_idx, int max_str_idx);
90 #endif /* RE_ENABLE_I18N */
91 static int sift_states_iter_bkref (const re_dfa_t *dfa,
92 re_dfastate_t **state_log,
93 struct re_backref_cache_entry *mctx_entry,
94 int node_idx, int idx, int match_last);
95 static reg_errcode_t sift_states_backward (const regex_t *preg,
96 const re_match_context_t *mctx,
97 int last_node);
98 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
99 int next_state_log_idx);
100 static reg_errcode_t add_epsilon_backreference (const re_dfa_t *dfa,
101 const re_match_context_t *mctx,
102 const re_node_set *plog,
103 int idx,
104 re_node_set *state_buf);
105 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
106 re_match_context_t *mctx,
107 re_dfastate_t *state, int fl_search);
108 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
109 re_dfastate_t *pstate,
110 int fl_search,
111 re_match_context_t *mctx);
112 #ifdef RE_ENABLE_I18N
113 static reg_errcode_t transit_state_mb (const regex_t *preg,
114 re_dfastate_t *pstate,
115 re_match_context_t *mctx);
116 #endif /* RE_ENABLE_I18N */
117 static reg_errcode_t transit_state_bkref (const regex_t *preg,
118 re_dfastate_t *pstate,
119 re_match_context_t *mctx);
120 static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
121 re_node_set *nodes,
122 re_dfastate_t **work_state_log,
123 re_match_context_t *mctx);
124 static re_dfastate_t **build_trtable (const regex_t *dfa,
125 const re_dfastate_t *state,
126 int fl_search);
127 #ifdef RE_ENABLE_I18N
128 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
129 const re_string_t *input, int idx);
130 # ifdef _LIBC
131 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
132 size_t name_len);
133 # endif /* _LIBC */
134 #endif /* RE_ENABLE_I18N */
135 static int group_nodes_into_DFAstates (const regex_t *dfa,
136 const re_dfastate_t *state,
137 re_node_set *states_node,
138 bitset *states_ch);
139 static int check_node_accept (const regex_t *preg, const re_token_t *node,
140 const re_match_context_t *mctx, int idx);
141 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
143 /* Entry point for POSIX code. */
145 /* regexec searches for a given pattern, specified by PREG, in the
146 string STRING.
148 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
149 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
150 least NMATCH elements, and we set them to the offsets of the
151 corresponding matched substrings.
153 EFLAGS specifies `execution flags' which affect matching: if
154 REG_NOTBOL is set, then ^ does not match at the beginning of the
155 string; if REG_NOTEOL is set, then $ does not match at the end.
157 We return 0 if we find a match and REG_NOMATCH if not. */
160 regexec (preg, string, nmatch, pmatch, eflags)
161 const regex_t *__restrict preg;
162 const char *__restrict string;
163 size_t nmatch;
164 regmatch_t pmatch[];
165 int eflags;
167 reg_errcode_t err;
168 int length = strlen (string);
169 if (preg->no_sub)
170 err = re_search_internal (preg, string, length, 0, length, length, 0,
171 NULL, eflags);
172 else
173 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
174 pmatch, eflags);
175 return err != REG_NOERROR;
177 #ifdef _LIBC
178 weak_alias (__regexec, regexec)
179 #endif
181 /* Entry points for GNU code. */
183 /* re_match, re_search, re_match_2, re_search_2
185 The former two functions operate on STRING with length LENGTH,
186 while the later two operate on concatenation of STRING1 and STRING2
187 with lengths LENGTH1 and LENGTH2, respectively.
189 re_match() matches the compiled pattern in BUFP against the string,
190 starting at index START.
192 re_search() first tries matching at index START, then it tries to match
193 starting from index START + 1, and so on. The last start position tried
194 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
195 way as re_match().)
197 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
198 the first STOP characters of the concatenation of the strings should be
199 concerned.
201 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
202 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
203 computed relative to the concatenation, not relative to the individual
204 strings.)
206 On success, re_match* functions return the length of the match, re_search*
207 return the position of the start of the match. Return value -1 means no
208 match was found and -2 indicates an internal error. */
211 re_match (bufp, string, length, start, regs)
212 struct re_pattern_buffer *bufp;
213 const char *string;
214 int length, start;
215 struct re_registers *regs;
217 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
219 #ifdef _LIBC
220 weak_alias (__re_match, re_match)
221 #endif
224 re_search (bufp, string, length, start, range, regs)
225 struct re_pattern_buffer *bufp;
226 const char *string;
227 int length, start, range;
228 struct re_registers *regs;
230 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
232 #ifdef _LIBC
233 weak_alias (__re_search, re_search)
234 #endif
237 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
238 struct re_pattern_buffer *bufp;
239 const char *string1, *string2;
240 int length1, length2, start, stop;
241 struct re_registers *regs;
243 return re_search_2_stub (bufp, string1, length1, string2, length2,
244 start, 0, regs, stop, 1);
246 #ifdef _LIBC
247 weak_alias (__re_match_2, re_match_2)
248 #endif
251 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
252 struct re_pattern_buffer *bufp;
253 const char *string1, *string2;
254 int length1, length2, start, range, stop;
255 struct re_registers *regs;
257 return re_search_2_stub (bufp, string1, length1, string2, length2,
258 start, range, regs, stop, 0);
260 #ifdef _LIBC
261 weak_alias (__re_search_2, re_search_2)
262 #endif
264 static int
265 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
266 stop, ret_len)
267 struct re_pattern_buffer *bufp;
268 const char *string1, *string2;
269 int length1, length2, start, range, stop, ret_len;
270 struct re_registers *regs;
272 const char *str;
273 int rval;
274 int len = length1 + length2;
275 int free_str = 0;
277 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
278 return -2;
280 /* Concatenate the strings. */
281 if (length2 > 0)
282 if (length1 > 0)
284 char *s = re_malloc (char, len);
286 if (BE (s == NULL, 0))
287 return -2;
288 memcpy (s, string1, length1);
289 memcpy (s + length1, string2, length2);
290 str = s;
291 free_str = 1;
293 else
294 str = string2;
295 else
296 str = string1;
298 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
299 ret_len);
300 if (free_str)
301 re_free ((char *) str);
302 return rval;
305 /* The parameters have the same meaning as those of re_search.
306 Additional parameters:
307 If RET_LEN is nonzero the length of the match is returned (re_match style);
308 otherwise the position of the match is returned. */
310 static int
311 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
312 struct re_pattern_buffer *bufp;
313 const char *string;
314 int length, start, range, stop, ret_len;
315 struct re_registers *regs;
317 reg_errcode_t result;
318 regmatch_t *pmatch;
319 int nregs, rval;
320 int eflags = 0;
322 /* Check for out-of-range. */
323 if (BE (start < 0 || start > length || range < 0, 0))
324 return -1;
325 if (BE (start + range > length, 0))
326 range = length - start;
328 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
329 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
331 /* Compile fastmap if we haven't yet. */
332 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
333 re_compile_fastmap (bufp);
335 if (BE (bufp->no_sub, 0))
336 regs = NULL;
338 /* We need at least 1 register. */
339 if (regs == NULL)
340 nregs = 1;
341 else if (BE (bufp->regs_allocated == REGS_FIXED &&
342 regs->num_regs < bufp->re_nsub + 1, 0))
344 nregs = regs->num_regs;
345 if (BE (nregs < 1, 0))
347 /* Nothing can be copied to regs. */
348 regs = NULL;
349 nregs = 1;
352 else
353 nregs = bufp->re_nsub + 1;
354 pmatch = re_malloc (regmatch_t, nregs);
355 if (BE (pmatch == NULL, 0))
356 return -2;
358 result = re_search_internal (bufp, string, length, start, range, stop,
359 nregs, pmatch, eflags);
361 rval = 0;
363 /* I hope we needn't fill ther regs with -1's when no match was found. */
364 if (result != REG_NOERROR)
365 rval = -1;
366 else if (regs != NULL)
368 /* If caller wants register contents data back, copy them. */
369 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
370 bufp->regs_allocated);
371 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
372 rval = -2;
375 if (BE (rval == 0, 1))
377 if (ret_len)
379 assert (pmatch[0].rm_so == start);
380 rval = pmatch[0].rm_eo - start;
382 else
383 rval = pmatch[0].rm_so;
385 re_free (pmatch);
386 return rval;
389 static unsigned
390 re_copy_regs (regs, pmatch, nregs, regs_allocated)
391 struct re_registers *regs;
392 regmatch_t *pmatch;
393 int nregs, regs_allocated;
395 int rval = REGS_REALLOCATE;
396 int i;
397 int need_regs = nregs + 1;
398 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
399 uses. */
401 /* Have the register data arrays been allocated? */
402 if (regs_allocated == REGS_UNALLOCATED)
403 { /* No. So allocate them with malloc. */
404 regs->start = re_malloc (regoff_t, need_regs);
405 if (BE (regs->start == NULL, 0))
406 return REGS_UNALLOCATED;
407 regs->end = re_malloc (regoff_t, need_regs);
408 if (BE (regs->end == NULL, 0))
410 re_free (regs->start);
411 return REGS_UNALLOCATED;
413 regs->num_regs = need_regs;
415 else if (regs_allocated == REGS_REALLOCATE)
416 { /* Yes. If we need more elements than were already
417 allocated, reallocate them. If we need fewer, just
418 leave it alone. */
419 if (need_regs > regs->num_regs)
421 regs->start = re_realloc (regs->start, regoff_t, need_regs);
422 if (BE (regs->start == NULL, 0))
424 if (regs->end != NULL)
425 re_free (regs->end);
426 return REGS_UNALLOCATED;
428 regs->end = re_realloc (regs->end, regoff_t, need_regs);
429 if (BE (regs->end == NULL, 0))
431 re_free (regs->start);
432 return REGS_UNALLOCATED;
434 regs->num_regs = need_regs;
437 else
439 assert (regs_allocated == REGS_FIXED);
440 /* This function may not be called with REGS_FIXED and nregs too big. */
441 assert (regs->num_regs >= nregs);
442 rval = REGS_FIXED;
445 /* Copy the regs. */
446 for (i = 0; i < nregs; ++i)
448 regs->start[i] = pmatch[i].rm_so;
449 regs->end[i] = pmatch[i].rm_eo;
451 for ( ; i < regs->num_regs; ++i)
452 regs->start[i] = regs->end[i] = -1;
454 return rval;
457 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
458 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
459 this memory for recording register information. STARTS and ENDS
460 must be allocated using the malloc library routine, and must each
461 be at least NUM_REGS * sizeof (regoff_t) bytes long.
463 If NUM_REGS == 0, then subsequent matches should allocate their own
464 register data.
466 Unless this function is called, the first search or match using
467 PATTERN_BUFFER will allocate its own register data, without
468 freeing the old data. */
470 void
471 re_set_registers (bufp, regs, num_regs, starts, ends)
472 struct re_pattern_buffer *bufp;
473 struct re_registers *regs;
474 unsigned num_regs;
475 regoff_t *starts, *ends;
477 if (num_regs)
479 bufp->regs_allocated = REGS_REALLOCATE;
480 regs->num_regs = num_regs;
481 regs->start = starts;
482 regs->end = ends;
484 else
486 bufp->regs_allocated = REGS_UNALLOCATED;
487 regs->num_regs = 0;
488 regs->start = regs->end = (regoff_t *) 0;
491 #ifdef _LIBC
492 weak_alias (__re_set_registers, re_set_registers)
493 #endif
495 /* Entry points compatible with 4.2 BSD regex library. We don't define
496 them unless specifically requested. */
498 #if defined _REGEX_RE_COMP || defined _LIBC
500 # ifdef _LIBC
501 weak_function
502 # endif
503 re_exec (s)
504 const char *s;
506 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
508 #endif /* _REGEX_RE_COMP */
510 static re_node_set empty_set;
512 /* Internal entry point. */
514 /* Searches for a compiled pattern PREG in the string STRING, whose
515 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
516 mingings with regexec. START, and RANGE have the same meanings
517 with re_search.
518 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
519 otherwise return the error code.
520 Note: We assume front end functions already check ranges.
521 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
523 static reg_errcode_t
524 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
525 eflags)
526 const regex_t *preg;
527 const char *string;
528 int length, start, range, stop, eflags;
529 size_t nmatch;
530 regmatch_t pmatch[];
532 reg_errcode_t err;
533 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
534 re_string_t input;
535 int left_lim, right_lim, incr;
536 int fl_longest_match, match_first, match_last = -1;
537 re_match_context_t mctx;
538 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate)
539 ? preg->fastmap : NULL);
541 /* Check if the DFA haven't been compiled. */
542 if (BE (preg->used == 0 || dfa->init_state == NULL
543 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
544 || dfa->init_state_begbuf == NULL, 0))
545 return REG_NOMATCH;
547 re_node_set_init_empty (&empty_set);
549 /* We must check the longest matching, if nmatch > 0. */
550 fl_longest_match = (nmatch != 0);
552 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
553 preg->translate, preg->syntax & RE_ICASE);
554 if (BE (err != REG_NOERROR, 0))
555 return err;
556 input.stop = stop;
558 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
559 if (BE (err != REG_NOERROR, 0))
560 return err;
562 /* We will log all the DFA states through which the dfa pass,
563 if nmatch > 1, or this dfa has "multibyte node", which is a
564 back-reference or a node which can accept multibyte character or
565 multi character collating element. */
566 if (nmatch > 1 || dfa->has_mb_node)
568 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
569 if (BE (mctx.state_log == NULL, 0))
570 return REG_ESPACE;
572 else
573 mctx.state_log = NULL;
575 #ifdef DEBUG
576 /* We assume front-end functions already check them. */
577 assert (start + range >= 0 && start + range <= length);
578 #endif
580 match_first = start;
581 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
582 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
584 /* Check incrementally whether of not the input string match. */
585 incr = (range < 0) ? -1 : 1;
586 left_lim = (range < 0) ? start + range : start;
587 right_lim = (range < 0) ? start : start + range;
589 for (;;)
591 /* At first get the current byte from input string. */
592 int ch;
593 if (MB_CUR_MAX > 1 && (preg->syntax & RE_ICASE || preg->translate))
595 /* In this case, we can't determin easily the current byte,
596 since it might be a component byte of a multibyte character.
597 Then we use the constructed buffer instead. */
598 /* If MATCH_FIRST is out of the valid range, reconstruct the
599 buffers. */
600 if (input.raw_mbs_idx + input.valid_len <= match_first)
601 re_string_reconstruct (&input, match_first, eflags,
602 preg->newline_anchor);
603 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
604 Note that MATCH_FIRST must not be smaller than 0. */
605 ch = ((match_first >= length) ? 0
606 : re_string_byte_at (&input, match_first - input.raw_mbs_idx));
608 else
610 /* We apply translate/conversion manually, since it is trivial
611 in this case. */
612 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
613 Note that MATCH_FIRST must not be smaller than 0. */
614 ch = (match_first < length) ? (unsigned char)string[match_first] : 0;
615 /* Apply translation if we need. */
616 ch = preg->translate ? preg->translate[ch] : ch;
617 /* In case of case insensitive mode, convert to upper case. */
618 ch = ((preg->syntax & RE_ICASE) && islower (ch)) ? toupper (ch) : ch;
621 /* Eliminate inappropriate one by fastmap. */
622 if (preg->can_be_null || fastmap == NULL || fastmap[ch])
624 /* Reconstruct the buffers so that the matcher can assume that
625 the matching starts from the begining of the buffer. */
626 re_string_reconstruct (&input, match_first, eflags,
627 preg->newline_anchor);
628 #ifdef RE_ENABLE_I18N
629 /* Eliminate it when it is a component of a multibyte character
630 and isn't the head of a multibyte character. */
631 if (MB_CUR_MAX == 1 || re_string_first_byte (&input, 0))
632 #endif
634 /* It seems to be appropriate one, then use the matcher. */
635 /* We assume that the matching starts from 0. */
636 mctx.state_log_top = mctx.nbkref_ents = mctx.max_bkref_len = 0;
637 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
638 if (match_last != -1)
640 if (BE (match_last == -2, 0))
641 return REG_ESPACE;
642 else
643 break; /* We found a matching. */
647 /* Update counter. */
648 match_first += incr;
649 if (match_first < left_lim || right_lim < match_first)
650 break;
653 /* Set pmatch[] if we need. */
654 if (match_last != -1 && nmatch > 0)
656 int reg_idx;
658 /* Initialize registers. */
659 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
660 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
662 /* Set the points where matching start/end. */
663 pmatch[0].rm_so = 0;
664 mctx.match_last = pmatch[0].rm_eo = match_last;
666 if (!preg->no_sub && nmatch > 1)
668 /* We need the ranges of all the subexpressions. */
669 int halt_node;
670 re_dfastate_t *pstate = mctx.state_log[match_last];
671 #ifdef DEBUG
672 assert (mctx.state_log != NULL);
673 #endif
674 halt_node = check_halt_state_context (preg, pstate, &mctx,
675 match_last);
676 err = sift_states_backward (preg, &mctx, halt_node);
677 if (BE (err != REG_NOERROR, 0))
678 return err;
679 err = set_regs (preg, &mctx, nmatch, pmatch, halt_node);
680 if (BE (err != REG_NOERROR, 0))
681 return err;
684 /* At last, add the offset to the each registers, since we slided
685 the buffers so that We can assume that the matching starts from 0. */
686 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
687 if (pmatch[reg_idx].rm_so != -1)
689 pmatch[reg_idx].rm_so += match_first;
690 pmatch[reg_idx].rm_eo += match_first;
694 re_free (mctx.state_log);
695 if (dfa->nbackref)
696 match_ctx_free (&mctx);
697 re_string_destruct (&input);
698 return (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
701 /* Acquire an initial state and return it.
702 We must select appropriate initial state depending on the context,
703 since initial states may have constraints like "\<", "^", etc.. */
705 static inline re_dfastate_t *
706 acquire_init_state_context (err, preg, mctx, idx)
707 reg_errcode_t *err;
708 const regex_t *preg;
709 const re_match_context_t *mctx;
710 int idx;
712 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
714 *err = REG_NOERROR;
715 if (dfa->init_state->has_constraint)
717 unsigned int context;
718 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
719 preg->newline_anchor);
720 if (IS_WORD_CONTEXT (context))
721 return dfa->init_state_word;
722 else if (IS_ORDINARY_CONTEXT (context))
723 return dfa->init_state;
724 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
725 return dfa->init_state_begbuf;
726 else if (IS_NEWLINE_CONTEXT (context))
727 return dfa->init_state_nl;
728 else if (IS_BEGBUF_CONTEXT (context))
730 /* It is relatively rare case, then calculate on demand. */
731 return re_acquire_state_context (err, dfa,
732 dfa->init_state->entrance_nodes,
733 context);
735 else
736 /* Must not happen? */
737 return dfa->init_state;
739 else
740 return dfa->init_state;
743 /* Check whether the regular expression match input string INPUT or not,
744 and return the index where the matching end, return -1 if not match,
745 or return -2 in case of an error.
746 FL_SEARCH means we must search where the matching starts,
747 FL_LONGEST_MATCH means we want the POSIX longest matching.
748 Note that the matcher assume that the maching starts from the current
749 index of the buffer. */
751 static int
752 check_matching (preg, mctx, fl_search, fl_longest_match)
753 const regex_t *preg;
754 re_match_context_t *mctx;
755 int fl_search, fl_longest_match;
757 reg_errcode_t err;
758 int match = 0;
759 int match_last = -1;
760 int cur_str_idx = re_string_cur_idx (mctx->input);
761 re_dfastate_t *cur_state;
763 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
764 /* An initial state must not be NULL(invalid state). */
765 if (BE (cur_state == NULL, 0))
766 return -2;
767 if (mctx->state_log != NULL)
768 mctx->state_log[cur_str_idx] = cur_state;
769 /* If the RE accepts NULL string. */
770 if (cur_state->halt)
772 if (!cur_state->has_constraint
773 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
775 if (!fl_longest_match)
776 return cur_str_idx;
777 else
779 match_last = cur_str_idx;
780 match = 1;
785 while (!re_string_eoi (mctx->input))
787 cur_state = transit_state (&err, preg, mctx, cur_state,
788 fl_search && !match);
789 if (cur_state == NULL) /* Reached at the invalid state or an error. */
791 cur_str_idx = re_string_cur_idx (mctx->input);
792 if (BE (err != REG_NOERROR, 0))
793 return -2;
794 if (fl_search && !match)
796 /* Restart from initial state, since we are searching
797 the point from where matching start. */
798 #ifdef RE_ENABLE_I18N
799 if (MB_CUR_MAX == 1
800 || re_string_first_byte (mctx->input, cur_str_idx))
801 #endif /* RE_ENABLE_I18N */
802 cur_state = acquire_init_state_context (&err, preg, mctx,
803 cur_str_idx);
804 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
805 return -2;
806 if (mctx->state_log != NULL)
807 mctx->state_log[cur_str_idx] = cur_state;
809 else if (!fl_longest_match && match)
810 break;
811 else /* (fl_longest_match && match) || (!fl_search && !match) */
813 if (mctx->state_log == NULL)
814 break;
815 else
817 int max = mctx->state_log_top;
818 for (; cur_str_idx <= max; ++cur_str_idx)
819 if (mctx->state_log[cur_str_idx] != NULL)
820 break;
821 if (cur_str_idx > max)
822 break;
827 if (cur_state != NULL && cur_state->halt)
829 /* Reached at a halt state.
830 Check the halt state can satisfy the current context. */
831 if (!cur_state->has_constraint
832 || check_halt_state_context (preg, cur_state, mctx,
833 re_string_cur_idx (mctx->input)))
835 /* We found an appropriate halt state. */
836 match_last = re_string_cur_idx (mctx->input);
837 match = 1;
838 if (!fl_longest_match)
839 break;
843 return match_last;
846 /* Check NODE match the current context. */
848 static int check_halt_node_context (dfa, node, context)
849 const re_dfa_t *dfa;
850 int node;
851 unsigned int context;
853 int entity;
854 re_token_type_t type = dfa->nodes[node].type;
855 if (type == END_OF_RE)
856 return 1;
857 if (type != OP_CONTEXT_NODE)
858 return 0;
859 entity = dfa->nodes[node].opr.ctx_info->entity;
860 if (dfa->nodes[entity].type != END_OF_RE
861 || NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, context))
862 return 0;
863 return 1;
866 /* Check the halt state STATE match the current context.
867 Return 0 if not match, if the node, STATE has, is a halt node and
868 match the context, return the node. */
870 static int
871 check_halt_state_context (preg, state, mctx, idx)
872 const regex_t *preg;
873 const re_dfastate_t *state;
874 const re_match_context_t *mctx;
875 int idx;
877 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
878 int i;
879 unsigned int context;
880 #ifdef DEBUG
881 assert (state->halt);
882 #endif
883 context = re_string_context_at (mctx->input, idx, mctx->eflags,
884 preg->newline_anchor);
885 for (i = 0; i < state->nodes.nelem; ++i)
886 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
887 return state->nodes.elems[i];
888 return 0;
891 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
892 corresponding to the DFA).
893 Return the destination node, and update EPS_VIA_NODES, return -1 in case
894 of errors. */
896 static int
897 proceed_next_node (preg, mctx, pidx, node, eps_via_nodes)
898 const regex_t *preg;
899 const re_match_context_t *mctx;
900 int *pidx, node;
901 re_node_set *eps_via_nodes;
903 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
904 int i, err, dest_node, cur_entity;
905 dest_node = -1;
906 cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE)
907 ? dfa->nodes[node].opr.ctx_info->entity : node);
908 if (IS_EPSILON_NODE (dfa->nodes[node].type))
910 int dest_entity = INT_MAX;
911 err = re_node_set_insert (eps_via_nodes, node);
912 if (BE (err < 0, 0))
913 return -1;
914 for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
916 int candidate, candidate_entity;
917 candidate = mctx->state_log[*pidx]->nodes.elems[i];
918 candidate_entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE)
919 ? dfa->nodes[candidate].opr.ctx_info->entity
920 : candidate);
921 if (!re_node_set_contains (dfa->edests + node, candidate))
922 if (candidate == candidate_entity
923 || !re_node_set_contains (dfa->edests + node, candidate_entity))
924 continue;
926 /* In order to avoid infinite loop like "(a*)*". */
927 if (cur_entity > candidate_entity
928 && re_node_set_contains (eps_via_nodes, candidate))
929 continue;
931 if (dest_entity > candidate_entity)
933 dest_node = candidate;
934 dest_entity = candidate_entity;
937 #ifdef DEBUG
938 assert (dest_node != -1);
939 #endif
940 return dest_node;
942 else
944 int naccepted = 0, entity = node;
945 re_token_type_t type = dfa->nodes[node].type;
946 if (type == OP_CONTEXT_NODE)
948 entity = dfa->nodes[node].opr.ctx_info->entity;
949 type = dfa->nodes[entity].type;
952 #ifdef RE_ENABLE_I18N
953 if (ACCEPT_MB_NODE (type))
954 naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx);
955 else
956 #endif /* RE_ENABLE_I18N */
957 if (type == OP_BACK_REF)
959 for (i = 0; i < mctx->nbkref_ents; ++i)
961 if (mctx->bkref_ents[i].node == node
962 && mctx->bkref_ents[i].from == *pidx)
963 naccepted = mctx->bkref_ents[i].to - *pidx;
965 if (naccepted == 0)
967 err = re_node_set_insert (eps_via_nodes, node);
968 if (BE (err < 0, 0))
969 return -1;
970 dest_node = dfa->nexts[node];
971 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
972 dest_node))
973 return dest_node;
974 for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
976 dest_node = mctx->state_log[*pidx]->nodes.elems[i];
977 if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE
978 && (dfa->nexts[node]
979 == dfa->nodes[dest_node].opr.ctx_info->entity)))
980 return dest_node;
985 if (naccepted != 0
986 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
988 dest_node = dfa->nexts[node];
989 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
990 #ifdef DEBUG
991 assert (mctx->state_log[*pidx] != NULL);
992 #endif
993 re_node_set_empty (eps_via_nodes);
994 return dest_node;
997 /* Must not reach here. */
998 #ifdef DEBUG
999 assert (0);
1000 #endif
1001 return 0;
1004 /* Set the positions where the subexpressions are starts/ends to registers
1005 PMATCH.
1006 Note: We assume that pmatch[0] is already set, and
1007 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1009 static reg_errcode_t
1010 set_regs (preg, mctx, nmatch, pmatch, last_node)
1011 const regex_t *preg;
1012 const re_match_context_t *mctx;
1013 size_t nmatch;
1014 regmatch_t *pmatch;
1015 int last_node;
1017 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1018 int idx, cur_node, real_nmatch;
1019 re_node_set eps_via_nodes;
1020 #ifdef DEBUG
1021 assert (nmatch > 1);
1022 assert (mctx->state_log != NULL);
1023 #endif
1024 cur_node = dfa->init_node;
1025 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1026 re_node_set_init_empty (&eps_via_nodes);
1027 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1029 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1030 if (idx == pmatch[0].rm_eo && cur_node == last_node)
1031 break;
1033 /* Proceed to next node. */
1034 cur_node = proceed_next_node (preg, mctx, &idx, cur_node, &eps_via_nodes);
1035 if (BE (cur_node < 0, 0))
1036 return REG_ESPACE;
1038 re_node_set_free (&eps_via_nodes);
1039 return REG_NOERROR;
1042 static void
1043 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1044 re_dfa_t *dfa;
1045 regmatch_t *pmatch;
1046 int cur_node, cur_idx, nmatch;
1048 int type = dfa->nodes[cur_node].type;
1049 int reg_num;
1050 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1051 return;
1052 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1053 if (reg_num >= nmatch)
1054 return;
1055 if (type == OP_OPEN_SUBEXP)
1057 /* We are at the first node of this sub expression. */
1058 pmatch[reg_num].rm_so = cur_idx;
1059 pmatch[reg_num].rm_eo = -1;
1061 else if (type == OP_CLOSE_SUBEXP)
1062 /* We are at the first node of this sub expression. */
1063 pmatch[reg_num].rm_eo = cur_idx;
1066 #define NUMBER_OF_STATE 1
1068 /* This function checks the STATE_LOG from the MCTX->match_last to 0
1069 and sift the nodes in each states according to the following rules.
1070 Updated state_log will be wrote to STATE_LOG.
1072 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1073 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1074 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1075 the LAST_NODE, we throw away the node `a'.
1076 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1077 string `s' and transit to `b':
1078 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1079 away the node `a'.
1080 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1081 throwed away, we throw away the node `a'.
1082 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1083 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1084 node `a'.
1085 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1086 we throw away the node `a'. */
1088 #define STATE_NODE_CONTAINS(state,node) \
1089 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1091 static reg_errcode_t
1092 sift_states_backward (preg, mctx, last_node)
1093 const regex_t *preg;
1094 const re_match_context_t *mctx;
1095 int last_node;
1097 reg_errcode_t err;
1098 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1099 re_node_set state_buf;
1100 int str_idx = mctx->match_last;
1101 re_node_set *plog; /* Points the state_log[str_idx]->nodes */
1103 #ifdef DEBUG
1104 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1105 #endif
1106 err = re_node_set_alloc (&state_buf, NUMBER_OF_STATE);
1107 if (BE (err != REG_NOERROR, 0))
1108 return err;
1109 plog = &mctx->state_log[str_idx]->nodes;
1111 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1112 transit to the last_node and the last_node itself. */
1113 err = re_node_set_intersect (&state_buf, plog, dfa->inveclosures + last_node);
1114 if (BE (err != REG_NOERROR, 0))
1115 return err;
1117 if (mctx->state_log[str_idx] != NULL
1118 && mctx->state_log[str_idx]->has_backref)
1120 err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1121 if (BE (err != REG_NOERROR, 0))
1122 return err;
1125 /* Update state log. */
1126 mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1127 if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1128 return err;
1130 /* Then check each states in the state_log. */
1131 while (str_idx > 0)
1133 int i, j;
1134 /* Update counters. */
1135 re_node_set_empty (&state_buf);
1136 --str_idx;
1137 plog = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1138 : &mctx->state_log[str_idx]->nodes);
1140 /* Then build the next sifted state.
1141 We build the next sifted state on `state_buf', and update
1142 `state_log[str_idx]' with `state_buf'.
1143 Note:
1144 `state_buf' is the sifted state from `state_log[str_idx + 1]'.
1145 `plog' points the node_set of the old `state_log[str_idx]'. */
1146 for (i = 0; i < plog->nelem; i++)
1148 int prev_node = plog->elems[i];
1149 int entity = prev_node;
1150 int naccepted = 0;
1151 re_token_type_t type = dfa->nodes[prev_node].type;
1152 if (type == OP_CONTEXT_NODE)
1154 entity = dfa->nodes[prev_node].opr.ctx_info->entity;
1155 type = dfa->nodes[entity].type;
1158 #ifdef RE_ENABLE_I18N
1159 /* If the node may accept `multi byte'. */
1160 if (ACCEPT_MB_NODE (type))
1161 naccepted = sift_states_iter_mb (preg, mctx, entity, str_idx,
1162 mctx->match_last);
1164 /* If the node is a back reference. */
1165 else
1166 #endif /* RE_ENABLE_I18N */
1167 if (type == OP_BACK_REF)
1168 for (j = 0; j < mctx->nbkref_ents; ++j)
1170 naccepted = sift_states_iter_bkref (dfa, mctx->state_log,
1171 mctx->bkref_ents + j,
1172 prev_node, str_idx,
1173 mctx->match_last);
1174 if (naccepted)
1175 break;
1178 if (!naccepted
1179 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1180 str_idx)
1181 && STATE_NODE_CONTAINS (mctx->state_log[str_idx + 1],
1182 dfa->nexts[prev_node]))
1183 naccepted = 1;
1185 if (naccepted == 0)
1186 continue;
1188 /* `prev_node' may point the entity of the OP_CONTEXT_NODE,
1189 then we use plog->elems[i] instead. */
1190 err = re_node_set_add_intersect (&state_buf, plog,
1191 dfa->inveclosures + prev_node);
1192 if (BE (err != REG_NOERROR, 0))
1193 return err;
1195 if (mctx->state_log[str_idx] != NULL
1196 && mctx->state_log[str_idx]->has_backref)
1198 err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1199 if (BE (err != REG_NOERROR, 0))
1200 return err;
1203 /* Update state_log. */
1204 mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1205 if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1206 return err;
1209 re_node_set_free (&state_buf);
1210 return REG_NOERROR;
1213 /* Helper functions. */
1215 static inline reg_errcode_t
1216 clean_state_log_if_need (mctx, next_state_log_idx)
1217 re_match_context_t *mctx;
1218 int next_state_log_idx;
1220 int top = mctx->state_log_top;
1222 if (next_state_log_idx >= mctx->input->bufs_len
1223 || (next_state_log_idx >= mctx->input->valid_len
1224 && mctx->input->valid_len < mctx->input->len))
1226 reg_errcode_t err;
1227 err = extend_buffers (mctx);
1228 if (BE (err != REG_NOERROR, 0))
1229 return err;
1232 if (top < next_state_log_idx)
1234 memset (mctx->state_log + top + 1, '\0',
1235 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1236 mctx->state_log_top = next_state_log_idx;
1238 return REG_NOERROR;
1241 #ifdef RE_ENABLE_I18N
1242 static int
1243 sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx)
1244 const regex_t *preg;
1245 const re_match_context_t *mctx;
1246 int node_idx, str_idx, max_str_idx;
1248 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1249 int naccepted;
1250 /* Check the node can accept `multi byte'. */
1251 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
1252 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
1253 !STATE_NODE_CONTAINS (mctx->state_log[str_idx + naccepted],
1254 dfa->nexts[node_idx]))
1255 /* The node can't accept the `multi byte', or the
1256 destination was already throwed away, then the node
1257 could't accept the current input `multi byte'. */
1258 naccepted = 0;
1259 /* Otherwise, it is sure that the node could accept
1260 `naccepted' bytes input. */
1261 return naccepted;
1263 #endif /* RE_ENABLE_I18N */
1265 static int
1266 sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_last)
1267 const re_dfa_t *dfa;
1268 re_dfastate_t **state_log;
1269 struct re_backref_cache_entry *mctx_entry;
1270 int node_idx, idx, match_last;
1272 int naccepted = 0;
1273 int from_idx, to_idx;
1274 from_idx = mctx_entry->from;
1275 to_idx = mctx_entry->to;
1276 if (mctx_entry->node == node_idx
1277 && from_idx == idx && to_idx <= match_last
1278 && STATE_NODE_CONTAINS (state_log[to_idx], dfa->nexts[node_idx]))
1279 naccepted = to_idx - from_idx;
1280 return naccepted;
1283 static reg_errcode_t
1284 add_epsilon_backreference (dfa, mctx, plog, idx, state_buf)
1285 const re_dfa_t *dfa;
1286 const re_match_context_t *mctx;
1287 const re_node_set *plog;
1288 int idx;
1289 re_node_set *state_buf;
1291 int i, j;
1292 for (i = 0; i < plog->nelem; ++i)
1294 int node_idx = plog->elems[i];
1295 re_token_type_t type = dfa->nodes[node_idx].type;
1296 if (type == OP_CONTEXT_NODE)
1297 type = dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type;
1299 if (type == OP_BACK_REF &&
1300 !re_node_set_contains (state_buf, node_idx))
1302 for (j = 0; j < mctx->nbkref_ents; ++j)
1304 struct re_backref_cache_entry *entry;
1305 entry = mctx->bkref_ents + j;
1306 if (entry->from == entry->to && entry->from == idx)
1307 break;
1309 if (j < mctx->nbkref_ents || idx == 0)
1311 reg_errcode_t err;
1312 err = re_node_set_add_intersect (state_buf, plog,
1313 dfa->inveclosures + node_idx);
1314 if (BE (err != REG_NOERROR, 0))
1315 return err;
1316 i = 0;
1320 return REG_NOERROR;
1323 /* Functions for state transition. */
1325 /* Return the next state to which the current state STATE will transit by
1326 accepting the current input byte, and update STATE_LOG if necessary.
1327 If STATE can accept a multibyte char/collating element/back reference
1328 update the destination of STATE_LOG. */
1330 static re_dfastate_t *
1331 transit_state (err, preg, mctx, state, fl_search)
1332 reg_errcode_t *err;
1333 const regex_t *preg;
1334 re_match_context_t *mctx;
1335 re_dfastate_t *state;
1336 int fl_search;
1338 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1339 re_dfastate_t **trtable, *next_state;
1340 unsigned char ch;
1342 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
1343 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
1344 && mctx->input->valid_len < mctx->input->len))
1346 *err = extend_buffers (mctx);
1347 if (BE (*err != REG_NOERROR, 0))
1348 return NULL;
1351 *err = REG_NOERROR;
1352 if (state == NULL)
1354 next_state = state;
1355 re_string_skip_bytes (mctx->input, 1);
1357 else
1359 #ifdef RE_ENABLE_I18N
1360 /* If the current state can accept multibyte. */
1361 if (state->accept_mb)
1363 *err = transit_state_mb (preg, state, mctx);
1364 if (BE (*err != REG_NOERROR, 0))
1365 return NULL;
1367 #endif /* RE_ENABLE_I18N */
1369 /* Then decide the next state with the single byte. */
1370 if (1)
1372 /* Use transition table */
1373 ch = re_string_fetch_byte (mctx->input);
1374 trtable = fl_search ? state->trtable_search : state->trtable;
1375 if (trtable == NULL)
1377 trtable = build_trtable (preg, state, fl_search);
1378 if (fl_search)
1379 state->trtable_search = trtable;
1380 else
1381 state->trtable = trtable;
1383 next_state = trtable[ch];
1385 else
1387 /* don't use transition table */
1388 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
1389 if (BE (next_state == NULL && err != REG_NOERROR, 0))
1390 return NULL;
1394 /* Update the state_log if we need. */
1395 if (mctx->state_log != NULL)
1397 int cur_idx = re_string_cur_idx (mctx->input);
1398 if (cur_idx > mctx->state_log_top)
1400 mctx->state_log[cur_idx] = next_state;
1401 mctx->state_log_top = cur_idx;
1403 else if (mctx->state_log[cur_idx] == 0)
1405 mctx->state_log[cur_idx] = next_state;
1407 else
1409 re_dfastate_t *pstate;
1410 unsigned int context;
1411 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
1412 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
1413 the destination of a multibyte char/collating element/
1414 back reference. Then the next state is the union set of
1415 these destinations and the results of the transition table. */
1416 pstate = mctx->state_log[cur_idx];
1417 log_nodes = pstate->entrance_nodes;
1418 if (next_state != NULL)
1420 table_nodes = next_state->entrance_nodes;
1421 *err = re_node_set_init_union (&next_nodes, table_nodes,
1422 log_nodes);
1423 if (BE (*err != REG_NOERROR, 0))
1424 return NULL;
1426 else
1427 next_nodes = *log_nodes;
1428 /* Note: We already add the nodes of the initial state,
1429 then we don't need to add them here. */
1431 context = re_string_context_at (mctx->input,
1432 re_string_cur_idx (mctx->input) - 1,
1433 mctx->eflags, preg->newline_anchor);
1434 next_state = mctx->state_log[cur_idx]
1435 = re_acquire_state_context (err, dfa, &next_nodes, context);
1436 /* We don't need to check errors here, since the return value of
1437 this function is next_state and ERR is already set. */
1439 if (table_nodes != NULL)
1440 re_node_set_free (&next_nodes);
1442 /* If the next state has back references. */
1443 if (next_state != NULL && next_state->has_backref)
1445 *err = transit_state_bkref (preg, next_state, mctx);
1446 if (BE (*err != REG_NOERROR, 0))
1447 return NULL;
1448 next_state = mctx->state_log[cur_idx];
1451 return next_state;
1454 /* Helper functions for transit_state. */
1456 /* Return the next state to which the current state STATE will transit by
1457 accepting the current input byte. */
1459 static re_dfastate_t *
1460 transit_state_sb (err, preg, state, fl_search, mctx)
1461 reg_errcode_t *err;
1462 const regex_t *preg;
1463 re_dfastate_t *state;
1464 int fl_search;
1465 re_match_context_t *mctx;
1467 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1468 re_node_set next_nodes;
1469 re_dfastate_t *next_state;
1470 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
1471 unsigned int context;
1473 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
1474 if (BE (*err != REG_NOERROR, 0))
1475 return NULL;
1476 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
1478 int cur_node = state->nodes.elems[node_cnt];
1479 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
1481 *err = re_node_set_merge (&next_nodes,
1482 dfa->eclosures + dfa->nexts[cur_node]);
1483 if (BE (*err != REG_NOERROR, 0))
1484 return NULL;
1487 if (fl_search)
1489 #ifdef RE_ENABLE_I18N
1490 int not_initial = 0;
1491 if (MB_CUR_MAX > 1)
1492 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
1493 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
1495 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
1496 break;
1498 if (!not_initial)
1499 #endif
1501 *err = re_node_set_merge (&next_nodes,
1502 dfa->init_state->entrance_nodes);
1503 if (BE (*err != REG_NOERROR, 0))
1504 return NULL;
1507 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
1508 preg->newline_anchor);
1509 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
1510 /* We don't need to check errors here, since the return value of
1511 this function is next_state and ERR is already set. */
1513 re_node_set_free (&next_nodes);
1514 re_string_skip_bytes (mctx->input, 1);
1515 return next_state;
1518 #ifdef RE_ENABLE_I18N
1519 static reg_errcode_t
1520 transit_state_mb (preg, pstate, mctx)
1521 const regex_t *preg;
1522 re_dfastate_t *pstate;
1523 re_match_context_t *mctx;
1525 reg_errcode_t err;
1526 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1527 int i;
1529 for (i = 0; i < pstate->nodes.nelem; ++i)
1531 re_node_set dest_nodes, *new_nodes;
1532 int cur_node_idx = pstate->nodes.elems[i];
1533 int naccepted = 0, dest_idx;
1534 unsigned int context;
1535 re_dfastate_t *dest_state;
1537 if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE)
1539 context = re_string_context_at (mctx->input,
1540 re_string_cur_idx (mctx->input),
1541 mctx->eflags, preg->newline_anchor);
1542 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
1543 context))
1544 continue;
1545 cur_node_idx = dfa->nodes[cur_node_idx].opr.ctx_info->entity;
1548 /* How many bytes the node can accepts? */
1549 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
1550 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
1551 re_string_cur_idx (mctx->input));
1552 if (naccepted == 0)
1553 continue;
1555 /* The node can accepts `naccepted' bytes. */
1556 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
1557 err = clean_state_log_if_need (mctx, dest_idx);
1558 if (BE (err != REG_NOERROR, 0))
1559 return err;
1560 #ifdef DEBUG
1561 assert (dfa->nexts[cur_node_idx] != -1);
1562 #endif
1563 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
1564 then we use pstate->nodes.elems[i] instead. */
1565 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
1567 dest_state = mctx->state_log[dest_idx];
1568 if (dest_state == NULL)
1569 dest_nodes = *new_nodes;
1570 else
1572 err = re_node_set_init_union (&dest_nodes,
1573 dest_state->entrance_nodes, new_nodes);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1577 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
1578 preg->newline_anchor);
1579 mctx->state_log[dest_idx]
1580 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1581 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
1582 return err;
1583 if (dest_state != NULL)
1584 re_node_set_free (&dest_nodes);
1586 return REG_NOERROR;
1588 #endif /* RE_ENABLE_I18N */
1590 static reg_errcode_t
1591 transit_state_bkref (preg, pstate, mctx)
1592 const regex_t *preg;
1593 re_dfastate_t *pstate;
1594 re_match_context_t *mctx;
1596 reg_errcode_t err;
1597 re_dfastate_t **work_state_log;
1599 work_state_log = re_malloc (re_dfastate_t *,
1600 re_string_cur_idx (mctx->input) + 1);
1601 if (BE (work_state_log == NULL, 0))
1602 return REG_ESPACE;
1604 err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx);
1605 re_free (work_state_log);
1606 return err;
1609 /* Caller must allocate `work_state_log'. */
1611 static reg_errcode_t
1612 transit_state_bkref_loop (preg, nodes, work_state_log, mctx)
1613 const regex_t *preg;
1614 re_node_set *nodes;
1615 re_dfastate_t **work_state_log;
1616 re_match_context_t *mctx;
1618 reg_errcode_t err;
1619 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1620 int i, j;
1621 re_dfastate_t **state_log_bak;
1622 regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1);
1623 int cur_str_idx = re_string_cur_idx (mctx->input);
1624 if (BE (cur_regs == NULL, 0))
1625 return REG_ESPACE;
1627 for (i = 0; i < nodes->nelem; ++i)
1629 char *buf;
1630 int dest_str_idx, subexp_idx, prev_nelem, subexp_len;
1631 int node_idx = nodes->elems[i];
1632 unsigned int context;
1633 re_token_t *node = dfa->nodes + node_idx;
1634 re_dfastate_t *dest_state;
1635 re_node_set *new_dest_nodes;
1637 /* Check whether `node' is a backreference or not. */
1638 if (node->type == OP_BACK_REF)
1639 subexp_idx = node->opr.idx;
1640 else if (node->type == OP_CONTEXT_NODE &&
1641 dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
1643 context = re_string_context_at (mctx->input, cur_str_idx,
1644 mctx->eflags, preg->newline_anchor);
1645 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
1646 continue;
1647 subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx;
1649 else
1650 continue;
1652 /* `node' is a backreference.
1653 At first, set registers to check the backreference. */
1654 cur_regs[0].rm_so = 0;
1655 cur_regs[0].rm_eo = cur_str_idx;
1656 memcpy (work_state_log, mctx->state_log,
1657 sizeof (re_dfastate_t *) * (cur_str_idx + 1));
1658 mctx->match_last = cur_str_idx;
1659 state_log_bak = mctx->state_log;
1660 mctx->state_log = work_state_log;
1661 sift_states_backward (preg, mctx, node_idx);
1662 if (!STATE_NODE_CONTAINS (work_state_log[0], dfa->init_node))
1663 continue;
1664 for (j = 1; j <= preg->re_nsub; ++j)
1665 cur_regs[j].rm_so = cur_regs[j].rm_eo = -1;
1666 set_regs (preg, mctx, subexp_idx + 1, cur_regs, node_idx);
1667 mctx->state_log = state_log_bak;
1669 /* Then check that the backreference can match the input string. */
1670 subexp_len = cur_regs[subexp_idx].rm_eo - cur_regs[subexp_idx].rm_so;
1671 if (subexp_len < 0 || cur_str_idx + subexp_len > mctx->input->len)
1672 continue;
1674 if (cur_str_idx + subexp_len > mctx->input->valid_len
1675 && mctx->input->valid_len < mctx->input->len)
1677 reg_errcode_t err;
1678 err = extend_buffers (mctx);
1679 if (BE (err != REG_NOERROR, 0))
1680 return err;
1682 buf = (char *) re_string_get_buffer (mctx->input);
1683 if (strncmp (buf + cur_regs[subexp_idx].rm_so, buf + cur_str_idx,
1684 subexp_len) != 0)
1685 continue;
1687 /* Successfully matched, add a new cache entry. */
1688 dest_str_idx = cur_str_idx + subexp_len;
1689 err = match_ctx_add_entry (mctx, node_idx, cur_str_idx, dest_str_idx);
1690 if (BE (err != REG_NOERROR, 0))
1691 return err;
1692 err = clean_state_log_if_need (mctx, dest_str_idx);
1693 if (BE (err != REG_NOERROR, 0))
1694 return err;
1696 /* And add the epsilon closures (which is `new_dest_nodes') of
1697 the backreference to appropriate state_log. */
1698 #ifdef DEBUG
1699 assert (dfa->nexts[node_idx] != -1);
1700 #endif
1701 if (node->type == OP_CONTEXT_NODE && subexp_len == 0)
1702 new_dest_nodes = dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure;
1703 else
1704 new_dest_nodes = dfa->eclosures + dfa->nexts[node_idx];
1705 context = (IS_WORD_CHAR (re_string_byte_at (mctx->input,
1706 dest_str_idx - 1))
1707 ? CONTEXT_WORD : 0);
1708 dest_state = mctx->state_log[dest_str_idx];
1710 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
1711 : mctx->state_log[cur_str_idx]->nodes.nelem);
1712 /* Add `new_dest_node' to state_log. */
1713 if (dest_state == NULL)
1715 mctx->state_log[dest_str_idx]
1716 = re_acquire_state_context (&err, dfa, new_dest_nodes, context);
1717 if (BE (mctx->state_log[dest_str_idx] == NULL
1718 && err != REG_NOERROR, 0))
1719 return err;
1721 else
1723 re_node_set dest_nodes;
1724 err = re_node_set_init_union (&dest_nodes, dest_state->entrance_nodes,
1725 new_dest_nodes);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728 mctx->state_log[dest_str_idx]
1729 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1730 if (BE (mctx->state_log[dest_str_idx] == NULL
1731 && err != REG_NOERROR, 0))
1732 return err;
1733 re_node_set_free (&dest_nodes);
1736 /* We need to check recursively if the backreference can epsilon
1737 transit. */
1738 if (subexp_len == 0
1739 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
1741 err = transit_state_bkref_loop (preg, new_dest_nodes, work_state_log,
1742 mctx);
1743 if (BE (err != REG_NOERROR, 0))
1744 return err;
1747 re_free (cur_regs);
1748 return REG_NOERROR;
1751 /* Build transition table for the state.
1752 Return the new table if succeeded, otherwise return NULL. */
1754 static re_dfastate_t **
1755 build_trtable (preg, state, fl_search)
1756 const regex_t *preg;
1757 const re_dfastate_t *state;
1758 int fl_search;
1760 reg_errcode_t err;
1761 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1762 int i, j, k, ch;
1763 int ndests; /* Number of the destination states from `state'. */
1764 re_dfastate_t **trtable, **dest_states, **dest_states_word, **dest_states_nl;
1765 re_node_set follows, *dests_node;
1766 bitset *dests_ch;
1767 bitset acceptable;
1769 /* We build DFA states which corresponds to the destination nodes
1770 from `state'. `dests_node[i]' represents the nodes which i-th
1771 destination state contains, and `dests_ch[i]' represents the
1772 characters which i-th destination state accepts. */
1773 dests_node = re_malloc (re_node_set, SBC_MAX);
1774 dests_ch = re_malloc (bitset, SBC_MAX);
1776 /* Initialize transiton table. */
1777 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
1778 if (BE (dests_node == NULL || dests_ch == NULL || trtable == NULL, 0))
1779 return NULL;
1781 /* At first, group all nodes belonging to `state' into several
1782 destinations. */
1783 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
1784 if (BE (ndests <= 0, 0))
1786 re_free (dests_node);
1787 re_free (dests_ch);
1788 /* Return NULL in case of an error, trtable otherwise. */
1789 return (ndests < 0) ? NULL : trtable;
1792 dest_states = re_malloc (re_dfastate_t *, ndests);
1793 dest_states_word = re_malloc (re_dfastate_t *, ndests);
1794 dest_states_nl = re_malloc (re_dfastate_t *, ndests);
1795 bitset_empty (acceptable);
1797 err = re_node_set_alloc (&follows, ndests + 1);
1798 if (BE (dest_states == NULL || dest_states_word == NULL
1799 || dest_states_nl == NULL || err != REG_NOERROR, 0))
1800 return NULL;
1802 /* Then build the states for all destinations. */
1803 for (i = 0; i < ndests; ++i)
1805 int next_node;
1806 re_node_set_empty (&follows);
1807 /* Merge the follows of this destination states. */
1808 for (j = 0; j < dests_node[i].nelem; ++j)
1810 next_node = dfa->nexts[dests_node[i].elems[j]];
1811 if (next_node != -1)
1813 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
1814 if (BE (err != REG_NOERROR, 0))
1815 return NULL;
1818 /* If search flag is set, merge the initial state. */
1819 if (fl_search)
1821 #ifdef RE_ENABLE_I18N
1822 int not_initial = 0;
1823 for (j = 0; j < follows.nelem; ++j)
1824 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
1826 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
1827 break;
1829 if (!not_initial)
1830 #endif
1832 err = re_node_set_merge (&follows,
1833 dfa->init_state->entrance_nodes);
1834 if (BE (err != REG_NOERROR, 0))
1835 return NULL;
1838 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
1839 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
1840 return NULL;
1841 /* If the new state has context constraint,
1842 build appropriate states for these contexts. */
1843 if (dest_states[i]->has_constraint)
1845 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
1846 CONTEXT_WORD);
1847 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
1848 return NULL;
1849 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
1850 CONTEXT_NEWLINE);
1851 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
1852 return NULL;
1854 else
1856 dest_states_word[i] = dest_states[i];
1857 dest_states_nl[i] = dest_states[i];
1859 bitset_merge (acceptable, dests_ch[i]);
1862 /* Update the transition table. */
1863 /* For all characters ch...: */
1864 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
1865 for (j = 0; j < UINT_BITS; ++j, ++ch)
1866 if ((acceptable[i] >> j) & 1)
1868 /* The current state accepts the character ch. */
1869 if (IS_WORD_CHAR (ch))
1871 for (k = 0; k < ndests; ++k)
1872 if ((dests_ch[k][i] >> j) & 1)
1874 /* k-th destination accepts the word character ch. */
1875 trtable[ch] = dest_states_word[k];
1876 /* There must be only one destination which accepts
1877 character ch. See group_nodes_into_DFAstates. */
1878 break;
1881 else /* not WORD_CHAR */
1883 for (k = 0; k < ndests; ++k)
1884 if ((dests_ch[k][i] >> j) & 1)
1886 /* k-th destination accepts the non-word character ch. */
1887 trtable[ch] = dest_states[k];
1888 /* There must be only one destination which accepts
1889 character ch. See group_nodes_into_DFAstates. */
1890 break;
1894 /* new line */
1895 if (bitset_contain (acceptable, NEWLINE_CHAR))
1897 /* The current state accepts newline character. */
1898 for (k = 0; k < ndests; ++k)
1899 if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
1901 /* k-th destination accepts newline character. */
1902 trtable[NEWLINE_CHAR] = dest_states_nl[k];
1903 /* There must be only one destination which accepts
1904 newline. See group_nodes_into_DFAstates. */
1905 break;
1909 re_free (dest_states_nl);
1910 re_free (dest_states_word);
1911 re_free (dest_states);
1913 re_node_set_free (&follows);
1914 for (i = 0; i < ndests; ++i)
1915 re_node_set_free (dests_node + i);
1917 re_free (dests_ch);
1918 re_free (dests_node);
1920 return trtable;
1923 /* Group all nodes belonging to STATE into several destinations.
1924 Then for all destinations, set the nodes belonging to the destination
1925 to DESTS_NODE[i] and set the characters accepted by the destination
1926 to DEST_CH[i]. This function return the number of destinations. */
1928 static int
1929 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
1930 const regex_t *preg;
1931 const re_dfastate_t *state;
1932 re_node_set *dests_node;
1933 bitset *dests_ch;
1935 reg_errcode_t err;
1936 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1937 int i, j, k;
1938 int ndests; /* Number of the destinations from `state'. */
1939 bitset accepts; /* Characters a node can accept. */
1940 const re_node_set *cur_nodes = &state->nodes;
1941 bitset_empty (accepts);
1942 ndests = 0;
1944 /* For all the nodes belonging to `state', */
1945 for (i = 0; i < cur_nodes->nelem; ++i)
1947 unsigned int constraint = 0;
1948 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
1949 re_token_type_t type = node->type;
1951 if (type == OP_CONTEXT_NODE)
1953 constraint = node->constraint;
1954 node = dfa->nodes + node->opr.ctx_info->entity;
1955 type = node->type;
1958 /* Enumerate all single byte character this node can accept. */
1959 if (type == CHARACTER)
1960 bitset_set (accepts, node->opr.c);
1961 else if (type == SIMPLE_BRACKET)
1963 bitset_merge (accepts, node->opr.sbcset);
1965 else if (type == OP_PERIOD)
1967 bitset_set_all (accepts);
1968 if (!(preg->syntax & RE_DOT_NEWLINE))
1969 bitset_clear (accepts, '\n');
1970 if (preg->syntax & RE_DOT_NOT_NULL)
1971 bitset_clear (accepts, '\0');
1973 else
1974 continue;
1976 /* Check the `accepts' and sift the characters which are not
1977 match it the context. */
1978 if (constraint)
1980 if (constraint & NEXT_WORD_CONSTRAINT)
1981 for (j = 0; j < BITSET_UINTS; ++j)
1982 accepts[j] &= dfa->word_char[j];
1983 else if (constraint & NEXT_NOTWORD_CONSTRAINT)
1984 for (j = 0; j < BITSET_UINTS; ++j)
1985 accepts[j] &= ~dfa->word_char[j];
1986 else if (constraint & NEXT_NEWLINE_CONSTRAINT)
1988 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
1989 bitset_empty (accepts);
1990 if (accepts_newline)
1991 bitset_set (accepts, NEWLINE_CHAR);
1992 else
1993 continue;
1997 /* Then divide `accepts' into DFA states, or create a new
1998 state. */
1999 for (j = 0; j < ndests; ++j)
2001 bitset intersec; /* Intersection sets, see below. */
2002 bitset remains;
2003 /* Flags, see below. */
2004 int has_intersec, not_subset, not_consumed;
2006 /* Optimization, skip if this state doesn't accept the character. */
2007 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
2008 continue;
2010 /* Enumerate the intersection set of this state and `accepts'. */
2011 has_intersec = 0;
2012 for (k = 0; k < BITSET_UINTS; ++k)
2013 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
2014 /* And skip if the intersection set is empty. */
2015 if (!has_intersec)
2016 continue;
2018 /* Then check if this state is a subset of `accepts'. */
2019 not_subset = not_consumed = 0;
2020 for (k = 0; k < BITSET_UINTS; ++k)
2022 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
2023 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
2026 /* If this state isn't a subset of `accepts', create a
2027 new group state, which has the `remains'. */
2028 if (not_subset)
2030 bitset_copy (dests_ch[ndests], remains);
2031 bitset_copy (dests_ch[j], intersec);
2032 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
2033 if (BE (err != REG_NOERROR, 0))
2034 return -1;
2035 ++ndests;
2038 /* Put the position in the current group. */
2039 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
2040 if (BE (err < 0, 0))
2041 return -1;
2043 /* If all characters are consumed, go to next node. */
2044 if (!not_consumed)
2045 break;
2047 /* Some characters remain, create a new group. */
2048 if (j == ndests)
2050 bitset_copy (dests_ch[ndests], accepts);
2051 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
2052 if (BE (err != REG_NOERROR, 0))
2053 return -1;
2054 ++ndests;
2055 bitset_empty (accepts);
2058 return ndests;
2061 #ifdef RE_ENABLE_I18N
2062 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2063 Return the number of the bytes the node accepts.
2064 STR_IDX is the current index of the input string.
2066 This function handles the nodes which can accept one character, or
2067 one collating element like '.', '[a-z]', opposite to the other nodes
2068 can only accept one byte. */
2070 static int
2071 check_node_accept_bytes (preg, node_idx, input, str_idx)
2072 const regex_t *preg;
2073 int node_idx, str_idx;
2074 const re_string_t *input;
2076 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2077 const re_token_t *node = dfa->nodes + node_idx;
2078 int elem_len = re_string_elem_size_at (input, str_idx);
2079 int char_len = re_string_char_size_at (input, str_idx);
2080 int i;
2081 # ifdef _LIBC
2082 int j;
2083 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2084 # endif /* _LIBC */
2085 if (elem_len <= 1 && char_len <= 1)
2086 return 0;
2087 if (node->type == OP_PERIOD)
2089 /* '.' accepts any one character except the following two cases. */
2090 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
2091 re_string_byte_at (input, str_idx) == '\n') ||
2092 ((preg->syntax & RE_DOT_NOT_NULL) &&
2093 re_string_byte_at (input, str_idx) == '\0'))
2094 return 0;
2095 return char_len;
2097 else if (node->type == COMPLEX_BRACKET)
2099 const re_charset_t *cset = node->opr.mbcset;
2100 # ifdef _LIBC
2101 const unsigned char *pin = re_string_get_buffer (input) + str_idx;
2102 # endif /* _LIBC */
2103 int match_len = 0;
2104 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
2105 ? re_string_wchar_at (input, str_idx) : 0);
2107 /* match with multibyte character? */
2108 for (i = 0; i < cset->nmbchars; ++i)
2109 if (wc == cset->mbchars[i])
2111 match_len = char_len;
2112 goto check_node_accept_bytes_match;
2114 /* match with character_class? */
2115 for (i = 0; i < cset->nchar_classes; ++i)
2117 wctype_t wt = cset->char_classes[i];
2118 if (__iswctype (wc, wt))
2120 match_len = char_len;
2121 goto check_node_accept_bytes_match;
2125 # ifdef _LIBC
2126 if (nrules != 0)
2128 unsigned int in_collseq = 0;
2129 const int32_t *table, *indirect;
2130 const unsigned char *weights, *extra;
2131 const char *collseqwc;
2132 int32_t idx;
2133 /* This #include defines a local function! */
2134 # include <locale/weight.h>
2136 /* match with collating_symbol? */
2137 if (cset->ncoll_syms)
2138 extra = (const unsigned char *)
2139 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2140 for (i = 0; i < cset->ncoll_syms; ++i)
2142 const unsigned char *coll_sym = extra + cset->coll_syms[i];
2143 /* Compare the length of input collating element and
2144 the length of current collating element. */
2145 if (*coll_sym != elem_len)
2146 continue;
2147 /* Compare each bytes. */
2148 for (j = 0; j < *coll_sym; j++)
2149 if (pin[j] != coll_sym[1 + j])
2150 break;
2151 if (j == *coll_sym)
2153 /* Match if every bytes is equal. */
2154 match_len = j;
2155 goto check_node_accept_bytes_match;
2159 if (cset->nranges)
2161 if (elem_len <= char_len)
2163 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2164 in_collseq = collseq_table_lookup (collseqwc, wc);
2166 else
2167 in_collseq = find_collation_sequence_value (pin, elem_len);
2169 /* match with range expression? */
2170 for (i = 0; i < cset->nranges; ++i)
2171 if (cset->range_starts[i] <= in_collseq
2172 && in_collseq <= cset->range_ends[i])
2174 match_len = elem_len;
2175 goto check_node_accept_bytes_match;
2178 /* match with equivalence_class? */
2179 if (cset->nequiv_classes)
2181 const unsigned char *cp = pin;
2182 table = (const int32_t *)
2183 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2184 weights = (const unsigned char *)
2185 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
2186 extra = (const unsigned char *)
2187 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
2188 indirect = (const int32_t *)
2189 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
2190 idx = findidx (&cp);
2191 if (idx > 0)
2192 for (i = 0; i < cset->nequiv_classes; ++i)
2194 int32_t equiv_class_idx = cset->equiv_classes[i];
2195 size_t weight_len = weights[idx];
2196 if (weight_len == weights[equiv_class_idx])
2198 int cnt = 0;
2199 while (cnt <= weight_len
2200 && (weights[equiv_class_idx + 1 + cnt]
2201 == weights[idx + 1 + cnt]))
2202 ++cnt;
2203 if (cnt > weight_len)
2205 match_len = elem_len;
2206 goto check_node_accept_bytes_match;
2212 else
2213 # endif /* _LIBC */
2215 /* match with range expression? */
2216 #if __GNUC__ >= 2
2217 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
2218 #else
2219 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2220 cmp_buf[2] = wc;
2221 #endif
2222 for (i = 0; i < cset->nranges; ++i)
2224 cmp_buf[0] = cset->range_starts[i];
2225 cmp_buf[4] = cset->range_ends[i];
2226 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2227 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2229 match_len = char_len;
2230 goto check_node_accept_bytes_match;
2234 check_node_accept_bytes_match:
2235 if (!cset->non_match)
2236 return match_len;
2237 else
2239 if (match_len > 0)
2240 return 0;
2241 else
2242 return (elem_len > char_len) ? elem_len : char_len;
2245 return 0;
2248 # ifdef _LIBC
2249 static unsigned int
2250 find_collation_sequence_value (mbs, mbs_len)
2251 const unsigned char *mbs;
2252 size_t mbs_len;
2254 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2255 if (nrules == 0)
2257 if (mbs_len == 1)
2259 /* No valid character. Match it as a single byte character. */
2260 const unsigned char *collseq = (const unsigned char *)
2261 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2262 return collseq[mbs[0]];
2264 return UINT_MAX;
2266 else
2268 int32_t idx;
2269 const unsigned char *extra = (const unsigned char *)
2270 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2272 for (idx = 0; ;)
2274 int mbs_cnt, found = 0;
2275 int32_t elem_mbs_len;
2276 /* Skip the name of collating element name. */
2277 idx = idx + extra[idx] + 1;
2278 elem_mbs_len = extra[idx++];
2279 if (mbs_len == elem_mbs_len)
2281 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
2282 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
2283 break;
2284 if (mbs_cnt == elem_mbs_len)
2285 /* Found the entry. */
2286 found = 1;
2288 /* Skip the byte sequence of the collating element. */
2289 idx += elem_mbs_len;
2290 /* Adjust for the alignment. */
2291 idx = (idx + 3) & ~3;
2292 /* Skip the collation sequence value. */
2293 idx += sizeof (uint32_t);
2294 /* Skip the wide char sequence of the collating element. */
2295 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
2296 /* If we found the entry, return the sequence value. */
2297 if (found)
2298 return *(uint32_t *) (extra + idx);
2299 /* Skip the collation sequence value. */
2300 idx += sizeof (uint32_t);
2304 # endif /* _LIBC */
2305 #endif /* RE_ENABLE_I18N */
2307 /* Check whether the node accepts the byte which is IDX-th
2308 byte of the INPUT. */
2310 static int
2311 check_node_accept (preg, node, mctx, idx)
2312 const regex_t *preg;
2313 const re_token_t *node;
2314 const re_match_context_t *mctx;
2315 int idx;
2317 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2318 const re_token_t *cur_node;
2319 unsigned char ch;
2320 if (node->type == OP_CONTEXT_NODE)
2322 /* The node has constraints. Check whether the current context
2323 satisfies the constraints. */
2324 unsigned int context = re_string_context_at (mctx->input, idx,
2325 mctx->eflags,
2326 preg->newline_anchor);
2327 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2328 return 0;
2329 cur_node = dfa->nodes + node->opr.ctx_info->entity;
2331 else
2332 cur_node = node;
2334 ch = re_string_byte_at (mctx->input, idx);
2335 if (cur_node->type == CHARACTER)
2336 return cur_node->opr.c == ch;
2337 else if (cur_node->type == SIMPLE_BRACKET)
2338 return bitset_contain (cur_node->opr.sbcset, ch);
2339 else if (cur_node->type == OP_PERIOD)
2340 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
2341 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
2342 else
2343 return 0;
2346 /* Extend the buffers, if the buffers have run out. */
2348 static reg_errcode_t
2349 extend_buffers (mctx)
2350 re_match_context_t *mctx;
2352 reg_errcode_t ret;
2353 re_string_t *pstr = mctx->input;
2355 /* Double the lengthes of the buffers. */
2356 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
2357 if (BE (ret != REG_NOERROR, 0))
2358 return ret;
2360 if (mctx->state_log != NULL)
2362 /* And double the length of state_log. */
2363 mctx->state_log = re_realloc (mctx->state_log, re_dfastate_t *,
2364 pstr->bufs_len * 2);
2365 if (BE (mctx->state_log == NULL, 0))
2366 return REG_ESPACE;
2369 /* Then reconstruct the buffers. */
2370 if (pstr->icase)
2372 #ifdef RE_ENABLE_I18N
2373 if (MB_CUR_MAX > 1)
2374 build_wcs_upper_buffer (pstr);
2375 else
2376 #endif /* RE_ENABLE_I18N */
2377 build_upper_buffer (pstr);
2379 else
2381 #ifdef RE_ENABLE_I18N
2382 if (MB_CUR_MAX > 1)
2383 build_wcs_buffer (pstr);
2384 else
2385 #endif /* RE_ENABLE_I18N */
2387 if (pstr->trans != NULL)
2388 re_string_translate_buffer (pstr);
2389 else
2390 pstr->valid_len = pstr->bufs_len;
2393 return REG_NOERROR;
2397 /* Functions for matching context. */
2399 static reg_errcode_t
2400 match_ctx_init (mctx, eflags, input, n)
2401 re_match_context_t *mctx;
2402 int eflags, n;
2403 re_string_t *input;
2405 mctx->eflags = eflags;
2406 mctx->input = input;
2407 mctx->match_last = -1;
2408 if (n > 0)
2410 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
2411 if (BE (mctx->bkref_ents == NULL, 0))
2412 return REG_ESPACE;
2414 else
2415 mctx->bkref_ents = NULL;
2416 mctx->nbkref_ents = 0;
2417 mctx->abkref_ents = n;
2418 mctx->max_bkref_len = 0;
2419 return REG_NOERROR;
2422 static void
2423 match_ctx_free (mctx)
2424 re_match_context_t *mctx;
2426 re_free (mctx->bkref_ents);
2429 /* Add a new backreference entry to the cache. */
2431 static reg_errcode_t
2432 match_ctx_add_entry (mctx, node, from, to)
2433 re_match_context_t *mctx;
2434 int node, from, to;
2436 if (mctx->nbkref_ents >= mctx->abkref_ents)
2438 mctx->bkref_ents = re_realloc (mctx->bkref_ents,
2439 struct re_backref_cache_entry,
2440 mctx->abkref_ents * 2);
2441 if (BE (mctx->bkref_ents == NULL, 0))
2442 return REG_ESPACE;
2443 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
2444 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
2445 mctx->abkref_ents *= 2;
2447 mctx->bkref_ents[mctx->nbkref_ents].node = node;
2448 mctx->bkref_ents[mctx->nbkref_ents].from = from;
2449 mctx->bkref_ents[mctx->nbkref_ents++].to = to;
2450 if (mctx->max_bkref_len < to - from)
2451 mctx->max_bkref_len = to - from;
2452 return REG_NOERROR;