Include <sys/types.h> instead of <stddef.h> since ssize_t is needed as well. Replace...
[glibc.git] / posix / regexec.c
blob74e3ab5cd21fd28dfcd3c231f7df8a7e6011805d
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <wchar.h>
27 #include <wctype.h>
29 #ifdef _LIBC
30 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
31 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
32 # include <locale/localeinfo.h>
33 # include <locale/elem-hash.h>
34 # include <locale/coll-lookup.h>
35 # endif
36 #endif
38 #include "regex.h"
39 #include "regex_internal.h"
41 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
42 re_string_t *input, int n);
43 static void match_ctx_free (re_match_context_t *cache);
44 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
45 int from, int to);
46 static reg_errcode_t re_search_internal (const regex_t *preg,
47 const char *string, int length,
48 int start, int range, int stop,
49 size_t nmatch, regmatch_t pmatch[],
50 int eflags);
51 static int re_search_2_stub (struct re_pattern_buffer *bufp,
52 const char *string1, int length1,
53 const char *string2, int length2,
54 int start, int range, struct re_registers *regs,
55 int stop, int ret_len);
56 static int re_search_stub (struct re_pattern_buffer *bufp,
57 const char *string, int length, int start,
58 int range, int stop, struct re_registers *regs,
59 int ret_len);
60 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
61 int nregs, int regs_allocated);
62 static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
63 const regex_t *preg,
64 const re_match_context_t *mctx,
65 int idx);
66 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
67 int fl_search, int fl_longest_match);
68 static int check_halt_node_context (const re_dfa_t *dfa, int node,
69 unsigned int context);
70 static int check_halt_state_context (const regex_t *preg,
71 const re_dfastate_t *state,
72 const re_match_context_t *mctx, int idx);
73 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
74 int cur_idx, int nmatch);
75 static int proceed_next_node (const regex_t *preg,
76 const re_match_context_t *mctx,
77 int *pidx, int node, re_node_set *eps_via_nodes);
78 static reg_errcode_t set_regs (const regex_t *preg,
79 const re_match_context_t *mctx,
80 size_t nmatch, regmatch_t *pmatch, int last);
81 #ifdef RE_ENABLE_I18N
82 static int sift_states_iter_mb (const regex_t *preg,
83 const re_match_context_t *mctx,
84 int node_idx, int str_idx, int max_str_idx);
85 #endif /* RE_ENABLE_I18N */
86 static int sift_states_iter_bkref (const re_dfa_t *dfa,
87 re_dfastate_t **state_log,
88 struct re_backref_cache_entry *mctx_entry,
89 int node_idx, int idx, int match_last);
90 static reg_errcode_t sift_states_backward (const regex_t *preg,
91 const re_match_context_t *mctx,
92 int last_node);
93 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
94 int next_state_log_idx);
95 static reg_errcode_t add_epsilon_backreference (const re_dfa_t *dfa,
96 const re_match_context_t *mctx,
97 const re_node_set *plog,
98 int idx,
99 re_node_set *state_buf);
100 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
101 re_match_context_t *mctx,
102 re_dfastate_t *state, int fl_search);
103 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
104 re_dfastate_t *pstate,
105 int fl_search,
106 re_match_context_t *mctx);
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t transit_state_mb (const regex_t *preg,
109 re_dfastate_t *pstate,
110 re_match_context_t *mctx);
111 #endif /* RE_ENABLE_I18N */
112 static reg_errcode_t transit_state_bkref (const regex_t *preg,
113 re_dfastate_t *pstate,
114 re_match_context_t *mctx);
115 static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
116 re_node_set *nodes,
117 re_dfastate_t **work_state_log,
118 re_match_context_t *mctx);
119 static re_dfastate_t **build_trtable (const regex_t *dfa,
120 const re_dfastate_t *state,
121 int fl_search);
122 #ifdef RE_ENABLE_I18N
123 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
124 const re_string_t *input, int idx);
125 # ifdef _LIBC
126 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
127 size_t name_len);
128 # endif /* _LIBC */
129 #endif /* RE_ENABLE_I18N */
130 static int group_nodes_into_DFAstates (const regex_t *dfa,
131 const re_dfastate_t *state,
132 re_node_set *states_node,
133 bitset *states_ch);
134 static int check_node_accept (const regex_t *preg, const re_token_t *node,
135 const re_match_context_t *mctx, int idx);
136 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
138 /* Entry point for POSIX code. */
140 /* regexec searches for a given pattern, specified by PREG, in the
141 string STRING.
143 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
144 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
145 least NMATCH elements, and we set them to the offsets of the
146 corresponding matched substrings.
148 EFLAGS specifies `execution flags' which affect matching: if
149 REG_NOTBOL is set, then ^ does not match at the beginning of the
150 string; if REG_NOTEOL is set, then $ does not match at the end.
152 We return 0 if we find a match and REG_NOMATCH if not. */
155 regexec (preg, string, nmatch, pmatch, eflags)
156 const regex_t *preg;
157 const char *string;
158 size_t nmatch;
159 regmatch_t pmatch[];
160 int eflags;
162 reg_errcode_t err;
163 int length = strlen (string);
164 if (preg->no_sub)
165 err = re_search_internal (preg, string, length, 0, length, length, 0,
166 NULL, eflags);
167 else
168 err = re_search_internal (preg, string, length, 0, length, length, nmatch,
169 pmatch, eflags);
170 return err != REG_NOERROR;
172 #ifdef _LIBC
173 weak_alias (__regexec, regexec)
174 #endif
176 /* Entry points for GNU code. */
178 /* re_match, re_search, re_match_2, re_search_2
180 The former two functions operate on STRING with length LENGTH,
181 while the later two operate on concatenation of STRING1 and STRING2
182 with lengths LENGTH1 and LENGTH2, respectively.
184 re_match() matches the compiled pattern in BUFP against the string,
185 starting at index START.
187 re_search() first tries matching at index START, then it tries to match
188 starting from index START + 1, and so on. The last start position tried
189 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
190 way as re_match().)
192 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
193 the first STOP characters of the concatenation of the strings should be
194 concerned.
196 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
197 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
198 computed relative to the concatenation, not relative to the individual
199 strings.)
201 On success, re_match* functions return the length of the match, re_search*
202 return the position of the start of the match. Return value -1 means no
203 match was found and -2 indicates an internal error. */
206 re_match (bufp, string, length, start, regs)
207 struct re_pattern_buffer *bufp;
208 const char *string;
209 int length, start;
210 struct re_registers *regs;
212 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
214 #ifdef _LIBC
215 weak_alias (__re_match, re_match)
216 #endif
219 re_search (bufp, string, length, start, range, regs)
220 struct re_pattern_buffer *bufp;
221 const char *string;
222 int length, start, range;
223 struct re_registers *regs;
225 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
227 #ifdef _LIBC
228 weak_alias (__re_search, re_search)
229 #endif
232 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
233 struct re_pattern_buffer *bufp;
234 const char *string1, *string2;
235 int length1, length2, start, stop;
236 struct re_registers *regs;
238 return re_search_2_stub (bufp, string1, length1, string2, length2,
239 start, 0, regs, stop, 1);
241 #ifdef _LIBC
242 weak_alias (__re_match_2, re_match_2)
243 #endif
246 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
247 struct re_pattern_buffer *bufp;
248 const char *string1, *string2;
249 int length1, length2, start, range, stop;
250 struct re_registers *regs;
252 return re_search_2_stub (bufp, string1, length1, string2, length2,
253 start, range, regs, stop, 0);
255 #ifdef _LIBC
256 weak_alias (__re_search_2, re_search_2)
257 #endif
259 static int
260 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
261 stop, ret_len)
262 struct re_pattern_buffer *bufp;
263 const char *string1, *string2;
264 int length1, length2, start, range, stop, ret_len;
265 struct re_registers *regs;
267 const char *str;
268 int rval;
269 int len = length1 + length2;
270 int free_str = 0;
272 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
273 return -2;
275 /* Concatenate the strings. */
276 if (length2 > 0)
277 if (length1 > 0)
279 char *s = re_malloc (char, len);
281 if (BE (s == NULL, 0))
282 return -2;
283 memcpy (s, string1, length1);
284 memcpy (s + length1, string2, length2);
285 str = s;
286 free_str = 1;
288 else
289 str = string2;
290 else
291 str = string1;
293 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
294 ret_len);
295 if (free_str)
296 re_free ((char *) str);
297 return rval;
300 /* The parameters have the same meaning as those of re_search.
301 Additional parameters:
302 If RET_LEN is nonzero the length of the match is returned (re_match style);
303 otherwise the position of the match is returned. */
305 static int
306 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
307 struct re_pattern_buffer *bufp;
308 const char *string;
309 int length, start, range, stop, ret_len;
310 struct re_registers *regs;
312 reg_errcode_t result;
313 regmatch_t *pmatch;
314 int nregs, rval;
315 int eflags = 0;
317 /* Check for out-of-range. */
318 if (BE (start < 0 || start > length || range < 0, 0))
319 return -1;
320 if (BE (start + range > length, 0))
321 range = length - start;
323 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
324 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
326 /* Compile fastmap if we haven't yet. */
327 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
328 re_compile_fastmap (bufp);
330 if (BE (bufp->no_sub, 0))
331 regs = NULL;
333 /* We need at least 1 register. */
334 if (regs == NULL)
335 nregs = 1;
336 else if (BE (bufp->regs_allocated == REGS_FIXED &&
337 regs->num_regs < bufp->re_nsub + 1, 0))
339 nregs = regs->num_regs;
340 if (BE (nregs < 1, 0))
342 /* Nothing can be copied to regs. */
343 regs = NULL;
344 nregs = 1;
347 else
348 nregs = bufp->re_nsub + 1;
349 pmatch = re_malloc (regmatch_t, nregs);
350 if (BE (pmatch == NULL, 0))
351 return -2;
353 result = re_search_internal (bufp, string, length, start, range, stop,
354 nregs, pmatch, eflags);
356 rval = 0;
358 /* I hope we needn't fill ther regs with -1's when no match was found. */
359 if (result != REG_NOERROR)
360 rval = -1;
361 else if (regs != NULL)
363 /* If caller wants register contents data back, copy them. */
364 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
365 bufp->regs_allocated);
366 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
367 rval = -2;
370 if (BE (rval == 0, 1))
372 if (ret_len)
374 assert (pmatch[0].rm_so == 0);
375 rval = pmatch[0].rm_eo;
377 else
378 rval = pmatch[0].rm_so;
380 re_free (pmatch);
381 return rval;
384 static unsigned
385 re_copy_regs (regs, pmatch, nregs, regs_allocated)
386 struct re_registers *regs;
387 regmatch_t *pmatch;
388 int nregs, regs_allocated;
390 int rval = REGS_REALLOCATE;
391 int i;
392 int need_regs = nregs + 1;
393 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
394 uses. */
396 /* Have the register data arrays been allocated? */
397 if (regs_allocated == REGS_UNALLOCATED)
398 { /* No. So allocate them with malloc. */
399 regs->start = re_malloc (regoff_t, need_regs);
400 if (BE (regs->start == NULL, 0))
401 return REGS_UNALLOCATED;
402 regs->end = re_malloc (regoff_t, need_regs);
403 if (BE (regs->end == NULL, 0))
405 re_free (regs->start);
406 return REGS_UNALLOCATED;
408 regs->num_regs = need_regs;
410 else if (regs_allocated == REGS_REALLOCATE)
411 { /* Yes. If we need more elements than were already
412 allocated, reallocate them. If we need fewer, just
413 leave it alone. */
414 if (need_regs > regs->num_regs)
416 regs->start = re_realloc (regs->start, regoff_t, need_regs);
417 if (BE (regs->start == NULL, 0))
419 if (regs->end != NULL)
420 re_free (regs->end);
421 return REGS_UNALLOCATED;
423 regs->end = re_realloc (regs->end, regoff_t, need_regs);
424 if (BE (regs->end == NULL, 0))
426 re_free (regs->start);
427 return REGS_UNALLOCATED;
429 regs->num_regs = need_regs;
432 else
434 assert (regs_allocated == REGS_FIXED);
435 /* This function may not be called with REGS_FIXED and nregs too big. */
436 assert (regs->num_regs >= nregs);
437 rval = REGS_FIXED;
440 /* Copy the regs. */
441 for (i = 0; i < nregs; ++i)
443 regs->start[i] = pmatch[i].rm_so;
444 regs->end[i] = pmatch[i].rm_eo;
446 for ( ; i < regs->num_regs; ++i)
447 regs->start[i] = regs->end[i] = -1;
449 return rval;
452 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
453 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
454 this memory for recording register information. STARTS and ENDS
455 must be allocated using the malloc library routine, and must each
456 be at least NUM_REGS * sizeof (regoff_t) bytes long.
458 If NUM_REGS == 0, then subsequent matches should allocate their own
459 register data.
461 Unless this function is called, the first search or match using
462 PATTERN_BUFFER will allocate its own register data, without
463 freeing the old data. */
465 void
466 re_set_registers (bufp, regs, num_regs, starts, ends)
467 struct re_pattern_buffer *bufp;
468 struct re_registers *regs;
469 unsigned num_regs;
470 regoff_t *starts, *ends;
472 if (num_regs)
474 bufp->regs_allocated = REGS_REALLOCATE;
475 regs->num_regs = num_regs;
476 regs->start = starts;
477 regs->end = ends;
479 else
481 bufp->regs_allocated = REGS_UNALLOCATED;
482 regs->num_regs = 0;
483 regs->start = regs->end = (regoff_t *) 0;
486 #ifdef _LIBC
487 weak_alias (__re_set_registers, re_set_registers)
488 #endif
490 /* Entry points compatible with 4.2 BSD regex library. We don't define
491 them unless specifically requested. */
493 #if defined _REGEX_RE_COMP || defined _LIBC
495 # ifdef _LIBC
496 weak_function
497 # endif
498 re_exec (s)
499 const char *s;
501 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
503 #endif /* _REGEX_RE_COMP */
505 static re_node_set empty_set;
507 /* Internal entry point. */
509 /* Searches for a compiled pattern PREG in the string STRING, whose
510 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
511 mingings with regexec. START, and RANGE have the same meanings
512 with re_search.
513 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
514 otherwise return the error code.
515 Note: We assume front end functions already check ranges.
516 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
518 static reg_errcode_t
519 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
520 eflags)
521 const regex_t *preg;
522 const char *string;
523 int length, start, range, stop, eflags;
524 size_t nmatch;
525 regmatch_t pmatch[];
527 reg_errcode_t err;
528 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
529 re_string_t input;
530 int left_lim, right_lim, incr;
531 int fl_longest_match, match_first, match_last = -1;
532 re_match_context_t mctx;
533 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate)
534 ? preg->fastmap : NULL);
536 /* Check if the DFA haven't been compiled. */
537 if (BE (preg->used == 0 || dfa->init_state == NULL
538 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
539 || dfa->init_state_begbuf == NULL, 0))
540 return REG_NOMATCH;
542 re_node_set_init_empty (&empty_set);
544 /* We must check the longest matching, if nmatch > 0. */
545 fl_longest_match = (nmatch != 0);
547 err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
548 preg->translate, preg->syntax & RE_ICASE);
549 if (BE (err != REG_NOERROR, 0))
550 return err;
551 input.stop = stop;
553 err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
554 if (BE (err != REG_NOERROR, 0))
555 return err;
557 /* We will log all the DFA states through which the dfa pass,
558 if nmatch > 1, or this dfa has "multibyte node", which is a
559 back-reference or a node which can accept multibyte character or
560 multi character collating element. */
561 if (nmatch > 1 || dfa->has_mb_node)
563 mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
564 if (BE (mctx.state_log == NULL, 0))
565 return REG_ESPACE;
567 else
568 mctx.state_log = NULL;
570 #ifdef DEBUG
571 /* We assume front-end functions already check them. */
572 assert (start + range >= 0 && start + range <= length);
573 #endif
575 match_first = start;
576 input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
577 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579 /* Check incrementally whether of not the input string match. */
580 incr = (range < 0) ? -1 : 1;
581 left_lim = (range < 0) ? start + range : start;
582 right_lim = (range < 0) ? start : start + range;
584 for (;;)
586 /* At first get the current byte from input string. */
587 int ch;
588 if (MB_CUR_MAX > 1 && (preg->syntax & RE_ICASE || preg->translate))
590 /* In this case, we can't determin easily the current byte,
591 since it might be a component byte of a multibyte character.
592 Then we use the constructed buffer instead. */
593 /* If MATCH_FIRST is out of the valid range, reconstruct the
594 buffers. */
595 if (input.raw_mbs_idx + input.valid_len <= match_first)
596 re_string_reconstruct (&input, match_first, eflags,
597 preg->newline_anchor);
598 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
599 Note that MATCH_FIRST must not be smaller than 0. */
600 ch = ((match_first >= length) ? 0
601 : re_string_byte_at (&input, match_first - input.raw_mbs_idx));
603 else
605 /* We apply translate/conversion manually, since it is trivial
606 in this case. */
607 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
608 Note that MATCH_FIRST must not be smaller than 0. */
609 ch = (match_first < length) ? (unsigned char)string[match_first] : 0;
610 /* Apply translation if we need. */
611 ch = preg->translate ? preg->translate[ch] : ch;
612 /* In case of case insensitive mode, convert to upper case. */
613 ch = ((preg->syntax & RE_ICASE) && islower (ch)) ? toupper (ch) : ch;
616 /* Eliminate inappropriate one by fastmap. */
617 if (preg->can_be_null || fastmap == NULL || fastmap[ch])
619 /* Reconstruct the buffers so that the matcher can assume that
620 the matching starts from the begining of the buffer. */
621 re_string_reconstruct (&input, match_first, eflags,
622 preg->newline_anchor);
623 #ifdef RE_ENABLE_I18N
624 /* Eliminate it when it is a component of a multibyte character
625 and isn't the head of a multibyte character. */
626 if (MB_CUR_MAX == 1 || re_string_first_byte (&input, 0))
627 #endif
629 /* It seems to be appropriate one, then use the matcher. */
630 /* We assume that the matching starts from 0. */
631 mctx.state_log_top = mctx.nbkref_ents = mctx.max_bkref_len = 0;
632 match_last = check_matching (preg, &mctx, 0, fl_longest_match);
633 if (match_last != -1)
635 if (BE (match_last == -2, 0))
636 return REG_ESPACE;
637 else
638 break; /* We found a matching. */
642 /* Update counter. */
643 match_first += incr;
644 if (match_first < left_lim || right_lim < match_first)
645 break;
648 /* Set pmatch[] if we need. */
649 if (match_last != -1 && nmatch > 0)
651 int reg_idx;
653 /* Initialize registers. */
654 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
655 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
657 /* Set the points where matching start/end. */
658 pmatch[0].rm_so = 0;
659 mctx.match_last = pmatch[0].rm_eo = match_last;
661 if (!preg->no_sub && nmatch > 1)
663 /* We need the ranges of all the subexpressions. */
664 int halt_node;
665 re_dfastate_t *pstate = mctx.state_log[match_last];
666 #ifdef DEBUG
667 assert (mctx.state_log != NULL);
668 #endif
669 halt_node = check_halt_state_context (preg, pstate, &mctx,
670 match_last);
671 err = sift_states_backward (preg, &mctx, halt_node);
672 if (BE (err != REG_NOERROR, 0))
673 return err;
674 err = set_regs (preg, &mctx, nmatch, pmatch, halt_node);
675 if (BE (err != REG_NOERROR, 0))
676 return err;
679 /* At last, add the offset to the each registers, since we slided
680 the buffers so that We can assume that the matching starts from 0. */
681 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
682 if (pmatch[reg_idx].rm_so != -1)
684 pmatch[reg_idx].rm_so += match_first;
685 pmatch[reg_idx].rm_eo += match_first;
689 re_free (mctx.state_log);
690 if (dfa->nbackref)
691 match_ctx_free (&mctx);
692 re_string_destruct (&input);
693 return (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
696 /* Acquire an initial state and return it.
697 We must select appropriate initial state depending on the context,
698 since initial states may have constraints like "\<", "^", etc.. */
700 static inline re_dfastate_t *
701 acquire_init_state_context (err, preg, mctx, idx)
702 reg_errcode_t *err;
703 const regex_t *preg;
704 const re_match_context_t *mctx;
705 int idx;
707 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
709 *err = REG_NOERROR;
710 if (dfa->init_state->has_constraint)
712 unsigned int context;
713 context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
714 preg->newline_anchor);
715 if (IS_WORD_CONTEXT (context))
716 return dfa->init_state_word;
717 else if (IS_ORDINARY_CONTEXT (context))
718 return dfa->init_state;
719 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
720 return dfa->init_state_begbuf;
721 else if (IS_NEWLINE_CONTEXT (context))
722 return dfa->init_state_nl;
723 else if (IS_BEGBUF_CONTEXT (context))
725 /* It is relatively rare case, then calculate on demand. */
726 return re_acquire_state_context (err, dfa,
727 dfa->init_state->entrance_nodes,
728 context);
730 else
731 /* Must not happen? */
732 return dfa->init_state;
734 else
735 return dfa->init_state;
738 /* Check whether the regular expression match input string INPUT or not,
739 and return the index where the matching end, return -1 if not match,
740 or return -2 in case of an error.
741 FL_SEARCH means we must search where the matching starts,
742 FL_LONGEST_MATCH means we want the POSIX longest matching.
743 Note that the matcher assume that the maching starts from the current
744 index of the buffer. */
746 static int
747 check_matching (preg, mctx, fl_search, fl_longest_match)
748 const regex_t *preg;
749 re_match_context_t *mctx;
750 int fl_search, fl_longest_match;
752 reg_errcode_t err;
753 int match = 0;
754 int match_last = -1;
755 int cur_str_idx = re_string_cur_idx (mctx->input);
756 re_dfastate_t *cur_state;
758 cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
759 /* An initial state must not be NULL(invalid state). */
760 if (BE (cur_state == NULL, 0))
761 return -2;
762 if (mctx->state_log != NULL)
763 mctx->state_log[cur_str_idx] = cur_state;
764 /* If the RE accepts NULL string. */
765 if (cur_state->halt)
767 if (!cur_state->has_constraint
768 || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
770 if (!fl_longest_match)
771 return cur_str_idx;
772 else
774 match_last = cur_str_idx;
775 match = 1;
780 while (!re_string_eoi (mctx->input))
782 cur_state = transit_state (&err, preg, mctx, cur_state,
783 fl_search && !match);
784 if (cur_state == NULL) /* Reached at the invalid state or an error. */
786 cur_str_idx = re_string_cur_idx (mctx->input);
787 if (BE (err != REG_NOERROR, 0))
788 return -2;
789 if (fl_search && !match)
791 /* Restart from initial state, since we are searching
792 the point from where matching start. */
793 #ifdef RE_ENABLE_I18N
794 if (MB_CUR_MAX == 1
795 || re_string_first_byte (mctx->input, cur_str_idx))
796 #endif /* RE_ENABLE_I18N */
797 cur_state = acquire_init_state_context (&err, preg, mctx,
798 cur_str_idx);
799 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
800 return -2;
801 if (mctx->state_log != NULL)
802 mctx->state_log[cur_str_idx] = cur_state;
804 else if (!fl_longest_match && match)
805 break;
806 else /* (fl_longest_match && match) || (!fl_search && !match) */
808 if (mctx->state_log == NULL)
809 break;
810 else
812 int max = mctx->state_log_top;
813 for (; cur_str_idx <= max; ++cur_str_idx)
814 if (mctx->state_log[cur_str_idx] != NULL)
815 break;
816 if (cur_str_idx > max)
817 break;
822 if (cur_state != NULL && cur_state->halt)
824 /* Reached at a halt state.
825 Check the halt state can satisfy the current context. */
826 if (!cur_state->has_constraint
827 || check_halt_state_context (preg, cur_state, mctx,
828 re_string_cur_idx (mctx->input)))
830 /* We found an appropriate halt state. */
831 match_last = re_string_cur_idx (mctx->input);
832 match = 1;
833 if (!fl_longest_match)
834 break;
838 return match_last;
841 /* Check NODE match the current context. */
843 static int check_halt_node_context (dfa, node, context)
844 const re_dfa_t *dfa;
845 int node;
846 unsigned int context;
848 int entity;
849 re_token_type_t type = dfa->nodes[node].type;
850 if (type == END_OF_RE)
851 return 1;
852 if (type != OP_CONTEXT_NODE)
853 return 0;
854 entity = dfa->nodes[node].opr.ctx_info->entity;
855 if (dfa->nodes[entity].type != END_OF_RE
856 || NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, context))
857 return 0;
858 return 1;
861 /* Check the halt state STATE match the current context.
862 Return 0 if not match, if the node, STATE has, is a halt node and
863 match the context, return the node. */
865 static int
866 check_halt_state_context (preg, state, mctx, idx)
867 const regex_t *preg;
868 const re_dfastate_t *state;
869 const re_match_context_t *mctx;
870 int idx;
872 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
873 int i;
874 unsigned int context;
875 #ifdef DEBUG
876 assert (state->halt);
877 #endif
878 context = re_string_context_at (mctx->input, idx, mctx->eflags,
879 preg->newline_anchor);
880 for (i = 0; i < state->nodes.nelem; ++i)
881 if (check_halt_node_context (dfa, state->nodes.elems[i], context))
882 return state->nodes.elems[i];
883 return 0;
886 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
887 corresponding to the DFA).
888 Return the destination node, and update EPS_VIA_NODES, return -1 in case
889 of errors. */
891 static int
892 proceed_next_node (preg, mctx, pidx, node, eps_via_nodes)
893 const regex_t *preg;
894 const re_match_context_t *mctx;
895 int *pidx, node;
896 re_node_set *eps_via_nodes;
898 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
899 int i, err, dest_node, cur_entity;
900 dest_node = -1;
901 cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE)
902 ? dfa->nodes[node].opr.ctx_info->entity : node);
903 if (IS_EPSILON_NODE (dfa->nodes[node].type))
905 int dest_entity = INT_MAX;
906 err = re_node_set_insert (eps_via_nodes, node);
907 if (BE (err < 0, 0))
908 return -1;
909 for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
911 int candidate, candidate_entity;
912 candidate = mctx->state_log[*pidx]->nodes.elems[i];
913 candidate_entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE)
914 ? dfa->nodes[candidate].opr.ctx_info->entity
915 : candidate);
916 if (!re_node_set_contains (dfa->edests + node, candidate))
917 if (candidate == candidate_entity
918 || !re_node_set_contains (dfa->edests + node, candidate_entity))
919 continue;
921 /* In order to avoid infinite loop like "(a*)*". */
922 if (cur_entity > candidate_entity
923 && re_node_set_contains (eps_via_nodes, candidate))
924 continue;
926 if (dest_entity > candidate_entity)
928 dest_node = candidate;
929 dest_entity = candidate_entity;
932 #ifdef DEBUG
933 assert (dest_node != -1);
934 #endif
935 return dest_node;
937 else
939 int naccepted = 0, entity = node;
940 re_token_type_t type = dfa->nodes[node].type;
941 if (type == OP_CONTEXT_NODE)
943 entity = dfa->nodes[node].opr.ctx_info->entity;
944 type = dfa->nodes[entity].type;
947 #ifdef RE_ENABLE_I18N
948 if (ACCEPT_MB_NODE (type))
949 naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx);
950 else
951 #endif /* RE_ENABLE_I18N */
952 if (type == OP_BACK_REF)
954 for (i = 0; i < mctx->nbkref_ents; ++i)
956 if (mctx->bkref_ents[i].node == node
957 && mctx->bkref_ents[i].from == *pidx)
958 naccepted = mctx->bkref_ents[i].to - *pidx;
960 if (naccepted == 0)
962 err = re_node_set_insert (eps_via_nodes, node);
963 if (BE (err < 0, 0))
964 return -1;
965 dest_node = dfa->nexts[node];
966 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
967 dest_node))
968 return dest_node;
969 for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
971 dest_node = mctx->state_log[*pidx]->nodes.elems[i];
972 if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE
973 && (dfa->nexts[node]
974 == dfa->nodes[dest_node].opr.ctx_info->entity)))
975 return dest_node;
980 if (naccepted != 0
981 || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
983 dest_node = dfa->nexts[node];
984 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
985 #ifdef DEBUG
986 assert (mctx->state_log[*pidx] != NULL);
987 #endif
988 re_node_set_empty (eps_via_nodes);
989 return dest_node;
992 /* Must not reach here. */
993 #ifdef DEBUG
994 assert (0);
995 #endif
996 return 0;
999 /* Set the positions where the subexpressions are starts/ends to registers
1000 PMATCH.
1001 Note: We assume that pmatch[0] is already set, and
1002 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1004 static reg_errcode_t
1005 set_regs (preg, mctx, nmatch, pmatch, last_node)
1006 const regex_t *preg;
1007 const re_match_context_t *mctx;
1008 size_t nmatch;
1009 regmatch_t *pmatch;
1010 int last_node;
1012 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1013 int idx, cur_node, real_nmatch;
1014 re_node_set eps_via_nodes;
1015 #ifdef DEBUG
1016 assert (nmatch > 1);
1017 assert (mctx->state_log != NULL);
1018 #endif
1019 cur_node = dfa->init_node;
1020 real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1021 re_node_set_init_empty (&eps_via_nodes);
1022 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1024 update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1025 if (idx == pmatch[0].rm_eo && cur_node == last_node)
1026 break;
1028 /* Proceed to next node. */
1029 cur_node = proceed_next_node (preg, mctx, &idx, cur_node, &eps_via_nodes);
1030 if (BE (cur_node < 0, 0))
1031 return REG_ESPACE;
1033 re_node_set_free (&eps_via_nodes);
1034 return REG_NOERROR;
1037 static void
1038 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1039 re_dfa_t *dfa;
1040 regmatch_t *pmatch;
1041 int cur_node, cur_idx, nmatch;
1043 int type = dfa->nodes[cur_node].type;
1044 int reg_num;
1045 if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1046 return;
1047 reg_num = dfa->nodes[cur_node].opr.idx + 1;
1048 if (reg_num >= nmatch)
1049 return;
1050 if (type == OP_OPEN_SUBEXP)
1052 /* We are at the first node of this sub expression. */
1053 pmatch[reg_num].rm_so = cur_idx;
1054 pmatch[reg_num].rm_eo = -1;
1056 else if (type == OP_CLOSE_SUBEXP)
1057 /* We are at the first node of this sub expression. */
1058 pmatch[reg_num].rm_eo = cur_idx;
1061 #define NUMBER_OF_STATE 1
1063 /* This function checks the STATE_LOG from the MCTX->match_last to 0
1064 and sift the nodes in each states according to the following rules.
1065 Updated state_log will be wrote to STATE_LOG.
1067 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1068 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1069 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1070 the LAST_NODE, we throw away the node `a'.
1071 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1072 string `s' and transit to `b':
1073 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1074 away the node `a'.
1075 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1076 throwed away, we throw away the node `a'.
1077 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1078 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1079 node `a'.
1080 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1081 we throw away the node `a'. */
1083 #define STATE_NODE_CONTAINS(state,node) \
1084 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1086 static reg_errcode_t
1087 sift_states_backward (preg, mctx, last_node)
1088 const regex_t *preg;
1089 const re_match_context_t *mctx;
1090 int last_node;
1092 reg_errcode_t err;
1093 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1094 re_node_set state_buf;
1095 int str_idx = mctx->match_last;
1096 re_node_set *plog; /* Points the state_log[str_idx]->nodes */
1098 #ifdef DEBUG
1099 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1100 #endif
1101 err = re_node_set_alloc (&state_buf, NUMBER_OF_STATE);
1102 if (BE (err != REG_NOERROR, 0))
1103 return err;
1104 plog = &mctx->state_log[str_idx]->nodes;
1106 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1107 transit to the last_node and the last_node itself. */
1108 err = re_node_set_intersect (&state_buf, plog, dfa->inveclosures + last_node);
1109 if (BE (err != REG_NOERROR, 0))
1110 return err;
1112 if (mctx->state_log[str_idx] != NULL
1113 && mctx->state_log[str_idx]->has_backref)
1115 err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1116 if (BE (err != REG_NOERROR, 0))
1117 return err;
1120 /* Update state log. */
1121 mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1122 if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1123 return err;
1125 /* Then check each states in the state_log. */
1126 while (str_idx > 0)
1128 int i, j;
1129 /* Update counters. */
1130 re_node_set_empty (&state_buf);
1131 --str_idx;
1132 plog = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1133 : &mctx->state_log[str_idx]->nodes);
1135 /* Then build the next sifted state.
1136 We build the next sifted state on `state_buf', and update
1137 `state_log[str_idx]' with `state_buf'.
1138 Note:
1139 `state_buf' is the sifted state from `state_log[str_idx + 1]'.
1140 `plog' points the node_set of the old `state_log[str_idx]'. */
1141 for (i = 0; i < plog->nelem; i++)
1143 int prev_node = plog->elems[i];
1144 int entity = prev_node;
1145 int naccepted = 0;
1146 re_token_type_t type = dfa->nodes[prev_node].type;
1147 if (type == OP_CONTEXT_NODE)
1149 entity = dfa->nodes[prev_node].opr.ctx_info->entity;
1150 type = dfa->nodes[entity].type;
1153 #ifdef RE_ENABLE_I18N
1154 /* If the node may accept `multi byte'. */
1155 if (ACCEPT_MB_NODE (type))
1156 naccepted = sift_states_iter_mb (preg, mctx, entity, str_idx,
1157 mctx->match_last);
1159 /* If the node is a back reference. */
1160 else
1161 #endif /* RE_ENABLE_I18N */
1162 if (type == OP_BACK_REF)
1163 for (j = 0; j < mctx->nbkref_ents; ++j)
1165 naccepted = sift_states_iter_bkref (dfa, mctx->state_log,
1166 mctx->bkref_ents + j,
1167 prev_node, str_idx,
1168 mctx->match_last);
1169 if (naccepted)
1170 break;
1173 if (!naccepted
1174 && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1175 str_idx)
1176 && STATE_NODE_CONTAINS (mctx->state_log[str_idx + 1],
1177 dfa->nexts[prev_node]))
1178 naccepted = 1;
1180 if (naccepted == 0)
1181 continue;
1183 /* `prev_node' may point the entity of the OP_CONTEXT_NODE,
1184 then we use plog->elems[i] instead. */
1185 err = re_node_set_add_intersect (&state_buf, plog,
1186 dfa->inveclosures + prev_node);
1187 if (BE (err != REG_NOERROR, 0))
1188 return err;
1190 if (mctx->state_log[str_idx] != NULL
1191 && mctx->state_log[str_idx]->has_backref)
1193 err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1194 if (BE (err != REG_NOERROR, 0))
1195 return err;
1198 /* Update state_log. */
1199 mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1200 if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1201 return err;
1204 re_node_set_free (&state_buf);
1205 return REG_NOERROR;
1208 /* Helper functions. */
1210 static inline reg_errcode_t
1211 clean_state_log_if_need (mctx, next_state_log_idx)
1212 re_match_context_t *mctx;
1213 int next_state_log_idx;
1215 int top = mctx->state_log_top;
1217 if (next_state_log_idx >= mctx->input->bufs_len
1218 || (next_state_log_idx >= mctx->input->valid_len
1219 && mctx->input->valid_len < mctx->input->len))
1221 reg_errcode_t err;
1222 err = extend_buffers (mctx);
1223 if (BE (err != REG_NOERROR, 0))
1224 return err;
1227 if (top < next_state_log_idx)
1229 memset (mctx->state_log + top + 1, '\0',
1230 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1231 mctx->state_log_top = next_state_log_idx;
1233 return REG_NOERROR;
1236 #ifdef RE_ENABLE_I18N
1237 static int
1238 sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx)
1239 const regex_t *preg;
1240 const re_match_context_t *mctx;
1241 int node_idx, str_idx, max_str_idx;
1243 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1244 int naccepted;
1245 /* Check the node can accept `multi byte'. */
1246 naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
1247 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
1248 !STATE_NODE_CONTAINS (mctx->state_log[str_idx + naccepted],
1249 dfa->nexts[node_idx]))
1250 /* The node can't accept the `multi byte', or the
1251 destination was already throwed away, then the node
1252 could't accept the current input `multi byte'. */
1253 naccepted = 0;
1254 /* Otherwise, it is sure that the node could accept
1255 `naccepted' bytes input. */
1256 return naccepted;
1258 #endif /* RE_ENABLE_I18N */
1260 static int
1261 sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_last)
1262 const re_dfa_t *dfa;
1263 re_dfastate_t **state_log;
1264 struct re_backref_cache_entry *mctx_entry;
1265 int node_idx, idx, match_last;
1267 int naccepted = 0;
1268 int from_idx, to_idx;
1269 from_idx = mctx_entry->from;
1270 to_idx = mctx_entry->to;
1271 if (mctx_entry->node == node_idx
1272 && from_idx == idx && to_idx <= match_last
1273 && STATE_NODE_CONTAINS (state_log[to_idx], dfa->nexts[node_idx]))
1274 naccepted = to_idx - from_idx;
1275 return naccepted;
1278 static reg_errcode_t
1279 add_epsilon_backreference (dfa, mctx, plog, idx, state_buf)
1280 const re_dfa_t *dfa;
1281 const re_match_context_t *mctx;
1282 const re_node_set *plog;
1283 int idx;
1284 re_node_set *state_buf;
1286 int i, j;
1287 for (i = 0; i < plog->nelem; ++i)
1289 int node_idx = plog->elems[i];
1290 re_token_type_t type = dfa->nodes[node_idx].type;
1291 if (type == OP_CONTEXT_NODE)
1292 type = dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type;
1294 if (type == OP_BACK_REF &&
1295 !re_node_set_contains (state_buf, node_idx))
1297 for (j = 0; j < mctx->nbkref_ents; ++j)
1299 struct re_backref_cache_entry *entry;
1300 entry = mctx->bkref_ents + j;
1301 if (entry->from == entry->to && entry->from == idx)
1302 break;
1304 if (j < mctx->nbkref_ents || idx == 0)
1306 reg_errcode_t err;
1307 err = re_node_set_add_intersect (state_buf, plog,
1308 dfa->inveclosures + node_idx);
1309 if (BE (err != REG_NOERROR, 0))
1310 return err;
1311 i = 0;
1315 return REG_NOERROR;
1318 /* Functions for state transition. */
1320 /* Return the next state to which the current state STATE will transit by
1321 accepting the current input byte, and update STATE_LOG if necessary.
1322 If STATE can accept a multibyte char/collating element/back reference
1323 update the destination of STATE_LOG. */
1325 static re_dfastate_t *
1326 transit_state (err, preg, mctx, state, fl_search)
1327 reg_errcode_t *err;
1328 const regex_t *preg;
1329 re_match_context_t *mctx;
1330 re_dfastate_t *state;
1331 int fl_search;
1333 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1334 re_dfastate_t **trtable, *next_state;
1335 unsigned char ch;
1337 if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
1338 || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
1339 && mctx->input->valid_len < mctx->input->len))
1341 *err = extend_buffers (mctx);
1342 if (BE (*err != REG_NOERROR, 0))
1343 return NULL;
1346 *err = REG_NOERROR;
1347 if (state == NULL)
1349 next_state = state;
1350 re_string_skip_bytes (mctx->input, 1);
1352 else
1354 #ifdef RE_ENABLE_I18N
1355 /* If the current state can accept multibyte. */
1356 if (state->accept_mb)
1358 *err = transit_state_mb (preg, state, mctx);
1359 if (BE (*err != REG_NOERROR, 0))
1360 return NULL;
1362 #endif /* RE_ENABLE_I18N */
1364 /* Then decide the next state with the single byte. */
1365 if (1)
1367 /* Use transition table */
1368 ch = re_string_fetch_byte (mctx->input);
1369 trtable = fl_search ? state->trtable_search : state->trtable;
1370 if (trtable == NULL)
1372 trtable = build_trtable (preg, state, fl_search);
1373 if (fl_search)
1374 state->trtable_search = trtable;
1375 else
1376 state->trtable = trtable;
1378 next_state = trtable[ch];
1380 else
1382 /* don't use transition table */
1383 next_state = transit_state_sb (err, preg, state, fl_search, mctx);
1384 if (BE (next_state == NULL && err != REG_NOERROR, 0))
1385 return NULL;
1389 /* Update the state_log if we need. */
1390 if (mctx->state_log != NULL)
1392 int cur_idx = re_string_cur_idx (mctx->input);
1393 if (cur_idx > mctx->state_log_top)
1395 mctx->state_log[cur_idx] = next_state;
1396 mctx->state_log_top = cur_idx;
1398 else if (mctx->state_log[cur_idx] == 0)
1400 mctx->state_log[cur_idx] = next_state;
1402 else
1404 re_dfastate_t *pstate;
1405 unsigned int context;
1406 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
1407 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
1408 the destination of a multibyte char/collating element/
1409 back reference. Then the next state is the union set of
1410 these destinations and the results of the transition table. */
1411 pstate = mctx->state_log[cur_idx];
1412 log_nodes = pstate->entrance_nodes;
1413 if (next_state != NULL)
1415 table_nodes = next_state->entrance_nodes;
1416 *err = re_node_set_init_union (&next_nodes, table_nodes,
1417 log_nodes);
1418 if (BE (*err != REG_NOERROR, 0))
1419 return NULL;
1421 else
1422 next_nodes = *log_nodes;
1423 /* Note: We already add the nodes of the initial state,
1424 then we don't need to add them here. */
1426 context = re_string_context_at (mctx->input,
1427 re_string_cur_idx (mctx->input) - 1,
1428 mctx->eflags, preg->newline_anchor);
1429 next_state = mctx->state_log[cur_idx]
1430 = re_acquire_state_context (err, dfa, &next_nodes, context);
1431 /* We don't need to check errors here, since the return value of
1432 this function is next_state and ERR is already set. */
1434 if (table_nodes != NULL)
1435 re_node_set_free (&next_nodes);
1437 /* If the next state has back references. */
1438 if (next_state != NULL && next_state->has_backref)
1440 *err = transit_state_bkref (preg, next_state, mctx);
1441 if (BE (*err != REG_NOERROR, 0))
1442 return NULL;
1443 next_state = mctx->state_log[cur_idx];
1446 return next_state;
1449 /* Helper functions for transit_state. */
1451 /* Return the next state to which the current state STATE will transit by
1452 accepting the current input byte. */
1454 static re_dfastate_t *
1455 transit_state_sb (err, preg, state, fl_search, mctx)
1456 reg_errcode_t *err;
1457 const regex_t *preg;
1458 re_dfastate_t *state;
1459 int fl_search;
1460 re_match_context_t *mctx;
1462 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1463 re_node_set next_nodes;
1464 re_dfastate_t *next_state;
1465 int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
1466 unsigned int context;
1468 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
1469 if (BE (*err != REG_NOERROR, 0))
1470 return NULL;
1471 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
1473 int cur_node = state->nodes.elems[node_cnt];
1474 if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
1476 *err = re_node_set_merge (&next_nodes,
1477 dfa->eclosures + dfa->nexts[cur_node]);
1478 if (BE (*err != REG_NOERROR, 0))
1479 return NULL;
1482 if (fl_search)
1484 #ifdef RE_ENABLE_I18N
1485 int not_initial = 0;
1486 if (MB_CUR_MAX > 1)
1487 for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
1488 if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
1490 not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
1491 break;
1493 if (!not_initial)
1494 #endif
1496 *err = re_node_set_merge (&next_nodes,
1497 dfa->init_state->entrance_nodes);
1498 if (BE (*err != REG_NOERROR, 0))
1499 return NULL;
1502 context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
1503 preg->newline_anchor);
1504 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
1505 /* We don't need to check errors here, since the return value of
1506 this function is next_state and ERR is already set. */
1508 re_node_set_free (&next_nodes);
1509 re_string_skip_bytes (mctx->input, 1);
1510 return next_state;
1513 #ifdef RE_ENABLE_I18N
1514 static reg_errcode_t
1515 transit_state_mb (preg, pstate, mctx)
1516 const regex_t *preg;
1517 re_dfastate_t *pstate;
1518 re_match_context_t *mctx;
1520 reg_errcode_t err;
1521 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1522 int i;
1524 for (i = 0; i < pstate->nodes.nelem; ++i)
1526 re_node_set dest_nodes, *new_nodes;
1527 int cur_node_idx = pstate->nodes.elems[i];
1528 int naccepted = 0, dest_idx;
1529 unsigned int context;
1530 re_dfastate_t *dest_state;
1532 if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE)
1534 context = re_string_context_at (mctx->input,
1535 re_string_cur_idx (mctx->input),
1536 mctx->eflags, preg->newline_anchor);
1537 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
1538 context))
1539 continue;
1540 cur_node_idx = dfa->nodes[cur_node_idx].opr.ctx_info->entity;
1543 /* How many bytes the node can accepts? */
1544 if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
1545 naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
1546 re_string_cur_idx (mctx->input));
1547 if (naccepted == 0)
1548 continue;
1550 /* The node can accepts `naccepted' bytes. */
1551 dest_idx = re_string_cur_idx (mctx->input) + naccepted;
1552 err = clean_state_log_if_need (mctx, dest_idx);
1553 if (BE (err != REG_NOERROR, 0))
1554 return err;
1555 #ifdef DEBUG
1556 assert (dfa->nexts[cur_node_idx] != -1);
1557 #endif
1558 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
1559 then we use pstate->nodes.elems[i] instead. */
1560 new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
1562 dest_state = mctx->state_log[dest_idx];
1563 if (dest_state == NULL)
1564 dest_nodes = *new_nodes;
1565 else
1567 err = re_node_set_init_union (&dest_nodes,
1568 dest_state->entrance_nodes, new_nodes);
1569 if (BE (err != REG_NOERROR, 0))
1570 return err;
1572 context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
1573 preg->newline_anchor);
1574 mctx->state_log[dest_idx]
1575 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1576 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
1577 return err;
1578 if (dest_state != NULL)
1579 re_node_set_free (&dest_nodes);
1581 return REG_NOERROR;
1583 #endif /* RE_ENABLE_I18N */
1585 static reg_errcode_t
1586 transit_state_bkref (preg, pstate, mctx)
1587 const regex_t *preg;
1588 re_dfastate_t *pstate;
1589 re_match_context_t *mctx;
1591 reg_errcode_t err;
1592 re_dfastate_t **work_state_log;
1594 work_state_log = re_malloc (re_dfastate_t *,
1595 re_string_cur_idx (mctx->input) + 1);
1596 if (BE (work_state_log == NULL, 0))
1597 return REG_ESPACE;
1599 err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx);
1600 re_free (work_state_log);
1601 return err;
1604 /* Caller must allocate `work_state_log'. */
1606 static reg_errcode_t
1607 transit_state_bkref_loop (preg, nodes, work_state_log, mctx)
1608 const regex_t *preg;
1609 re_node_set *nodes;
1610 re_dfastate_t **work_state_log;
1611 re_match_context_t *mctx;
1613 reg_errcode_t err;
1614 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1615 int i, j;
1616 re_dfastate_t **state_log_bak;
1617 regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1);
1618 int cur_str_idx = re_string_cur_idx (mctx->input);
1619 if (BE (cur_regs == NULL, 0))
1620 return REG_ESPACE;
1622 for (i = 0; i < nodes->nelem; ++i)
1624 unsigned char *buf;
1625 int dest_str_idx, subexp_idx, prev_nelem, subexp_len;
1626 int node_idx = nodes->elems[i];
1627 unsigned int context;
1628 re_token_t *node = dfa->nodes + node_idx;
1629 re_dfastate_t *dest_state;
1630 re_node_set *new_dest_nodes;
1632 /* Check whether `node' is a backreference or not. */
1633 if (node->type == OP_BACK_REF)
1634 subexp_idx = node->opr.idx;
1635 else if (node->type == OP_CONTEXT_NODE &&
1636 dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
1638 context = re_string_context_at (mctx->input, cur_str_idx,
1639 mctx->eflags, preg->newline_anchor);
1640 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
1641 continue;
1642 subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx;
1644 else
1645 continue;
1647 /* `node' is a backreference.
1648 At first, set registers to check the backreference. */
1649 cur_regs[0].rm_so = 0;
1650 cur_regs[0].rm_eo = cur_str_idx;
1651 memcpy (work_state_log, mctx->state_log,
1652 sizeof (re_dfastate_t *) * (cur_str_idx + 1));
1653 mctx->match_last = cur_str_idx;
1654 state_log_bak = mctx->state_log;
1655 mctx->state_log = work_state_log;
1656 sift_states_backward (preg, mctx, node_idx);
1657 if (!STATE_NODE_CONTAINS (work_state_log[0], dfa->init_node))
1658 continue;
1659 for (j = 1; j <= preg->re_nsub; ++j)
1660 cur_regs[j].rm_so = cur_regs[j].rm_eo = -1;
1661 set_regs (preg, mctx, subexp_idx + 1, cur_regs, node_idx);
1662 mctx->state_log = state_log_bak;
1664 /* Then check that the backreference can match the input string. */
1665 subexp_len = cur_regs[subexp_idx].rm_eo - cur_regs[subexp_idx].rm_so;
1666 if (subexp_len < 0 || cur_str_idx + subexp_len > mctx->input->len)
1667 continue;
1669 if (cur_str_idx + subexp_len > mctx->input->valid_len
1670 && mctx->input->valid_len < mctx->input->len)
1672 reg_errcode_t err;
1673 err = extend_buffers (mctx);
1674 if (BE (err != REG_NOERROR, 0))
1675 return err;
1677 buf = re_string_get_buffer (mctx->input);
1678 if (strncmp (buf + cur_regs[subexp_idx].rm_so, buf + cur_str_idx,
1679 subexp_len) != 0)
1680 continue;
1682 /* Successfully matched, add a new cache entry. */
1683 dest_str_idx = cur_str_idx + subexp_len;
1684 err = match_ctx_add_entry (mctx, node_idx, cur_str_idx, dest_str_idx);
1685 if (BE (err != REG_NOERROR, 0))
1686 return err;
1687 err = clean_state_log_if_need (mctx, dest_str_idx);
1688 if (BE (err != REG_NOERROR, 0))
1689 return err;
1691 /* And add the epsilon closures (which is `new_dest_nodes') of
1692 the backreference to appropriate state_log. */
1693 #ifdef DEBUG
1694 assert (dfa->nexts[node_idx] != -1);
1695 #endif
1696 if (node->type == OP_CONTEXT_NODE && subexp_len == 0)
1697 new_dest_nodes = dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure;
1698 else
1699 new_dest_nodes = dfa->eclosures + dfa->nexts[node_idx];
1700 context = (IS_WORD_CHAR (re_string_byte_at (mctx->input,
1701 dest_str_idx - 1))
1702 ? CONTEXT_WORD : 0);
1703 dest_state = mctx->state_log[dest_str_idx];
1705 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
1706 : mctx->state_log[cur_str_idx]->nodes.nelem);
1707 /* Add `new_dest_node' to state_log. */
1708 if (dest_state == NULL)
1710 mctx->state_log[dest_str_idx]
1711 = re_acquire_state_context (&err, dfa, new_dest_nodes, context);
1712 if (BE (mctx->state_log[dest_str_idx] == NULL
1713 && err != REG_NOERROR, 0))
1714 return err;
1716 else
1718 re_node_set dest_nodes;
1719 err = re_node_set_init_union (&dest_nodes, dest_state->entrance_nodes,
1720 new_dest_nodes);
1721 if (BE (err != REG_NOERROR, 0))
1722 return err;
1723 mctx->state_log[dest_str_idx]
1724 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1725 if (BE (mctx->state_log[dest_str_idx] == NULL
1726 && err != REG_NOERROR, 0))
1727 return err;
1728 re_node_set_free (&dest_nodes);
1731 /* We need to check recursively if the backreference can epsilon
1732 transit. */
1733 if (subexp_len == 0
1734 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
1736 err = transit_state_bkref_loop (preg, new_dest_nodes, work_state_log,
1737 mctx);
1738 if (BE (err != REG_NOERROR, 0))
1739 return err;
1742 re_free (cur_regs);
1743 return REG_NOERROR;
1746 /* Build transition table for the state.
1747 Return the new table if succeeded, otherwise return NULL. */
1749 static re_dfastate_t **
1750 build_trtable (preg, state, fl_search)
1751 const regex_t *preg;
1752 const re_dfastate_t *state;
1753 int fl_search;
1755 reg_errcode_t err;
1756 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1757 int i, j, k, ch;
1758 int ndests; /* Number of the destination states from `state'. */
1759 re_dfastate_t **trtable, **dest_states, **dest_states_word, **dest_states_nl;
1760 re_node_set follows, *dests_node;
1761 bitset *dests_ch;
1762 bitset acceptable;
1764 /* We build DFA states which corresponds to the destination nodes
1765 from `state'. `dests_node[i]' represents the nodes which i-th
1766 destination state contains, and `dests_ch[i]' represents the
1767 characters which i-th destination state accepts. */
1768 dests_node = re_malloc (re_node_set, SBC_MAX);
1769 dests_ch = re_malloc (bitset, SBC_MAX);
1771 /* Initialize transiton table. */
1772 trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
1773 if (BE (dests_node == NULL || dests_ch == NULL || trtable == NULL, 0))
1774 return NULL;
1776 /* At first, group all nodes belonging to `state' into several
1777 destinations. */
1778 ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
1779 if (BE (ndests <= 0, 0))
1781 re_free (dests_node);
1782 re_free (dests_ch);
1783 /* Return NULL in case of an error, trtable otherwise. */
1784 return (ndests < 0) ? NULL : trtable;
1787 dest_states = re_malloc (re_dfastate_t *, ndests);
1788 dest_states_word = re_malloc (re_dfastate_t *, ndests);
1789 dest_states_nl = re_malloc (re_dfastate_t *, ndests);
1790 bitset_empty (acceptable);
1792 err = re_node_set_alloc (&follows, ndests + 1);
1793 if (BE (dest_states == NULL || dest_states_word == NULL
1794 || dest_states_nl == NULL || err != REG_NOERROR, 0))
1795 return NULL;
1797 /* Then build the states for all destinations. */
1798 for (i = 0; i < ndests; ++i)
1800 int next_node;
1801 re_node_set_empty (&follows);
1802 /* Merge the follows of this destination states. */
1803 for (j = 0; j < dests_node[i].nelem; ++j)
1805 next_node = dfa->nexts[dests_node[i].elems[j]];
1806 if (next_node != -1)
1808 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
1809 if (BE (err != REG_NOERROR, 0))
1810 return NULL;
1813 /* If search flag is set, merge the initial state. */
1814 if (fl_search)
1816 #ifdef RE_ENABLE_I18N
1817 int not_initial = 0;
1818 for (j = 0; j < follows.nelem; ++j)
1819 if (dfa->nodes[follows.elems[j]].type == CHARACTER)
1821 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
1822 break;
1824 if (!not_initial)
1825 #endif
1827 err = re_node_set_merge (&follows,
1828 dfa->init_state->entrance_nodes);
1829 if (BE (err != REG_NOERROR, 0))
1830 return NULL;
1833 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
1834 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
1835 return NULL;
1836 /* If the new state has context constraint,
1837 build appropriate states for these contexts. */
1838 if (dest_states[i]->has_constraint)
1840 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
1841 CONTEXT_WORD);
1842 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
1843 return NULL;
1844 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
1845 CONTEXT_NEWLINE);
1846 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
1847 return NULL;
1849 else
1851 dest_states_word[i] = dest_states[i];
1852 dest_states_nl[i] = dest_states[i];
1854 bitset_merge (acceptable, dests_ch[i]);
1857 /* Update the transition table. */
1858 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
1859 for (j = 0; j < UINT_BITS; ++j, ++ch)
1860 if ((acceptable[i] >> j) & 1)
1862 if (IS_WORD_CHAR (ch))
1864 for (k = 0; k < ndests; ++k)
1865 if ((dests_ch[k][i] >> j) & 1)
1866 trtable[ch] = dest_states_word[k];
1868 else /* not WORD_CHAR */
1870 for (k = 0; k < ndests; ++k)
1871 if ((dests_ch[k][i] >> j) & 1)
1872 trtable[ch] = dest_states[k];
1875 /* new line */
1876 for (k = 0; k < ndests; ++k)
1877 if (bitset_contain (acceptable, NEWLINE_CHAR))
1878 trtable[NEWLINE_CHAR] = dest_states_nl[k];
1880 re_free (dest_states_nl);
1881 re_free (dest_states_word);
1882 re_free (dest_states);
1884 re_node_set_free (&follows);
1885 for (i = 0; i < ndests; ++i)
1886 re_node_set_free (dests_node + i);
1888 re_free (dests_ch);
1889 re_free (dests_node);
1891 return trtable;
1894 /* Group all nodes belonging to STATE into several destinations.
1895 Then for all destinations, set the nodes belonging to the destination
1896 to DESTS_NODE[i] and set the characters accepted by the destination
1897 to DEST_CH[i]. This function return the number of destinations. */
1899 static int
1900 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
1901 const regex_t *preg;
1902 const re_dfastate_t *state;
1903 re_node_set *dests_node;
1904 bitset *dests_ch;
1906 reg_errcode_t err;
1907 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1908 int i, j, k;
1909 int ndests; /* Number of the destinations from `state'. */
1910 bitset accepts; /* Characters a node can accept. */
1911 const re_node_set *cur_nodes = &state->nodes;
1912 bitset_empty (accepts);
1913 ndests = 0;
1915 /* For all the nodes belonging to `state', */
1916 for (i = 0; i < cur_nodes->nelem; ++i)
1918 unsigned int constraint = 0;
1919 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
1920 re_token_type_t type = node->type;
1922 if (type == OP_CONTEXT_NODE)
1924 constraint = node->constraint;
1925 node = dfa->nodes + node->opr.ctx_info->entity;
1926 type = node->type;
1929 /* Enumerate all single byte character this node can accept. */
1930 if (type == CHARACTER)
1931 bitset_set (accepts, node->opr.c);
1932 else if (type == SIMPLE_BRACKET)
1934 bitset_merge (accepts, node->opr.sbcset);
1936 else if (type == OP_PERIOD)
1938 bitset_set_all (accepts);
1939 if (!(preg->syntax & RE_DOT_NEWLINE))
1940 bitset_clear (accepts, '\n');
1941 if (preg->syntax & RE_DOT_NOT_NULL)
1942 bitset_clear (accepts, '\0');
1944 else
1945 continue;
1947 /* Check the `accepts' and sift the characters which are not
1948 match it the context. */
1949 if (constraint)
1951 if (constraint & NEXT_WORD_CONSTRAINT)
1952 for (j = 0; j < BITSET_UINTS; ++j)
1953 accepts[j] &= dfa->word_char[j];
1954 else if (constraint & NEXT_NOTWORD_CONSTRAINT)
1955 for (j = 0; j < BITSET_UINTS; ++j)
1956 accepts[j] &= ~dfa->word_char[j];
1957 else if (constraint & NEXT_NEWLINE_CONSTRAINT)
1959 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
1960 bitset_empty (accepts);
1961 if (accepts_newline)
1962 bitset_set (accepts, NEWLINE_CHAR);
1963 else
1964 continue;
1968 /* Then divide `accepts' into DFA states, or create a new
1969 state. */
1970 for (j = 0; j < ndests; ++j)
1972 bitset intersec; /* Intersection sets, see below. */
1973 bitset remains;
1974 /* Flags, see below. */
1975 int has_intersec, not_subset, not_consumed;
1977 /* Optimization, skip if this state doesn't accept the character. */
1978 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
1979 continue;
1981 /* Enumerate the intersection set of this state and `accepts'. */
1982 has_intersec = 0;
1983 for (k = 0; k < BITSET_UINTS; ++k)
1984 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
1985 /* And skip if the intersection set is empty. */
1986 if (!has_intersec)
1987 continue;
1989 /* Then check if this state is a subset of `accepts'. */
1990 not_subset = not_consumed = 0;
1991 for (k = 0; k < BITSET_UINTS; ++k)
1993 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
1994 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
1997 /* If this state isn't a subset of `accepts', create a
1998 new group state, which has the `remains'. */
1999 if (not_subset)
2001 bitset_copy (dests_ch[ndests], remains);
2002 bitset_copy (dests_ch[j], intersec);
2003 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
2004 if (BE (err != REG_NOERROR, 0))
2005 return -1;
2006 ++ndests;
2009 /* Put the position in the current group. */
2010 err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
2011 if (BE (err < 0, 0))
2012 return -1;
2014 /* If all characters are consumed, go to next node. */
2015 if (!not_consumed)
2016 break;
2018 /* Some characters remain, create a new group. */
2019 if (j == ndests)
2021 bitset_copy (dests_ch[ndests], accepts);
2022 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
2023 if (BE (err != REG_NOERROR, 0))
2024 return -1;
2025 ++ndests;
2026 bitset_empty (accepts);
2029 return ndests;
2032 #ifdef RE_ENABLE_I18N
2033 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2034 Return the number of the bytes the node accepts.
2035 STR_IDX is the current index of the input string.
2037 This function handles the nodes which can accept one character, or
2038 one collating element like '.', '[a-z]', opposite to the other nodes
2039 can only accept one byte. */
2041 static int
2042 check_node_accept_bytes (preg, node_idx, input, str_idx)
2043 const regex_t *preg;
2044 int node_idx, str_idx;
2045 const re_string_t *input;
2047 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048 const re_token_t *node = dfa->nodes + node_idx;
2049 int elem_len = re_string_elem_size_at (input, str_idx);
2050 int char_len = re_string_char_size_at (input, str_idx);
2051 int i;
2052 # ifdef _LIBC
2053 int j;
2054 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2055 # endif /* _LIBC */
2056 if (elem_len <= 1 && char_len <= 1)
2057 return 0;
2058 if (node->type == OP_PERIOD)
2060 /* '.' accepts any one character except the following two cases. */
2061 if ((!(preg->syntax & RE_DOT_NEWLINE) &&
2062 re_string_byte_at (input, str_idx) == '\n') ||
2063 ((preg->syntax & RE_DOT_NOT_NULL) &&
2064 re_string_byte_at (input, str_idx) == '\0'))
2065 return 0;
2066 return char_len;
2068 else if (node->type == COMPLEX_BRACKET)
2070 const re_charset_t *cset = node->opr.mbcset;
2071 # ifdef _LIBC
2072 const unsigned char *pin = re_string_get_buffer (input) + str_idx;
2073 # endif /* _LIBC */
2074 int match_len = 0;
2075 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
2076 ? re_string_wchar_at (input, str_idx) : 0);
2078 /* match with multibyte character? */
2079 for (i = 0; i < cset->nmbchars; ++i)
2080 if (wc == cset->mbchars[i])
2082 match_len = char_len;
2083 goto check_node_accept_bytes_match;
2085 /* match with character_class? */
2086 for (i = 0; i < cset->nchar_classes; ++i)
2088 wctype_t wt = cset->char_classes[i];
2089 if (__iswctype (wc, wt))
2091 match_len = char_len;
2092 goto check_node_accept_bytes_match;
2096 # ifdef _LIBC
2097 if (nrules != 0)
2099 unsigned int in_collseq = 0;
2100 const int32_t *table, *indirect;
2101 const unsigned char *weights, *extra, *collseqwc;
2102 int32_t idx;
2103 /* This #include defines a local function! */
2104 # include <locale/weight.h>
2106 /* match with collating_symbol? */
2107 if (cset->ncoll_syms)
2108 extra = (const unsigned char *)
2109 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2110 for (i = 0; i < cset->ncoll_syms; ++i)
2112 const unsigned char *coll_sym = extra + cset->coll_syms[i];
2113 /* Compare the length of input collating element and
2114 the length of current collating element. */
2115 if (*coll_sym != elem_len)
2116 continue;
2117 /* Compare each bytes. */
2118 for (j = 0; j < *coll_sym; j++)
2119 if (pin[j] != coll_sym[1 + j])
2120 break;
2121 if (j == *coll_sym)
2123 /* Match if every bytes is equal. */
2124 match_len = j;
2125 goto check_node_accept_bytes_match;
2129 if (cset->nranges)
2131 if (elem_len <= char_len)
2133 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2134 in_collseq = collseq_table_lookup (collseqwc, wc);
2136 else
2137 in_collseq = find_collation_sequence_value (pin, elem_len);
2139 /* match with range expression? */
2140 for (i = 0; i < cset->nranges; ++i)
2141 if (cset->range_starts[i] <= in_collseq
2142 && in_collseq <= cset->range_ends[i])
2144 match_len = elem_len;
2145 goto check_node_accept_bytes_match;
2148 /* match with equivalence_class? */
2149 if (cset->nequiv_classes)
2151 const unsigned char *cp = pin;
2152 table = (const int32_t *)
2153 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2154 weights = (const unsigned char *)
2155 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
2156 extra = (const unsigned char *)
2157 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
2158 indirect = (const int32_t *)
2159 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
2160 idx = findidx (&cp);
2161 if (idx > 0)
2162 for (i = 0; i < cset->nequiv_classes; ++i)
2164 int32_t equiv_class_idx = cset->equiv_classes[i];
2165 size_t weight_len = weights[idx];
2166 if (weight_len == weights[equiv_class_idx])
2168 int cnt = 0;
2169 while (cnt <= weight_len
2170 && (weights[equiv_class_idx + 1 + cnt]
2171 == weights[idx + 1 + cnt]))
2172 ++cnt;
2173 if (cnt > weight_len)
2175 match_len = elem_len;
2176 goto check_node_accept_bytes_match;
2182 else
2183 # endif /* _LIBC */
2185 /* match with range expression? */
2186 wchar_t cmp_buf[6] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
2187 for (i = 0; i < cset->nranges; ++i)
2189 cmp_buf[0] = cset->range_starts[i];
2190 cmp_buf[4] = cset->range_ends[i];
2191 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2192 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2194 match_len = char_len;
2195 goto check_node_accept_bytes_match;
2199 check_node_accept_bytes_match:
2200 if (!cset->non_match)
2201 return match_len;
2202 else
2204 if (match_len > 0)
2205 return 0;
2206 else
2207 return (elem_len > char_len) ? elem_len : char_len;
2210 return 0;
2213 # ifdef _LIBC
2214 static unsigned int
2215 find_collation_sequence_value (mbs, mbs_len)
2216 const unsigned char *mbs;
2217 size_t mbs_len;
2219 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2220 if (nrules == 0)
2222 if (mbs_len == 1)
2224 /* No valid character. Match it as a single byte character. */
2225 const unsigned char *collseq = (const unsigned char *)
2226 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2227 return collseq[mbs[0]];
2229 return UINT_MAX;
2231 else
2233 int32_t idx;
2234 const unsigned char *extra = (const unsigned char *)
2235 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2237 for (idx = 0; ;)
2239 int mbs_cnt, found = 0;
2240 int32_t elem_mbs_len;
2241 /* Skip the name of collating element name. */
2242 idx = idx + extra[idx] + 1;
2243 elem_mbs_len = extra[idx++];
2244 if (mbs_len == elem_mbs_len)
2246 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
2247 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
2248 break;
2249 if (mbs_cnt == elem_mbs_len)
2250 /* Found the entry. */
2251 found = 1;
2253 /* Skip the byte sequence of the collating element. */
2254 idx += elem_mbs_len;
2255 /* Adjust for the alignment. */
2256 idx = (idx + 3) & ~3;
2257 /* Skip the collation sequence value. */
2258 idx += sizeof (uint32_t);
2259 /* Skip the wide char sequence of the collating element. */
2260 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
2261 /* If we found the entry, return the sequence value. */
2262 if (found)
2263 return *(uint32_t *) (extra + idx);
2264 /* Skip the collation sequence value. */
2265 idx += sizeof (uint32_t);
2269 # endif /* _LIBC */
2270 #endif /* RE_ENABLE_I18N */
2272 /* Check whether the node accepts the byte which is IDX-th
2273 byte of the INPUT. */
2275 static int
2276 check_node_accept (preg, node, mctx, idx)
2277 const regex_t *preg;
2278 const re_token_t *node;
2279 const re_match_context_t *mctx;
2280 int idx;
2282 const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2283 const re_token_t *cur_node;
2284 unsigned char ch;
2285 if (node->type == OP_CONTEXT_NODE)
2287 /* The node has constraints. Check whether the current context
2288 satisfies the constraints. */
2289 unsigned int context = re_string_context_at (mctx->input, idx,
2290 mctx->eflags,
2291 preg->newline_anchor);
2292 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2293 return 0;
2294 cur_node = dfa->nodes + node->opr.ctx_info->entity;
2296 else
2297 cur_node = node;
2299 ch = re_string_byte_at (mctx->input, idx);
2300 if (cur_node->type == CHARACTER)
2301 return cur_node->opr.c == ch;
2302 else if (cur_node->type == SIMPLE_BRACKET)
2303 return bitset_contain (cur_node->opr.sbcset, ch);
2304 else if (cur_node->type == OP_PERIOD)
2305 return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
2306 || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
2307 else
2308 return 0;
2311 /* Extend the buffers, if the buffers have run out. */
2313 static reg_errcode_t
2314 extend_buffers (mctx)
2315 re_match_context_t *mctx;
2317 reg_errcode_t ret;
2318 re_string_t *pstr = mctx->input;
2320 /* Double the lengthes of the buffers. */
2321 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
2322 if (BE (ret != REG_NOERROR, 0))
2323 return ret;
2325 if (mctx->state_log != NULL)
2327 /* And double the length of state_log. */
2328 mctx->state_log = re_realloc (mctx->state_log, re_dfastate_t *,
2329 pstr->bufs_len * 2);
2330 if (BE (mctx->state_log == NULL, 0))
2331 return REG_ESPACE;
2334 /* Then reconstruct the buffers. */
2335 if (pstr->icase)
2337 #ifdef RE_ENABLE_I18N
2338 if (MB_CUR_MAX > 1)
2339 build_wcs_upper_buffer (pstr);
2340 else
2341 #endif /* RE_ENABLE_I18N */
2342 build_upper_buffer (pstr);
2344 else
2346 #ifdef RE_ENABLE_I18N
2347 if (MB_CUR_MAX > 1)
2348 build_wcs_buffer (pstr);
2349 else
2350 #endif /* RE_ENABLE_I18N */
2352 if (pstr->trans != NULL)
2353 re_string_translate_buffer (pstr);
2354 else
2355 pstr->valid_len = pstr->bufs_len;
2358 return REG_NOERROR;
2362 /* Functions for matching context. */
2364 static reg_errcode_t
2365 match_ctx_init (mctx, eflags, input, n)
2366 re_match_context_t *mctx;
2367 int eflags, n;
2368 re_string_t *input;
2370 mctx->eflags = eflags;
2371 mctx->input = input;
2372 mctx->match_last = -1;
2373 if (n > 0)
2375 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
2376 if (BE (mctx->bkref_ents == NULL, 0))
2377 return REG_ESPACE;
2379 else
2380 mctx->bkref_ents = NULL;
2381 mctx->nbkref_ents = 0;
2382 mctx->abkref_ents = n;
2383 mctx->max_bkref_len = 0;
2384 return REG_NOERROR;
2387 static void
2388 match_ctx_free (mctx)
2389 re_match_context_t *mctx;
2391 re_free (mctx->bkref_ents);
2394 /* Add a new backreference entry to the cache. */
2396 static reg_errcode_t
2397 match_ctx_add_entry (mctx, node, from, to)
2398 re_match_context_t *mctx;
2399 int node, from, to;
2401 if (mctx->nbkref_ents >= mctx->abkref_ents)
2403 mctx->bkref_ents = re_realloc (mctx->bkref_ents,
2404 struct re_backref_cache_entry,
2405 mctx->abkref_ents * 2);
2406 if (BE (mctx->bkref_ents == NULL, 0))
2407 return REG_ESPACE;
2408 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
2409 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
2410 mctx->abkref_ents *= 2;
2412 mctx->bkref_ents[mctx->nbkref_ents].node = node;
2413 mctx->bkref_ents[mctx->nbkref_ents].from = from;
2414 mctx->bkref_ents[mctx->nbkref_ents++].to = to;
2415 if (mctx->max_bkref_len < to - from)
2416 mctx->max_bkref_len = to - from;
2417 return REG_NOERROR;