1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
30 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
31 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
32 # include <locale/localeinfo.h>
33 # include <locale/elem-hash.h>
34 # include <locale/coll-lookup.h>
39 #include "regex_internal.h"
41 static reg_errcode_t
match_ctx_init (re_match_context_t
*cache
, int eflags
,
42 re_string_t
*input
, int n
);
43 static void match_ctx_free (re_match_context_t
*cache
);
44 static reg_errcode_t
match_ctx_add_entry (re_match_context_t
*cache
, int node
,
46 static reg_errcode_t
re_search_internal (const regex_t
*preg
,
47 const char *string
, int length
,
48 int start
, int range
, int stop
,
49 size_t nmatch
, regmatch_t pmatch
[],
51 static int re_search_2_stub (struct re_pattern_buffer
*bufp
,
52 const char *string1
, int length1
,
53 const char *string2
, int length2
,
54 int start
, int range
, struct re_registers
*regs
,
55 int stop
, int ret_len
);
56 static int re_search_stub (struct re_pattern_buffer
*bufp
,
57 const char *string
, int length
, int start
,
58 int range
, int stop
, struct re_registers
*regs
,
60 static unsigned re_copy_regs (struct re_registers
*regs
, regmatch_t
*pmatch
,
61 int nregs
, int regs_allocated
);
62 static inline re_dfastate_t
*acquire_init_state_context (reg_errcode_t
*err
,
64 const re_match_context_t
*mctx
,
66 static int check_matching (const regex_t
*preg
, re_match_context_t
*mctx
,
67 int fl_search
, int fl_longest_match
);
68 static int check_halt_node_context (const re_dfa_t
*dfa
, int node
,
69 unsigned int context
);
70 static int check_halt_state_context (const regex_t
*preg
,
71 const re_dfastate_t
*state
,
72 const re_match_context_t
*mctx
, int idx
);
73 static void update_regs (re_dfa_t
*dfa
, regmatch_t
*pmatch
, int cur_node
,
74 int cur_idx
, int nmatch
);
75 static int proceed_next_node (const regex_t
*preg
,
76 const re_match_context_t
*mctx
,
77 int *pidx
, int node
, re_node_set
*eps_via_nodes
);
78 static reg_errcode_t
set_regs (const regex_t
*preg
,
79 const re_match_context_t
*mctx
,
80 size_t nmatch
, regmatch_t
*pmatch
, int last
);
82 static int sift_states_iter_mb (const regex_t
*preg
,
83 const re_match_context_t
*mctx
,
84 int node_idx
, int str_idx
, int max_str_idx
);
85 #endif /* RE_ENABLE_I18N */
86 static int sift_states_iter_bkref (const re_dfa_t
*dfa
,
87 re_dfastate_t
**state_log
,
88 struct re_backref_cache_entry
*mctx_entry
,
89 int node_idx
, int idx
, int match_last
);
90 static reg_errcode_t
sift_states_backward (const regex_t
*preg
,
91 const re_match_context_t
*mctx
,
93 static reg_errcode_t
clean_state_log_if_need (re_match_context_t
*mctx
,
94 int next_state_log_idx
);
95 static reg_errcode_t
add_epsilon_backreference (const re_dfa_t
*dfa
,
96 const re_match_context_t
*mctx
,
97 const re_node_set
*plog
,
99 re_node_set
*state_buf
);
100 static re_dfastate_t
*transit_state (reg_errcode_t
*err
, const regex_t
*preg
,
101 re_match_context_t
*mctx
,
102 re_dfastate_t
*state
, int fl_search
);
103 static re_dfastate_t
*transit_state_sb (reg_errcode_t
*err
, const regex_t
*preg
,
104 re_dfastate_t
*pstate
,
106 re_match_context_t
*mctx
);
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t
transit_state_mb (const regex_t
*preg
,
109 re_dfastate_t
*pstate
,
110 re_match_context_t
*mctx
);
111 #endif /* RE_ENABLE_I18N */
112 static reg_errcode_t
transit_state_bkref (const regex_t
*preg
,
113 re_dfastate_t
*pstate
,
114 re_match_context_t
*mctx
);
115 static reg_errcode_t
transit_state_bkref_loop (const regex_t
*preg
,
117 re_dfastate_t
**work_state_log
,
118 re_match_context_t
*mctx
);
119 static re_dfastate_t
**build_trtable (const regex_t
*dfa
,
120 const re_dfastate_t
*state
,
122 #ifdef RE_ENABLE_I18N
123 static int check_node_accept_bytes (const regex_t
*preg
, int node_idx
,
124 const re_string_t
*input
, int idx
);
126 static unsigned int find_collation_sequence_value (const char *mbs
,
129 #endif /* RE_ENABLE_I18N */
130 static int group_nodes_into_DFAstates (const regex_t
*dfa
,
131 const re_dfastate_t
*state
,
132 re_node_set
*states_node
,
134 static int check_node_accept (const regex_t
*preg
, const re_token_t
*node
,
135 const re_match_context_t
*mctx
, int idx
);
136 static reg_errcode_t
extend_buffers (re_match_context_t
*mctx
);
138 /* Entry point for POSIX code. */
140 /* regexec searches for a given pattern, specified by PREG, in the
143 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
144 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
145 least NMATCH elements, and we set them to the offsets of the
146 corresponding matched substrings.
148 EFLAGS specifies `execution flags' which affect matching: if
149 REG_NOTBOL is set, then ^ does not match at the beginning of the
150 string; if REG_NOTEOL is set, then $ does not match at the end.
152 We return 0 if we find a match and REG_NOMATCH if not. */
155 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
156 const regex_t
*__restrict preg
;
157 const char *__restrict string
;
163 int length
= strlen (string
);
165 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, 0,
168 err
= re_search_internal (preg
, string
, length
, 0, length
, length
, nmatch
,
170 return err
!= REG_NOERROR
;
173 weak_alias (__regexec
, regexec
)
176 /* Entry points for GNU code. */
178 /* re_match, re_search, re_match_2, re_search_2
180 The former two functions operate on STRING with length LENGTH,
181 while the later two operate on concatenation of STRING1 and STRING2
182 with lengths LENGTH1 and LENGTH2, respectively.
184 re_match() matches the compiled pattern in BUFP against the string,
185 starting at index START.
187 re_search() first tries matching at index START, then it tries to match
188 starting from index START + 1, and so on. The last start position tried
189 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
192 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
193 the first STOP characters of the concatenation of the strings should be
196 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
197 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
198 computed relative to the concatenation, not relative to the individual
201 On success, re_match* functions return the length of the match, re_search*
202 return the position of the start of the match. Return value -1 means no
203 match was found and -2 indicates an internal error. */
206 re_match (bufp
, string
, length
, start
, regs
)
207 struct re_pattern_buffer
*bufp
;
210 struct re_registers
*regs
;
212 return re_search_stub (bufp
, string
, length
, start
, 0, length
, regs
, 1);
215 weak_alias (__re_match
, re_match
)
219 re_search (bufp
, string
, length
, start
, range
, regs
)
220 struct re_pattern_buffer
*bufp
;
222 int length
, start
, range
;
223 struct re_registers
*regs
;
225 return re_search_stub (bufp
, string
, length
, start
, range
, length
, regs
, 0);
228 weak_alias (__re_search
, re_search
)
232 re_match_2 (bufp
, string1
, length1
, string2
, length2
, start
, regs
, stop
)
233 struct re_pattern_buffer
*bufp
;
234 const char *string1
, *string2
;
235 int length1
, length2
, start
, stop
;
236 struct re_registers
*regs
;
238 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
239 start
, 0, regs
, stop
, 1);
242 weak_alias (__re_match_2
, re_match_2
)
246 re_search_2 (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
, stop
)
247 struct re_pattern_buffer
*bufp
;
248 const char *string1
, *string2
;
249 int length1
, length2
, start
, range
, stop
;
250 struct re_registers
*regs
;
252 return re_search_2_stub (bufp
, string1
, length1
, string2
, length2
,
253 start
, range
, regs
, stop
, 0);
256 weak_alias (__re_search_2
, re_search_2
)
260 re_search_2_stub (bufp
, string1
, length1
, string2
, length2
, start
, range
, regs
,
262 struct re_pattern_buffer
*bufp
;
263 const char *string1
, *string2
;
264 int length1
, length2
, start
, range
, stop
, ret_len
;
265 struct re_registers
*regs
;
269 int len
= length1
+ length2
;
272 if (BE (length1
< 0 || length2
< 0 || stop
< 0, 0))
275 /* Concatenate the strings. */
279 char *s
= re_malloc (char, len
);
281 if (BE (s
== NULL
, 0))
283 memcpy (s
, string1
, length1
);
284 memcpy (s
+ length1
, string2
, length2
);
293 rval
= re_search_stub (bufp
, str
, len
, start
, range
, stop
, regs
,
296 re_free ((char *) str
);
300 /* The parameters have the same meaning as those of re_search.
301 Additional parameters:
302 If RET_LEN is nonzero the length of the match is returned (re_match style);
303 otherwise the position of the match is returned. */
306 re_search_stub (bufp
, string
, length
, start
, range
, stop
, regs
, ret_len
)
307 struct re_pattern_buffer
*bufp
;
309 int length
, start
, range
, stop
, ret_len
;
310 struct re_registers
*regs
;
312 reg_errcode_t result
;
317 /* Check for out-of-range. */
318 if (BE (start
< 0 || start
> length
|| range
< 0, 0))
320 if (BE (start
+ range
> length
, 0))
321 range
= length
- start
;
323 eflags
|= (bufp
->not_bol
) ? REG_NOTBOL
: 0;
324 eflags
|= (bufp
->not_eol
) ? REG_NOTEOL
: 0;
326 /* Compile fastmap if we haven't yet. */
327 if (range
> 0 && bufp
->fastmap
!= NULL
&& !bufp
->fastmap_accurate
)
328 re_compile_fastmap (bufp
);
330 if (BE (bufp
->no_sub
, 0))
333 /* We need at least 1 register. */
336 else if (BE (bufp
->regs_allocated
== REGS_FIXED
&&
337 regs
->num_regs
< bufp
->re_nsub
+ 1, 0))
339 nregs
= regs
->num_regs
;
340 if (BE (nregs
< 1, 0))
342 /* Nothing can be copied to regs. */
348 nregs
= bufp
->re_nsub
+ 1;
349 pmatch
= re_malloc (regmatch_t
, nregs
);
350 if (BE (pmatch
== NULL
, 0))
353 result
= re_search_internal (bufp
, string
, length
, start
, range
, stop
,
354 nregs
, pmatch
, eflags
);
358 /* I hope we needn't fill ther regs with -1's when no match was found. */
359 if (result
!= REG_NOERROR
)
361 else if (regs
!= NULL
)
363 /* If caller wants register contents data back, copy them. */
364 bufp
->regs_allocated
= re_copy_regs (regs
, pmatch
, nregs
,
365 bufp
->regs_allocated
);
366 if (BE (bufp
->regs_allocated
== REGS_UNALLOCATED
, 0))
370 if (BE (rval
== 0, 1))
374 assert (pmatch
[0].rm_so
== start
);
375 rval
= pmatch
[0].rm_eo
- start
;
378 rval
= pmatch
[0].rm_so
;
385 re_copy_regs (regs
, pmatch
, nregs
, regs_allocated
)
386 struct re_registers
*regs
;
388 int nregs
, regs_allocated
;
390 int rval
= REGS_REALLOCATE
;
392 int need_regs
= nregs
+ 1;
393 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
396 /* Have the register data arrays been allocated? */
397 if (regs_allocated
== REGS_UNALLOCATED
)
398 { /* No. So allocate them with malloc. */
399 regs
->start
= re_malloc (regoff_t
, need_regs
);
400 if (BE (regs
->start
== NULL
, 0))
401 return REGS_UNALLOCATED
;
402 regs
->end
= re_malloc (regoff_t
, need_regs
);
403 if (BE (regs
->end
== NULL
, 0))
405 re_free (regs
->start
);
406 return REGS_UNALLOCATED
;
408 regs
->num_regs
= need_regs
;
410 else if (regs_allocated
== REGS_REALLOCATE
)
411 { /* Yes. If we need more elements than were already
412 allocated, reallocate them. If we need fewer, just
414 if (need_regs
> regs
->num_regs
)
416 regs
->start
= re_realloc (regs
->start
, regoff_t
, need_regs
);
417 if (BE (regs
->start
== NULL
, 0))
419 if (regs
->end
!= NULL
)
421 return REGS_UNALLOCATED
;
423 regs
->end
= re_realloc (regs
->end
, regoff_t
, need_regs
);
424 if (BE (regs
->end
== NULL
, 0))
426 re_free (regs
->start
);
427 return REGS_UNALLOCATED
;
429 regs
->num_regs
= need_regs
;
434 assert (regs_allocated
== REGS_FIXED
);
435 /* This function may not be called with REGS_FIXED and nregs too big. */
436 assert (regs
->num_regs
>= nregs
);
441 for (i
= 0; i
< nregs
; ++i
)
443 regs
->start
[i
] = pmatch
[i
].rm_so
;
444 regs
->end
[i
] = pmatch
[i
].rm_eo
;
446 for ( ; i
< regs
->num_regs
; ++i
)
447 regs
->start
[i
] = regs
->end
[i
] = -1;
452 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
453 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
454 this memory for recording register information. STARTS and ENDS
455 must be allocated using the malloc library routine, and must each
456 be at least NUM_REGS * sizeof (regoff_t) bytes long.
458 If NUM_REGS == 0, then subsequent matches should allocate their own
461 Unless this function is called, the first search or match using
462 PATTERN_BUFFER will allocate its own register data, without
463 freeing the old data. */
466 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
467 struct re_pattern_buffer
*bufp
;
468 struct re_registers
*regs
;
470 regoff_t
*starts
, *ends
;
474 bufp
->regs_allocated
= REGS_REALLOCATE
;
475 regs
->num_regs
= num_regs
;
476 regs
->start
= starts
;
481 bufp
->regs_allocated
= REGS_UNALLOCATED
;
483 regs
->start
= regs
->end
= (regoff_t
*) 0;
487 weak_alias (__re_set_registers
, re_set_registers
)
490 /* Entry points compatible with 4.2 BSD regex library. We don't define
491 them unless specifically requested. */
493 #if defined _REGEX_RE_COMP || defined _LIBC
501 return 0 == regexec (&re_comp_buf
, s
, 0, NULL
, 0);
503 #endif /* _REGEX_RE_COMP */
505 static re_node_set empty_set
;
507 /* Internal entry point. */
509 /* Searches for a compiled pattern PREG in the string STRING, whose
510 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
511 mingings with regexec. START, and RANGE have the same meanings
513 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
514 otherwise return the error code.
515 Note: We assume front end functions already check ranges.
516 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
519 re_search_internal (preg
, string
, length
, start
, range
, stop
, nmatch
, pmatch
,
523 int length
, start
, range
, stop
, eflags
;
528 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
530 int left_lim
, right_lim
, incr
;
531 int fl_longest_match
, match_first
, match_last
= -1;
532 re_match_context_t mctx
;
533 char *fastmap
= ((preg
->fastmap
!= NULL
&& preg
->fastmap_accurate
)
534 ? preg
->fastmap
: NULL
);
536 /* Check if the DFA haven't been compiled. */
537 if (BE (preg
->used
== 0 || dfa
->init_state
== NULL
538 || dfa
->init_state_word
== NULL
|| dfa
->init_state_nl
== NULL
539 || dfa
->init_state_begbuf
== NULL
, 0))
542 re_node_set_init_empty (&empty_set
);
544 /* We must check the longest matching, if nmatch > 0. */
545 fl_longest_match
= (nmatch
!= 0);
547 err
= re_string_allocate (&input
, string
, length
, dfa
->nodes_len
+ 1,
548 preg
->translate
, preg
->syntax
& RE_ICASE
);
549 if (BE (err
!= REG_NOERROR
, 0))
553 err
= match_ctx_init (&mctx
, eflags
, &input
, dfa
->nbackref
* 2);
554 if (BE (err
!= REG_NOERROR
, 0))
557 /* We will log all the DFA states through which the dfa pass,
558 if nmatch > 1, or this dfa has "multibyte node", which is a
559 back-reference or a node which can accept multibyte character or
560 multi character collating element. */
561 if (nmatch
> 1 || dfa
->has_mb_node
)
563 mctx
.state_log
= re_malloc (re_dfastate_t
*, dfa
->nodes_len
+ 1);
564 if (BE (mctx
.state_log
== NULL
, 0))
568 mctx
.state_log
= NULL
;
571 /* We assume front-end functions already check them. */
572 assert (start
+ range
>= 0 && start
+ range
<= length
);
576 input
.tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
577 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
579 /* Check incrementally whether of not the input string match. */
580 incr
= (range
< 0) ? -1 : 1;
581 left_lim
= (range
< 0) ? start
+ range
: start
;
582 right_lim
= (range
< 0) ? start
: start
+ range
;
586 /* At first get the current byte from input string. */
588 if (MB_CUR_MAX
> 1 && (preg
->syntax
& RE_ICASE
|| preg
->translate
))
590 /* In this case, we can't determin easily the current byte,
591 since it might be a component byte of a multibyte character.
592 Then we use the constructed buffer instead. */
593 /* If MATCH_FIRST is out of the valid range, reconstruct the
595 if (input
.raw_mbs_idx
+ input
.valid_len
<= match_first
)
596 re_string_reconstruct (&input
, match_first
, eflags
,
597 preg
->newline_anchor
);
598 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
599 Note that MATCH_FIRST must not be smaller than 0. */
600 ch
= ((match_first
>= length
) ? 0
601 : re_string_byte_at (&input
, match_first
- input
.raw_mbs_idx
));
605 /* We apply translate/conversion manually, since it is trivial
607 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
608 Note that MATCH_FIRST must not be smaller than 0. */
609 ch
= (match_first
< length
) ? (unsigned char)string
[match_first
] : 0;
610 /* Apply translation if we need. */
611 ch
= preg
->translate
? preg
->translate
[ch
] : ch
;
612 /* In case of case insensitive mode, convert to upper case. */
613 ch
= ((preg
->syntax
& RE_ICASE
) && islower (ch
)) ? toupper (ch
) : ch
;
616 /* Eliminate inappropriate one by fastmap. */
617 if (preg
->can_be_null
|| fastmap
== NULL
|| fastmap
[ch
])
619 /* Reconstruct the buffers so that the matcher can assume that
620 the matching starts from the begining of the buffer. */
621 re_string_reconstruct (&input
, match_first
, eflags
,
622 preg
->newline_anchor
);
623 #ifdef RE_ENABLE_I18N
624 /* Eliminate it when it is a component of a multibyte character
625 and isn't the head of a multibyte character. */
626 if (MB_CUR_MAX
== 1 || re_string_first_byte (&input
, 0))
629 /* It seems to be appropriate one, then use the matcher. */
630 /* We assume that the matching starts from 0. */
631 mctx
.state_log_top
= mctx
.nbkref_ents
= mctx
.max_bkref_len
= 0;
632 match_last
= check_matching (preg
, &mctx
, 0, fl_longest_match
);
633 if (match_last
!= -1)
635 if (BE (match_last
== -2, 0))
638 break; /* We found a matching. */
642 /* Update counter. */
644 if (match_first
< left_lim
|| right_lim
< match_first
)
648 /* Set pmatch[] if we need. */
649 if (match_last
!= -1 && nmatch
> 0)
653 /* Initialize registers. */
654 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
655 pmatch
[reg_idx
].rm_so
= pmatch
[reg_idx
].rm_eo
= -1;
657 /* Set the points where matching start/end. */
659 mctx
.match_last
= pmatch
[0].rm_eo
= match_last
;
661 if (!preg
->no_sub
&& nmatch
> 1)
663 /* We need the ranges of all the subexpressions. */
665 re_dfastate_t
*pstate
= mctx
.state_log
[match_last
];
667 assert (mctx
.state_log
!= NULL
);
669 halt_node
= check_halt_state_context (preg
, pstate
, &mctx
,
671 err
= sift_states_backward (preg
, &mctx
, halt_node
);
672 if (BE (err
!= REG_NOERROR
, 0))
674 err
= set_regs (preg
, &mctx
, nmatch
, pmatch
, halt_node
);
675 if (BE (err
!= REG_NOERROR
, 0))
679 /* At last, add the offset to the each registers, since we slided
680 the buffers so that We can assume that the matching starts from 0. */
681 for (reg_idx
= 0; reg_idx
< nmatch
; ++reg_idx
)
682 if (pmatch
[reg_idx
].rm_so
!= -1)
684 pmatch
[reg_idx
].rm_so
+= match_first
;
685 pmatch
[reg_idx
].rm_eo
+= match_first
;
689 re_free (mctx
.state_log
);
691 match_ctx_free (&mctx
);
692 re_string_destruct (&input
);
693 return (match_last
== -1) ? REG_NOMATCH
: REG_NOERROR
;
696 /* Acquire an initial state and return it.
697 We must select appropriate initial state depending on the context,
698 since initial states may have constraints like "\<", "^", etc.. */
700 static inline re_dfastate_t
*
701 acquire_init_state_context (err
, preg
, mctx
, idx
)
704 const re_match_context_t
*mctx
;
707 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
710 if (dfa
->init_state
->has_constraint
)
712 unsigned int context
;
713 context
= re_string_context_at (mctx
->input
, idx
- 1, mctx
->eflags
,
714 preg
->newline_anchor
);
715 if (IS_WORD_CONTEXT (context
))
716 return dfa
->init_state_word
;
717 else if (IS_ORDINARY_CONTEXT (context
))
718 return dfa
->init_state
;
719 else if (IS_BEGBUF_CONTEXT (context
) && IS_NEWLINE_CONTEXT (context
))
720 return dfa
->init_state_begbuf
;
721 else if (IS_NEWLINE_CONTEXT (context
))
722 return dfa
->init_state_nl
;
723 else if (IS_BEGBUF_CONTEXT (context
))
725 /* It is relatively rare case, then calculate on demand. */
726 return re_acquire_state_context (err
, dfa
,
727 dfa
->init_state
->entrance_nodes
,
731 /* Must not happen? */
732 return dfa
->init_state
;
735 return dfa
->init_state
;
738 /* Check whether the regular expression match input string INPUT or not,
739 and return the index where the matching end, return -1 if not match,
740 or return -2 in case of an error.
741 FL_SEARCH means we must search where the matching starts,
742 FL_LONGEST_MATCH means we want the POSIX longest matching.
743 Note that the matcher assume that the maching starts from the current
744 index of the buffer. */
747 check_matching (preg
, mctx
, fl_search
, fl_longest_match
)
749 re_match_context_t
*mctx
;
750 int fl_search
, fl_longest_match
;
755 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
756 re_dfastate_t
*cur_state
;
758 cur_state
= acquire_init_state_context (&err
, preg
, mctx
, cur_str_idx
);
759 /* An initial state must not be NULL(invalid state). */
760 if (BE (cur_state
== NULL
, 0))
762 if (mctx
->state_log
!= NULL
)
763 mctx
->state_log
[cur_str_idx
] = cur_state
;
764 /* If the RE accepts NULL string. */
767 if (!cur_state
->has_constraint
768 || check_halt_state_context (preg
, cur_state
, mctx
, cur_str_idx
))
770 if (!fl_longest_match
)
774 match_last
= cur_str_idx
;
780 while (!re_string_eoi (mctx
->input
))
782 cur_state
= transit_state (&err
, preg
, mctx
, cur_state
,
783 fl_search
&& !match
);
784 if (cur_state
== NULL
) /* Reached at the invalid state or an error. */
786 cur_str_idx
= re_string_cur_idx (mctx
->input
);
787 if (BE (err
!= REG_NOERROR
, 0))
789 if (fl_search
&& !match
)
791 /* Restart from initial state, since we are searching
792 the point from where matching start. */
793 #ifdef RE_ENABLE_I18N
795 || re_string_first_byte (mctx
->input
, cur_str_idx
))
796 #endif /* RE_ENABLE_I18N */
797 cur_state
= acquire_init_state_context (&err
, preg
, mctx
,
799 if (BE (cur_state
== NULL
&& err
!= REG_NOERROR
, 0))
801 if (mctx
->state_log
!= NULL
)
802 mctx
->state_log
[cur_str_idx
] = cur_state
;
804 else if (!fl_longest_match
&& match
)
806 else /* (fl_longest_match && match) || (!fl_search && !match) */
808 if (mctx
->state_log
== NULL
)
812 int max
= mctx
->state_log_top
;
813 for (; cur_str_idx
<= max
; ++cur_str_idx
)
814 if (mctx
->state_log
[cur_str_idx
] != NULL
)
816 if (cur_str_idx
> max
)
822 if (cur_state
!= NULL
&& cur_state
->halt
)
824 /* Reached at a halt state.
825 Check the halt state can satisfy the current context. */
826 if (!cur_state
->has_constraint
827 || check_halt_state_context (preg
, cur_state
, mctx
,
828 re_string_cur_idx (mctx
->input
)))
830 /* We found an appropriate halt state. */
831 match_last
= re_string_cur_idx (mctx
->input
);
833 if (!fl_longest_match
)
841 /* Check NODE match the current context. */
843 static int check_halt_node_context (dfa
, node
, context
)
846 unsigned int context
;
849 re_token_type_t type
= dfa
->nodes
[node
].type
;
850 if (type
== END_OF_RE
)
852 if (type
!= OP_CONTEXT_NODE
)
854 entity
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
855 if (dfa
->nodes
[entity
].type
!= END_OF_RE
856 || NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[node
].constraint
, context
))
861 /* Check the halt state STATE match the current context.
862 Return 0 if not match, if the node, STATE has, is a halt node and
863 match the context, return the node. */
866 check_halt_state_context (preg
, state
, mctx
, idx
)
868 const re_dfastate_t
*state
;
869 const re_match_context_t
*mctx
;
872 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
874 unsigned int context
;
876 assert (state
->halt
);
878 context
= re_string_context_at (mctx
->input
, idx
, mctx
->eflags
,
879 preg
->newline_anchor
);
880 for (i
= 0; i
< state
->nodes
.nelem
; ++i
)
881 if (check_halt_node_context (dfa
, state
->nodes
.elems
[i
], context
))
882 return state
->nodes
.elems
[i
];
886 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
887 corresponding to the DFA).
888 Return the destination node, and update EPS_VIA_NODES, return -1 in case
892 proceed_next_node (preg
, mctx
, pidx
, node
, eps_via_nodes
)
894 const re_match_context_t
*mctx
;
896 re_node_set
*eps_via_nodes
;
898 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
899 int i
, err
, dest_node
, cur_entity
;
901 cur_entity
= ((dfa
->nodes
[node
].type
== OP_CONTEXT_NODE
)
902 ? dfa
->nodes
[node
].opr
.ctx_info
->entity
: node
);
903 if (IS_EPSILON_NODE (dfa
->nodes
[node
].type
))
905 int dest_entity
= INT_MAX
;
906 err
= re_node_set_insert (eps_via_nodes
, node
);
909 for (i
= 0; i
< mctx
->state_log
[*pidx
]->nodes
.nelem
; ++i
)
911 int candidate
, candidate_entity
;
912 candidate
= mctx
->state_log
[*pidx
]->nodes
.elems
[i
];
913 candidate_entity
= ((dfa
->nodes
[candidate
].type
== OP_CONTEXT_NODE
)
914 ? dfa
->nodes
[candidate
].opr
.ctx_info
->entity
916 if (!re_node_set_contains (dfa
->edests
+ node
, candidate
))
917 if (candidate
== candidate_entity
918 || !re_node_set_contains (dfa
->edests
+ node
, candidate_entity
))
921 /* In order to avoid infinite loop like "(a*)*". */
922 if (cur_entity
> candidate_entity
923 && re_node_set_contains (eps_via_nodes
, candidate
))
926 if (dest_entity
> candidate_entity
)
928 dest_node
= candidate
;
929 dest_entity
= candidate_entity
;
933 assert (dest_node
!= -1);
939 int naccepted
= 0, entity
= node
;
940 re_token_type_t type
= dfa
->nodes
[node
].type
;
941 if (type
== OP_CONTEXT_NODE
)
943 entity
= dfa
->nodes
[node
].opr
.ctx_info
->entity
;
944 type
= dfa
->nodes
[entity
].type
;
947 #ifdef RE_ENABLE_I18N
948 if (ACCEPT_MB_NODE (type
))
949 naccepted
= check_node_accept_bytes (preg
, entity
, mctx
->input
, *pidx
);
951 #endif /* RE_ENABLE_I18N */
952 if (type
== OP_BACK_REF
)
954 for (i
= 0; i
< mctx
->nbkref_ents
; ++i
)
956 if (mctx
->bkref_ents
[i
].node
== node
957 && mctx
->bkref_ents
[i
].from
== *pidx
)
958 naccepted
= mctx
->bkref_ents
[i
].to
- *pidx
;
962 err
= re_node_set_insert (eps_via_nodes
, node
);
965 dest_node
= dfa
->nexts
[node
];
966 if (re_node_set_contains (&mctx
->state_log
[*pidx
]->nodes
,
969 for (i
= 0; i
< mctx
->state_log
[*pidx
]->nodes
.nelem
; ++i
)
971 dest_node
= mctx
->state_log
[*pidx
]->nodes
.elems
[i
];
972 if ((dfa
->nodes
[dest_node
].type
== OP_CONTEXT_NODE
974 == dfa
->nodes
[dest_node
].opr
.ctx_info
->entity
)))
981 || check_node_accept (preg
, dfa
->nodes
+ node
, mctx
, *pidx
))
983 dest_node
= dfa
->nexts
[node
];
984 *pidx
= (naccepted
== 0) ? *pidx
+ 1 : *pidx
+ naccepted
;
986 assert (mctx
->state_log
[*pidx
] != NULL
);
988 re_node_set_empty (eps_via_nodes
);
992 /* Must not reach here. */
999 /* Set the positions where the subexpressions are starts/ends to registers
1001 Note: We assume that pmatch[0] is already set, and
1002 pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1004 static reg_errcode_t
1005 set_regs (preg
, mctx
, nmatch
, pmatch
, last_node
)
1006 const regex_t
*preg
;
1007 const re_match_context_t
*mctx
;
1012 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1013 int idx
, cur_node
, real_nmatch
;
1014 re_node_set eps_via_nodes
;
1016 assert (nmatch
> 1);
1017 assert (mctx
->state_log
!= NULL
);
1019 cur_node
= dfa
->init_node
;
1020 real_nmatch
= (nmatch
<= preg
->re_nsub
) ? nmatch
: preg
->re_nsub
+ 1;
1021 re_node_set_init_empty (&eps_via_nodes
);
1022 for (idx
= pmatch
[0].rm_so
; idx
<= pmatch
[0].rm_eo
;)
1024 update_regs (dfa
, pmatch
, cur_node
, idx
, real_nmatch
);
1025 if (idx
== pmatch
[0].rm_eo
&& cur_node
== last_node
)
1028 /* Proceed to next node. */
1029 cur_node
= proceed_next_node (preg
, mctx
, &idx
, cur_node
, &eps_via_nodes
);
1030 if (BE (cur_node
< 0, 0))
1033 re_node_set_free (&eps_via_nodes
);
1038 update_regs (dfa
, pmatch
, cur_node
, cur_idx
, nmatch
)
1041 int cur_node
, cur_idx
, nmatch
;
1043 int type
= dfa
->nodes
[cur_node
].type
;
1045 if (type
!= OP_OPEN_SUBEXP
&& type
!= OP_CLOSE_SUBEXP
)
1047 reg_num
= dfa
->nodes
[cur_node
].opr
.idx
+ 1;
1048 if (reg_num
>= nmatch
)
1050 if (type
== OP_OPEN_SUBEXP
)
1052 /* We are at the first node of this sub expression. */
1053 pmatch
[reg_num
].rm_so
= cur_idx
;
1054 pmatch
[reg_num
].rm_eo
= -1;
1056 else if (type
== OP_CLOSE_SUBEXP
)
1057 /* We are at the first node of this sub expression. */
1058 pmatch
[reg_num
].rm_eo
= cur_idx
;
1061 #define NUMBER_OF_STATE 1
1063 /* This function checks the STATE_LOG from the MCTX->match_last to 0
1064 and sift the nodes in each states according to the following rules.
1065 Updated state_log will be wrote to STATE_LOG.
1067 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1068 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1069 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1070 the LAST_NODE, we throw away the node `a'.
1071 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1072 string `s' and transit to `b':
1073 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1075 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1076 throwed away, we throw away the node `a'.
1077 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1078 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1080 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1081 we throw away the node `a'. */
1083 #define STATE_NODE_CONTAINS(state,node) \
1084 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1086 static reg_errcode_t
1087 sift_states_backward (preg
, mctx
, last_node
)
1088 const regex_t
*preg
;
1089 const re_match_context_t
*mctx
;
1093 re_dfa_t
*dfa
= (re_dfa_t
*)preg
->buffer
;
1094 re_node_set state_buf
;
1095 int str_idx
= mctx
->match_last
;
1096 re_node_set
*plog
; /* Points the state_log[str_idx]->nodes */
1099 assert (mctx
->state_log
!= NULL
&& mctx
->state_log
[str_idx
] != NULL
);
1101 err
= re_node_set_alloc (&state_buf
, NUMBER_OF_STATE
);
1102 if (BE (err
!= REG_NOERROR
, 0))
1104 plog
= &mctx
->state_log
[str_idx
]->nodes
;
1106 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1107 transit to the last_node and the last_node itself. */
1108 err
= re_node_set_intersect (&state_buf
, plog
, dfa
->inveclosures
+ last_node
);
1109 if (BE (err
!= REG_NOERROR
, 0))
1112 if (mctx
->state_log
[str_idx
] != NULL
1113 && mctx
->state_log
[str_idx
]->has_backref
)
1115 err
= add_epsilon_backreference (dfa
, mctx
, plog
, str_idx
, &state_buf
);
1116 if (BE (err
!= REG_NOERROR
, 0))
1120 /* Update state log. */
1121 mctx
->state_log
[str_idx
] = re_acquire_state (&err
, dfa
, &state_buf
);
1122 if (BE (mctx
->state_log
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1125 /* Then check each states in the state_log. */
1129 /* Update counters. */
1130 re_node_set_empty (&state_buf
);
1132 plog
= ((mctx
->state_log
[str_idx
] == NULL
) ? &empty_set
1133 : &mctx
->state_log
[str_idx
]->nodes
);
1135 /* Then build the next sifted state.
1136 We build the next sifted state on `state_buf', and update
1137 `state_log[str_idx]' with `state_buf'.
1139 `state_buf' is the sifted state from `state_log[str_idx + 1]'.
1140 `plog' points the node_set of the old `state_log[str_idx]'. */
1141 for (i
= 0; i
< plog
->nelem
; i
++)
1143 int prev_node
= plog
->elems
[i
];
1144 int entity
= prev_node
;
1146 re_token_type_t type
= dfa
->nodes
[prev_node
].type
;
1147 if (type
== OP_CONTEXT_NODE
)
1149 entity
= dfa
->nodes
[prev_node
].opr
.ctx_info
->entity
;
1150 type
= dfa
->nodes
[entity
].type
;
1153 #ifdef RE_ENABLE_I18N
1154 /* If the node may accept `multi byte'. */
1155 if (ACCEPT_MB_NODE (type
))
1156 naccepted
= sift_states_iter_mb (preg
, mctx
, entity
, str_idx
,
1159 /* If the node is a back reference. */
1161 #endif /* RE_ENABLE_I18N */
1162 if (type
== OP_BACK_REF
)
1163 for (j
= 0; j
< mctx
->nbkref_ents
; ++j
)
1165 naccepted
= sift_states_iter_bkref (dfa
, mctx
->state_log
,
1166 mctx
->bkref_ents
+ j
,
1174 && check_node_accept (preg
, dfa
->nodes
+ prev_node
, mctx
,
1176 && STATE_NODE_CONTAINS (mctx
->state_log
[str_idx
+ 1],
1177 dfa
->nexts
[prev_node
]))
1183 /* `prev_node' may point the entity of the OP_CONTEXT_NODE,
1184 then we use plog->elems[i] instead. */
1185 err
= re_node_set_add_intersect (&state_buf
, plog
,
1186 dfa
->inveclosures
+ prev_node
);
1187 if (BE (err
!= REG_NOERROR
, 0))
1190 if (mctx
->state_log
[str_idx
] != NULL
1191 && mctx
->state_log
[str_idx
]->has_backref
)
1193 err
= add_epsilon_backreference (dfa
, mctx
, plog
, str_idx
, &state_buf
);
1194 if (BE (err
!= REG_NOERROR
, 0))
1198 /* Update state_log. */
1199 mctx
->state_log
[str_idx
] = re_acquire_state (&err
, dfa
, &state_buf
);
1200 if (BE (mctx
->state_log
[str_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1204 re_node_set_free (&state_buf
);
1208 /* Helper functions. */
1210 static inline reg_errcode_t
1211 clean_state_log_if_need (mctx
, next_state_log_idx
)
1212 re_match_context_t
*mctx
;
1213 int next_state_log_idx
;
1215 int top
= mctx
->state_log_top
;
1217 if (next_state_log_idx
>= mctx
->input
->bufs_len
1218 || (next_state_log_idx
>= mctx
->input
->valid_len
1219 && mctx
->input
->valid_len
< mctx
->input
->len
))
1222 err
= extend_buffers (mctx
);
1223 if (BE (err
!= REG_NOERROR
, 0))
1227 if (top
< next_state_log_idx
)
1229 memset (mctx
->state_log
+ top
+ 1, '\0',
1230 sizeof (re_dfastate_t
*) * (next_state_log_idx
- top
));
1231 mctx
->state_log_top
= next_state_log_idx
;
1236 #ifdef RE_ENABLE_I18N
1238 sift_states_iter_mb (preg
, mctx
, node_idx
, str_idx
, max_str_idx
)
1239 const regex_t
*preg
;
1240 const re_match_context_t
*mctx
;
1241 int node_idx
, str_idx
, max_str_idx
;
1243 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1245 /* Check the node can accept `multi byte'. */
1246 naccepted
= check_node_accept_bytes (preg
, node_idx
, mctx
->input
, str_idx
);
1247 if (naccepted
> 0 && str_idx
+ naccepted
<= max_str_idx
&&
1248 !STATE_NODE_CONTAINS (mctx
->state_log
[str_idx
+ naccepted
],
1249 dfa
->nexts
[node_idx
]))
1250 /* The node can't accept the `multi byte', or the
1251 destination was already throwed away, then the node
1252 could't accept the current input `multi byte'. */
1254 /* Otherwise, it is sure that the node could accept
1255 `naccepted' bytes input. */
1258 #endif /* RE_ENABLE_I18N */
1261 sift_states_iter_bkref (dfa
, state_log
, mctx_entry
, node_idx
, idx
, match_last
)
1262 const re_dfa_t
*dfa
;
1263 re_dfastate_t
**state_log
;
1264 struct re_backref_cache_entry
*mctx_entry
;
1265 int node_idx
, idx
, match_last
;
1268 int from_idx
, to_idx
;
1269 from_idx
= mctx_entry
->from
;
1270 to_idx
= mctx_entry
->to
;
1271 if (mctx_entry
->node
== node_idx
1272 && from_idx
== idx
&& to_idx
<= match_last
1273 && STATE_NODE_CONTAINS (state_log
[to_idx
], dfa
->nexts
[node_idx
]))
1274 naccepted
= to_idx
- from_idx
;
1278 static reg_errcode_t
1279 add_epsilon_backreference (dfa
, mctx
, plog
, idx
, state_buf
)
1280 const re_dfa_t
*dfa
;
1281 const re_match_context_t
*mctx
;
1282 const re_node_set
*plog
;
1284 re_node_set
*state_buf
;
1287 for (i
= 0; i
< plog
->nelem
; ++i
)
1289 int node_idx
= plog
->elems
[i
];
1290 re_token_type_t type
= dfa
->nodes
[node_idx
].type
;
1291 if (type
== OP_CONTEXT_NODE
)
1292 type
= dfa
->nodes
[dfa
->nodes
[node_idx
].opr
.ctx_info
->entity
].type
;
1294 if (type
== OP_BACK_REF
&&
1295 !re_node_set_contains (state_buf
, node_idx
))
1297 for (j
= 0; j
< mctx
->nbkref_ents
; ++j
)
1299 struct re_backref_cache_entry
*entry
;
1300 entry
= mctx
->bkref_ents
+ j
;
1301 if (entry
->from
== entry
->to
&& entry
->from
== idx
)
1304 if (j
< mctx
->nbkref_ents
|| idx
== 0)
1307 err
= re_node_set_add_intersect (state_buf
, plog
,
1308 dfa
->inveclosures
+ node_idx
);
1309 if (BE (err
!= REG_NOERROR
, 0))
1318 /* Functions for state transition. */
1320 /* Return the next state to which the current state STATE will transit by
1321 accepting the current input byte, and update STATE_LOG if necessary.
1322 If STATE can accept a multibyte char/collating element/back reference
1323 update the destination of STATE_LOG. */
1325 static re_dfastate_t
*
1326 transit_state (err
, preg
, mctx
, state
, fl_search
)
1328 const regex_t
*preg
;
1329 re_match_context_t
*mctx
;
1330 re_dfastate_t
*state
;
1333 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1334 re_dfastate_t
**trtable
, *next_state
;
1337 if (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->bufs_len
1338 || (re_string_cur_idx (mctx
->input
) + 1 >= mctx
->input
->valid_len
1339 && mctx
->input
->valid_len
< mctx
->input
->len
))
1341 *err
= extend_buffers (mctx
);
1342 if (BE (*err
!= REG_NOERROR
, 0))
1350 re_string_skip_bytes (mctx
->input
, 1);
1354 #ifdef RE_ENABLE_I18N
1355 /* If the current state can accept multibyte. */
1356 if (state
->accept_mb
)
1358 *err
= transit_state_mb (preg
, state
, mctx
);
1359 if (BE (*err
!= REG_NOERROR
, 0))
1362 #endif /* RE_ENABLE_I18N */
1364 /* Then decide the next state with the single byte. */
1367 /* Use transition table */
1368 ch
= re_string_fetch_byte (mctx
->input
);
1369 trtable
= fl_search
? state
->trtable_search
: state
->trtable
;
1370 if (trtable
== NULL
)
1372 trtable
= build_trtable (preg
, state
, fl_search
);
1374 state
->trtable_search
= trtable
;
1376 state
->trtable
= trtable
;
1378 next_state
= trtable
[ch
];
1382 /* don't use transition table */
1383 next_state
= transit_state_sb (err
, preg
, state
, fl_search
, mctx
);
1384 if (BE (next_state
== NULL
&& err
!= REG_NOERROR
, 0))
1389 /* Update the state_log if we need. */
1390 if (mctx
->state_log
!= NULL
)
1392 int cur_idx
= re_string_cur_idx (mctx
->input
);
1393 if (cur_idx
> mctx
->state_log_top
)
1395 mctx
->state_log
[cur_idx
] = next_state
;
1396 mctx
->state_log_top
= cur_idx
;
1398 else if (mctx
->state_log
[cur_idx
] == 0)
1400 mctx
->state_log
[cur_idx
] = next_state
;
1404 re_dfastate_t
*pstate
;
1405 unsigned int context
;
1406 re_node_set next_nodes
, *log_nodes
, *table_nodes
= NULL
;
1407 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
1408 the destination of a multibyte char/collating element/
1409 back reference. Then the next state is the union set of
1410 these destinations and the results of the transition table. */
1411 pstate
= mctx
->state_log
[cur_idx
];
1412 log_nodes
= pstate
->entrance_nodes
;
1413 if (next_state
!= NULL
)
1415 table_nodes
= next_state
->entrance_nodes
;
1416 *err
= re_node_set_init_union (&next_nodes
, table_nodes
,
1418 if (BE (*err
!= REG_NOERROR
, 0))
1422 next_nodes
= *log_nodes
;
1423 /* Note: We already add the nodes of the initial state,
1424 then we don't need to add them here. */
1426 context
= re_string_context_at (mctx
->input
,
1427 re_string_cur_idx (mctx
->input
) - 1,
1428 mctx
->eflags
, preg
->newline_anchor
);
1429 next_state
= mctx
->state_log
[cur_idx
]
1430 = re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
1431 /* We don't need to check errors here, since the return value of
1432 this function is next_state and ERR is already set. */
1434 if (table_nodes
!= NULL
)
1435 re_node_set_free (&next_nodes
);
1437 /* If the next state has back references. */
1438 if (next_state
!= NULL
&& next_state
->has_backref
)
1440 *err
= transit_state_bkref (preg
, next_state
, mctx
);
1441 if (BE (*err
!= REG_NOERROR
, 0))
1443 next_state
= mctx
->state_log
[cur_idx
];
1449 /* Helper functions for transit_state. */
1451 /* Return the next state to which the current state STATE will transit by
1452 accepting the current input byte. */
1454 static re_dfastate_t
*
1455 transit_state_sb (err
, preg
, state
, fl_search
, mctx
)
1457 const regex_t
*preg
;
1458 re_dfastate_t
*state
;
1460 re_match_context_t
*mctx
;
1462 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1463 re_node_set next_nodes
;
1464 re_dfastate_t
*next_state
;
1465 int node_cnt
, cur_str_idx
= re_string_cur_idx (mctx
->input
);
1466 unsigned int context
;
1468 *err
= re_node_set_alloc (&next_nodes
, state
->nodes
.nelem
+ 1);
1469 if (BE (*err
!= REG_NOERROR
, 0))
1471 for (node_cnt
= 0; node_cnt
< state
->nodes
.nelem
; ++node_cnt
)
1473 int cur_node
= state
->nodes
.elems
[node_cnt
];
1474 if (check_node_accept (preg
, dfa
->nodes
+ cur_node
, mctx
, cur_str_idx
))
1476 *err
= re_node_set_merge (&next_nodes
,
1477 dfa
->eclosures
+ dfa
->nexts
[cur_node
]);
1478 if (BE (*err
!= REG_NOERROR
, 0))
1484 #ifdef RE_ENABLE_I18N
1485 int not_initial
= 0;
1487 for (node_cnt
= 0; node_cnt
< next_nodes
.nelem
; ++node_cnt
)
1488 if (dfa
->nodes
[next_nodes
.elems
[node_cnt
]].type
== CHARACTER
)
1490 not_initial
= dfa
->nodes
[next_nodes
.elems
[node_cnt
]].mb_partial
;
1496 *err
= re_node_set_merge (&next_nodes
,
1497 dfa
->init_state
->entrance_nodes
);
1498 if (BE (*err
!= REG_NOERROR
, 0))
1502 context
= re_string_context_at (mctx
->input
, cur_str_idx
, mctx
->eflags
,
1503 preg
->newline_anchor
);
1504 next_state
= re_acquire_state_context (err
, dfa
, &next_nodes
, context
);
1505 /* We don't need to check errors here, since the return value of
1506 this function is next_state and ERR is already set. */
1508 re_node_set_free (&next_nodes
);
1509 re_string_skip_bytes (mctx
->input
, 1);
1513 #ifdef RE_ENABLE_I18N
1514 static reg_errcode_t
1515 transit_state_mb (preg
, pstate
, mctx
)
1516 const regex_t
*preg
;
1517 re_dfastate_t
*pstate
;
1518 re_match_context_t
*mctx
;
1521 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1524 for (i
= 0; i
< pstate
->nodes
.nelem
; ++i
)
1526 re_node_set dest_nodes
, *new_nodes
;
1527 int cur_node_idx
= pstate
->nodes
.elems
[i
];
1528 int naccepted
= 0, dest_idx
;
1529 unsigned int context
;
1530 re_dfastate_t
*dest_state
;
1532 if (dfa
->nodes
[cur_node_idx
].type
== OP_CONTEXT_NODE
)
1534 context
= re_string_context_at (mctx
->input
,
1535 re_string_cur_idx (mctx
->input
),
1536 mctx
->eflags
, preg
->newline_anchor
);
1537 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa
->nodes
[cur_node_idx
].constraint
,
1540 cur_node_idx
= dfa
->nodes
[cur_node_idx
].opr
.ctx_info
->entity
;
1543 /* How many bytes the node can accepts? */
1544 if (ACCEPT_MB_NODE (dfa
->nodes
[cur_node_idx
].type
))
1545 naccepted
= check_node_accept_bytes (preg
, cur_node_idx
, mctx
->input
,
1546 re_string_cur_idx (mctx
->input
));
1550 /* The node can accepts `naccepted' bytes. */
1551 dest_idx
= re_string_cur_idx (mctx
->input
) + naccepted
;
1552 err
= clean_state_log_if_need (mctx
, dest_idx
);
1553 if (BE (err
!= REG_NOERROR
, 0))
1556 assert (dfa
->nexts
[cur_node_idx
] != -1);
1558 /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
1559 then we use pstate->nodes.elems[i] instead. */
1560 new_nodes
= dfa
->eclosures
+ dfa
->nexts
[pstate
->nodes
.elems
[i
]];
1562 dest_state
= mctx
->state_log
[dest_idx
];
1563 if (dest_state
== NULL
)
1564 dest_nodes
= *new_nodes
;
1567 err
= re_node_set_init_union (&dest_nodes
,
1568 dest_state
->entrance_nodes
, new_nodes
);
1569 if (BE (err
!= REG_NOERROR
, 0))
1572 context
= re_string_context_at (mctx
->input
, dest_idx
- 1, mctx
->eflags
,
1573 preg
->newline_anchor
);
1574 mctx
->state_log
[dest_idx
]
1575 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
1576 if (BE (mctx
->state_log
[dest_idx
] == NULL
&& err
!= REG_NOERROR
, 0))
1578 if (dest_state
!= NULL
)
1579 re_node_set_free (&dest_nodes
);
1583 #endif /* RE_ENABLE_I18N */
1585 static reg_errcode_t
1586 transit_state_bkref (preg
, pstate
, mctx
)
1587 const regex_t
*preg
;
1588 re_dfastate_t
*pstate
;
1589 re_match_context_t
*mctx
;
1592 re_dfastate_t
**work_state_log
;
1594 work_state_log
= re_malloc (re_dfastate_t
*,
1595 re_string_cur_idx (mctx
->input
) + 1);
1596 if (BE (work_state_log
== NULL
, 0))
1599 err
= transit_state_bkref_loop (preg
, &pstate
->nodes
, work_state_log
, mctx
);
1600 re_free (work_state_log
);
1604 /* Caller must allocate `work_state_log'. */
1606 static reg_errcode_t
1607 transit_state_bkref_loop (preg
, nodes
, work_state_log
, mctx
)
1608 const regex_t
*preg
;
1610 re_dfastate_t
**work_state_log
;
1611 re_match_context_t
*mctx
;
1614 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1616 re_dfastate_t
**state_log_bak
;
1617 regmatch_t
*cur_regs
= re_malloc (regmatch_t
, preg
->re_nsub
+ 1);
1618 int cur_str_idx
= re_string_cur_idx (mctx
->input
);
1619 if (BE (cur_regs
== NULL
, 0))
1622 for (i
= 0; i
< nodes
->nelem
; ++i
)
1625 int dest_str_idx
, subexp_idx
, prev_nelem
, subexp_len
;
1626 int node_idx
= nodes
->elems
[i
];
1627 unsigned int context
;
1628 re_token_t
*node
= dfa
->nodes
+ node_idx
;
1629 re_dfastate_t
*dest_state
;
1630 re_node_set
*new_dest_nodes
;
1632 /* Check whether `node' is a backreference or not. */
1633 if (node
->type
== OP_BACK_REF
)
1634 subexp_idx
= node
->opr
.idx
;
1635 else if (node
->type
== OP_CONTEXT_NODE
&&
1636 dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
== OP_BACK_REF
)
1638 context
= re_string_context_at (mctx
->input
, cur_str_idx
,
1639 mctx
->eflags
, preg
->newline_anchor
);
1640 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
1642 subexp_idx
= dfa
->nodes
[node
->opr
.ctx_info
->entity
].opr
.idx
;
1647 /* `node' is a backreference.
1648 At first, set registers to check the backreference. */
1649 cur_regs
[0].rm_so
= 0;
1650 cur_regs
[0].rm_eo
= cur_str_idx
;
1651 memcpy (work_state_log
, mctx
->state_log
,
1652 sizeof (re_dfastate_t
*) * (cur_str_idx
+ 1));
1653 mctx
->match_last
= cur_str_idx
;
1654 state_log_bak
= mctx
->state_log
;
1655 mctx
->state_log
= work_state_log
;
1656 sift_states_backward (preg
, mctx
, node_idx
);
1657 if (!STATE_NODE_CONTAINS (work_state_log
[0], dfa
->init_node
))
1659 for (j
= 1; j
<= preg
->re_nsub
; ++j
)
1660 cur_regs
[j
].rm_so
= cur_regs
[j
].rm_eo
= -1;
1661 set_regs (preg
, mctx
, subexp_idx
+ 1, cur_regs
, node_idx
);
1662 mctx
->state_log
= state_log_bak
;
1664 /* Then check that the backreference can match the input string. */
1665 subexp_len
= cur_regs
[subexp_idx
].rm_eo
- cur_regs
[subexp_idx
].rm_so
;
1666 if (subexp_len
< 0 || cur_str_idx
+ subexp_len
> mctx
->input
->len
)
1669 if (cur_str_idx
+ subexp_len
> mctx
->input
->valid_len
1670 && mctx
->input
->valid_len
< mctx
->input
->len
)
1673 err
= extend_buffers (mctx
);
1674 if (BE (err
!= REG_NOERROR
, 0))
1677 buf
= re_string_get_buffer (mctx
->input
);
1678 if (strncmp (buf
+ cur_regs
[subexp_idx
].rm_so
, buf
+ cur_str_idx
,
1682 /* Successfully matched, add a new cache entry. */
1683 dest_str_idx
= cur_str_idx
+ subexp_len
;
1684 err
= match_ctx_add_entry (mctx
, node_idx
, cur_str_idx
, dest_str_idx
);
1685 if (BE (err
!= REG_NOERROR
, 0))
1687 err
= clean_state_log_if_need (mctx
, dest_str_idx
);
1688 if (BE (err
!= REG_NOERROR
, 0))
1691 /* And add the epsilon closures (which is `new_dest_nodes') of
1692 the backreference to appropriate state_log. */
1694 assert (dfa
->nexts
[node_idx
] != -1);
1696 if (node
->type
== OP_CONTEXT_NODE
&& subexp_len
== 0)
1697 new_dest_nodes
= dfa
->nodes
[node_idx
].opr
.ctx_info
->bkref_eclosure
;
1699 new_dest_nodes
= dfa
->eclosures
+ dfa
->nexts
[node_idx
];
1700 context
= (IS_WORD_CHAR (re_string_byte_at (mctx
->input
,
1702 ? CONTEXT_WORD
: 0);
1703 dest_state
= mctx
->state_log
[dest_str_idx
];
1705 prev_nelem
= ((mctx
->state_log
[cur_str_idx
] == NULL
) ? 0
1706 : mctx
->state_log
[cur_str_idx
]->nodes
.nelem
);
1707 /* Add `new_dest_node' to state_log. */
1708 if (dest_state
== NULL
)
1710 mctx
->state_log
[dest_str_idx
]
1711 = re_acquire_state_context (&err
, dfa
, new_dest_nodes
, context
);
1712 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
1713 && err
!= REG_NOERROR
, 0))
1718 re_node_set dest_nodes
;
1719 err
= re_node_set_init_union (&dest_nodes
, dest_state
->entrance_nodes
,
1721 if (BE (err
!= REG_NOERROR
, 0))
1723 mctx
->state_log
[dest_str_idx
]
1724 = re_acquire_state_context (&err
, dfa
, &dest_nodes
, context
);
1725 if (BE (mctx
->state_log
[dest_str_idx
] == NULL
1726 && err
!= REG_NOERROR
, 0))
1728 re_node_set_free (&dest_nodes
);
1731 /* We need to check recursively if the backreference can epsilon
1734 && mctx
->state_log
[cur_str_idx
]->nodes
.nelem
> prev_nelem
)
1736 err
= transit_state_bkref_loop (preg
, new_dest_nodes
, work_state_log
,
1738 if (BE (err
!= REG_NOERROR
, 0))
1746 /* Build transition table for the state.
1747 Return the new table if succeeded, otherwise return NULL. */
1749 static re_dfastate_t
**
1750 build_trtable (preg
, state
, fl_search
)
1751 const regex_t
*preg
;
1752 const re_dfastate_t
*state
;
1756 re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1758 int ndests
; /* Number of the destination states from `state'. */
1759 re_dfastate_t
**trtable
, **dest_states
, **dest_states_word
, **dest_states_nl
;
1760 re_node_set follows
, *dests_node
;
1764 /* We build DFA states which corresponds to the destination nodes
1765 from `state'. `dests_node[i]' represents the nodes which i-th
1766 destination state contains, and `dests_ch[i]' represents the
1767 characters which i-th destination state accepts. */
1768 dests_node
= re_malloc (re_node_set
, SBC_MAX
);
1769 dests_ch
= re_malloc (bitset
, SBC_MAX
);
1771 /* Initialize transiton table. */
1772 trtable
= (re_dfastate_t
**) calloc (sizeof (re_dfastate_t
*), SBC_MAX
);
1773 if (BE (dests_node
== NULL
|| dests_ch
== NULL
|| trtable
== NULL
, 0))
1776 /* At first, group all nodes belonging to `state' into several
1778 ndests
= group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
);
1779 if (BE (ndests
<= 0, 0))
1781 re_free (dests_node
);
1783 /* Return NULL in case of an error, trtable otherwise. */
1784 return (ndests
< 0) ? NULL
: trtable
;
1787 dest_states
= re_malloc (re_dfastate_t
*, ndests
);
1788 dest_states_word
= re_malloc (re_dfastate_t
*, ndests
);
1789 dest_states_nl
= re_malloc (re_dfastate_t
*, ndests
);
1790 bitset_empty (acceptable
);
1792 err
= re_node_set_alloc (&follows
, ndests
+ 1);
1793 if (BE (dest_states
== NULL
|| dest_states_word
== NULL
1794 || dest_states_nl
== NULL
|| err
!= REG_NOERROR
, 0))
1797 /* Then build the states for all destinations. */
1798 for (i
= 0; i
< ndests
; ++i
)
1801 re_node_set_empty (&follows
);
1802 /* Merge the follows of this destination states. */
1803 for (j
= 0; j
< dests_node
[i
].nelem
; ++j
)
1805 next_node
= dfa
->nexts
[dests_node
[i
].elems
[j
]];
1806 if (next_node
!= -1)
1808 err
= re_node_set_merge (&follows
, dfa
->eclosures
+ next_node
);
1809 if (BE (err
!= REG_NOERROR
, 0))
1813 /* If search flag is set, merge the initial state. */
1816 #ifdef RE_ENABLE_I18N
1817 int not_initial
= 0;
1818 for (j
= 0; j
< follows
.nelem
; ++j
)
1819 if (dfa
->nodes
[follows
.elems
[j
]].type
== CHARACTER
)
1821 not_initial
= dfa
->nodes
[follows
.elems
[j
]].mb_partial
;
1827 err
= re_node_set_merge (&follows
,
1828 dfa
->init_state
->entrance_nodes
);
1829 if (BE (err
!= REG_NOERROR
, 0))
1833 dest_states
[i
] = re_acquire_state_context (&err
, dfa
, &follows
, 0);
1834 if (BE (dest_states
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
1836 /* If the new state has context constraint,
1837 build appropriate states for these contexts. */
1838 if (dest_states
[i
]->has_constraint
)
1840 dest_states_word
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
1842 if (BE (dest_states_word
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
1844 dest_states_nl
[i
] = re_acquire_state_context (&err
, dfa
, &follows
,
1846 if (BE (dest_states_nl
[i
] == NULL
&& err
!= REG_NOERROR
, 0))
1851 dest_states_word
[i
] = dest_states
[i
];
1852 dest_states_nl
[i
] = dest_states
[i
];
1854 bitset_merge (acceptable
, dests_ch
[i
]);
1857 /* Update the transition table. */
1858 for (i
= 0, ch
= 0; i
< BITSET_UINTS
; ++i
)
1859 for (j
= 0; j
< UINT_BITS
; ++j
, ++ch
)
1860 if ((acceptable
[i
] >> j
) & 1)
1862 if (IS_WORD_CHAR (ch
))
1864 for (k
= 0; k
< ndests
; ++k
)
1865 if ((dests_ch
[k
][i
] >> j
) & 1)
1866 trtable
[ch
] = dest_states_word
[k
];
1868 else /* not WORD_CHAR */
1870 for (k
= 0; k
< ndests
; ++k
)
1871 if ((dests_ch
[k
][i
] >> j
) & 1)
1872 trtable
[ch
] = dest_states
[k
];
1876 for (k
= 0; k
< ndests
; ++k
)
1877 if (bitset_contain (acceptable
, NEWLINE_CHAR
))
1878 trtable
[NEWLINE_CHAR
] = dest_states_nl
[k
];
1880 re_free (dest_states_nl
);
1881 re_free (dest_states_word
);
1882 re_free (dest_states
);
1884 re_node_set_free (&follows
);
1885 for (i
= 0; i
< ndests
; ++i
)
1886 re_node_set_free (dests_node
+ i
);
1889 re_free (dests_node
);
1894 /* Group all nodes belonging to STATE into several destinations.
1895 Then for all destinations, set the nodes belonging to the destination
1896 to DESTS_NODE[i] and set the characters accepted by the destination
1897 to DEST_CH[i]. This function return the number of destinations. */
1900 group_nodes_into_DFAstates (preg
, state
, dests_node
, dests_ch
)
1901 const regex_t
*preg
;
1902 const re_dfastate_t
*state
;
1903 re_node_set
*dests_node
;
1907 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
1909 int ndests
; /* Number of the destinations from `state'. */
1910 bitset accepts
; /* Characters a node can accept. */
1911 const re_node_set
*cur_nodes
= &state
->nodes
;
1912 bitset_empty (accepts
);
1915 /* For all the nodes belonging to `state', */
1916 for (i
= 0; i
< cur_nodes
->nelem
; ++i
)
1918 unsigned int constraint
= 0;
1919 re_token_t
*node
= &dfa
->nodes
[cur_nodes
->elems
[i
]];
1920 re_token_type_t type
= node
->type
;
1922 if (type
== OP_CONTEXT_NODE
)
1924 constraint
= node
->constraint
;
1925 node
= dfa
->nodes
+ node
->opr
.ctx_info
->entity
;
1929 /* Enumerate all single byte character this node can accept. */
1930 if (type
== CHARACTER
)
1931 bitset_set (accepts
, node
->opr
.c
);
1932 else if (type
== SIMPLE_BRACKET
)
1934 bitset_merge (accepts
, node
->opr
.sbcset
);
1936 else if (type
== OP_PERIOD
)
1938 bitset_set_all (accepts
);
1939 if (!(preg
->syntax
& RE_DOT_NEWLINE
))
1940 bitset_clear (accepts
, '\n');
1941 if (preg
->syntax
& RE_DOT_NOT_NULL
)
1942 bitset_clear (accepts
, '\0');
1947 /* Check the `accepts' and sift the characters which are not
1948 match it the context. */
1951 if (constraint
& NEXT_WORD_CONSTRAINT
)
1952 for (j
= 0; j
< BITSET_UINTS
; ++j
)
1953 accepts
[j
] &= dfa
->word_char
[j
];
1954 else if (constraint
& NEXT_NOTWORD_CONSTRAINT
)
1955 for (j
= 0; j
< BITSET_UINTS
; ++j
)
1956 accepts
[j
] &= ~dfa
->word_char
[j
];
1957 else if (constraint
& NEXT_NEWLINE_CONSTRAINT
)
1959 int accepts_newline
= bitset_contain (accepts
, NEWLINE_CHAR
);
1960 bitset_empty (accepts
);
1961 if (accepts_newline
)
1962 bitset_set (accepts
, NEWLINE_CHAR
);
1968 /* Then divide `accepts' into DFA states, or create a new
1970 for (j
= 0; j
< ndests
; ++j
)
1972 bitset intersec
; /* Intersection sets, see below. */
1974 /* Flags, see below. */
1975 int has_intersec
, not_subset
, not_consumed
;
1977 /* Optimization, skip if this state doesn't accept the character. */
1978 if (type
== CHARACTER
&& !bitset_contain (dests_ch
[j
], node
->opr
.c
))
1981 /* Enumerate the intersection set of this state and `accepts'. */
1983 for (k
= 0; k
< BITSET_UINTS
; ++k
)
1984 has_intersec
|= intersec
[k
] = accepts
[k
] & dests_ch
[j
][k
];
1985 /* And skip if the intersection set is empty. */
1989 /* Then check if this state is a subset of `accepts'. */
1990 not_subset
= not_consumed
= 0;
1991 for (k
= 0; k
< BITSET_UINTS
; ++k
)
1993 not_subset
|= remains
[k
] = ~accepts
[k
] & dests_ch
[j
][k
];
1994 not_consumed
|= accepts
[k
] = accepts
[k
] & ~dests_ch
[j
][k
];
1997 /* If this state isn't a subset of `accepts', create a
1998 new group state, which has the `remains'. */
2001 bitset_copy (dests_ch
[ndests
], remains
);
2002 bitset_copy (dests_ch
[j
], intersec
);
2003 err
= re_node_set_init_copy (dests_node
+ ndests
, &dests_node
[j
]);
2004 if (BE (err
!= REG_NOERROR
, 0))
2009 /* Put the position in the current group. */
2010 err
= re_node_set_insert (&dests_node
[j
], cur_nodes
->elems
[i
]);
2011 if (BE (err
< 0, 0))
2014 /* If all characters are consumed, go to next node. */
2018 /* Some characters remain, create a new group. */
2021 bitset_copy (dests_ch
[ndests
], accepts
);
2022 err
= re_node_set_init_1 (dests_node
+ ndests
, cur_nodes
->elems
[i
]);
2023 if (BE (err
!= REG_NOERROR
, 0))
2026 bitset_empty (accepts
);
2032 #ifdef RE_ENABLE_I18N
2033 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2034 Return the number of the bytes the node accepts.
2035 STR_IDX is the current index of the input string.
2037 This function handles the nodes which can accept one character, or
2038 one collating element like '.', '[a-z]', opposite to the other nodes
2039 can only accept one byte. */
2042 check_node_accept_bytes (preg
, node_idx
, input
, str_idx
)
2043 const regex_t
*preg
;
2044 int node_idx
, str_idx
;
2045 const re_string_t
*input
;
2047 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2048 const re_token_t
*node
= dfa
->nodes
+ node_idx
;
2049 int elem_len
= re_string_elem_size_at (input
, str_idx
);
2050 int char_len
= re_string_char_size_at (input
, str_idx
);
2054 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2056 if (elem_len
<= 1 && char_len
<= 1)
2058 if (node
->type
== OP_PERIOD
)
2060 /* '.' accepts any one character except the following two cases. */
2061 if ((!(preg
->syntax
& RE_DOT_NEWLINE
) &&
2062 re_string_byte_at (input
, str_idx
) == '\n') ||
2063 ((preg
->syntax
& RE_DOT_NOT_NULL
) &&
2064 re_string_byte_at (input
, str_idx
) == '\0'))
2068 else if (node
->type
== COMPLEX_BRACKET
)
2070 const re_charset_t
*cset
= node
->opr
.mbcset
;
2072 const char *pin
= re_string_get_buffer (input
) + str_idx
;
2075 wchar_t wc
= ((cset
->nranges
|| cset
->nchar_classes
|| cset
->nmbchars
)
2076 ? re_string_wchar_at (input
, str_idx
) : 0);
2078 /* match with multibyte character? */
2079 for (i
= 0; i
< cset
->nmbchars
; ++i
)
2080 if (wc
== cset
->mbchars
[i
])
2082 match_len
= char_len
;
2083 goto check_node_accept_bytes_match
;
2085 /* match with character_class? */
2086 for (i
= 0; i
< cset
->nchar_classes
; ++i
)
2088 wctype_t wt
= cset
->char_classes
[i
];
2089 if (__iswctype (wc
, wt
))
2091 match_len
= char_len
;
2092 goto check_node_accept_bytes_match
;
2099 unsigned int in_collseq
= 0;
2100 const int32_t *table
, *indirect
;
2101 const char *weights
, *extra
, *collseqwc
;
2103 /* This #include defines a local function! */
2104 # include <locale/weight.h>
2106 /* match with collating_symbol? */
2107 if (cset
->ncoll_syms
)
2108 extra
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
2109 for (i
= 0; i
< cset
->ncoll_syms
; ++i
)
2111 const char *coll_sym
= extra
+ cset
->coll_syms
[i
];
2112 /* Compare the length of input collating element and
2113 the length of current collating element. */
2114 if (*coll_sym
!= elem_len
)
2116 /* Compare each bytes. */
2117 for (j
= 0; j
< *coll_sym
; j
++)
2118 if (pin
[j
] != coll_sym
[1 + j
])
2122 /* Match if every bytes is equal. */
2124 goto check_node_accept_bytes_match
;
2130 if (elem_len
<= char_len
)
2132 collseqwc
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQWC
);
2133 in_collseq
= collseq_table_lookup (collseqwc
, wc
);
2136 in_collseq
= find_collation_sequence_value (pin
, elem_len
);
2138 /* match with range expression? */
2139 for (i
= 0; i
< cset
->nranges
; ++i
)
2140 if (cset
->range_starts
[i
] <= in_collseq
2141 && in_collseq
<= cset
->range_ends
[i
])
2143 match_len
= elem_len
;
2144 goto check_node_accept_bytes_match
;
2147 /* match with equivalence_class? */
2148 if (cset
->nequiv_classes
)
2150 const unsigned char *cp
= (const unsigned char *) pin
;
2151 table
= (const int32_t *)
2152 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_TABLEMB
);
2153 weights
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_WEIGHTMB
);
2154 extra
= _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_EXTRAMB
);
2155 indirect
= (const int32_t *)
2156 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_INDIRECTMB
);
2157 idx
= findidx (&cp
);
2159 for (i
= 0; i
< cset
->nequiv_classes
; ++i
)
2161 int32_t equiv_class_idx
= cset
->equiv_classes
[i
];
2162 size_t weight_len
= weights
[idx
];
2163 if (weight_len
== weights
[equiv_class_idx
])
2166 while (cnt
<= weight_len
2167 && (weights
[equiv_class_idx
+ 1 + cnt
]
2168 == weights
[idx
+ 1 + cnt
]))
2170 if (cnt
> weight_len
)
2172 match_len
= elem_len
;
2173 goto check_node_accept_bytes_match
;
2182 /* match with range expression? */
2184 wchar_t cmp_buf
[] = {L
'\0', L
'\0', wc
, L
'\0', L
'\0', L
'\0'};
2186 wchar_t cmp_buf
[] = {L
'\0', L
'\0', L
'\0', L
'\0', L
'\0', L
'\0'};
2189 for (i
= 0; i
< cset
->nranges
; ++i
)
2191 cmp_buf
[0] = cset
->range_starts
[i
];
2192 cmp_buf
[4] = cset
->range_ends
[i
];
2193 if (wcscoll (cmp_buf
, cmp_buf
+ 2) <= 0
2194 && wcscoll (cmp_buf
+ 2, cmp_buf
+ 4) <= 0)
2196 match_len
= char_len
;
2197 goto check_node_accept_bytes_match
;
2201 check_node_accept_bytes_match
:
2202 if (!cset
->non_match
)
2209 return (elem_len
> char_len
) ? elem_len
: char_len
;
2217 find_collation_sequence_value (mbs
, mbs_len
)
2221 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
2226 /* No valid character. Match it as a single byte character. */
2227 const unsigned char *collseq
= (const unsigned char *)
2228 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_COLLSEQMB
);
2229 return collseq
[*(unsigned char *) mbs
];
2236 const unsigned char *extra
= (const unsigned char *)
2237 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_SYMB_EXTRAMB
);
2241 int mbs_cnt
, found
= 0;
2242 int32_t elem_mbs_len
;
2243 /* Skip the name of collating element name. */
2244 idx
= idx
+ extra
[idx
] + 1;
2245 elem_mbs_len
= extra
[idx
++];
2246 if (mbs_len
== elem_mbs_len
)
2248 for (mbs_cnt
= 0; mbs_cnt
< elem_mbs_len
; ++mbs_cnt
)
2249 if (extra
[idx
+ mbs_cnt
] != mbs
[mbs_cnt
])
2251 if (mbs_cnt
== elem_mbs_len
)
2252 /* Found the entry. */
2255 /* Skip the byte sequence of the collating element. */
2256 idx
+= elem_mbs_len
;
2257 /* Adjust for the alignment. */
2258 idx
= (idx
+ 3) & ~3;
2259 /* Skip the collation sequence value. */
2260 idx
+= sizeof (uint32_t);
2261 /* Skip the wide char sequence of the collating element. */
2262 idx
= idx
+ sizeof (uint32_t) * (extra
[idx
] + 1);
2263 /* If we found the entry, return the sequence value. */
2265 return *(uint32_t *) (extra
+ idx
);
2266 /* Skip the collation sequence value. */
2267 idx
+= sizeof (uint32_t);
2272 #endif /* RE_ENABLE_I18N */
2274 /* Check whether the node accepts the byte which is IDX-th
2275 byte of the INPUT. */
2278 check_node_accept (preg
, node
, mctx
, idx
)
2279 const regex_t
*preg
;
2280 const re_token_t
*node
;
2281 const re_match_context_t
*mctx
;
2284 const re_dfa_t
*dfa
= (re_dfa_t
*) preg
->buffer
;
2285 const re_token_t
*cur_node
;
2287 if (node
->type
== OP_CONTEXT_NODE
)
2289 /* The node has constraints. Check whether the current context
2290 satisfies the constraints. */
2291 unsigned int context
= re_string_context_at (mctx
->input
, idx
,
2293 preg
->newline_anchor
);
2294 if (NOT_SATISFY_NEXT_CONSTRAINT (node
->constraint
, context
))
2296 cur_node
= dfa
->nodes
+ node
->opr
.ctx_info
->entity
;
2301 ch
= re_string_byte_at (mctx
->input
, idx
);
2302 if (cur_node
->type
== CHARACTER
)
2303 return cur_node
->opr
.c
== ch
;
2304 else if (cur_node
->type
== SIMPLE_BRACKET
)
2305 return bitset_contain (cur_node
->opr
.sbcset
, ch
);
2306 else if (cur_node
->type
== OP_PERIOD
)
2307 return !((ch
== '\n' && !(preg
->syntax
& RE_DOT_NEWLINE
))
2308 || (ch
== '\0' && (preg
->syntax
& RE_DOT_NOT_NULL
)));
2313 /* Extend the buffers, if the buffers have run out. */
2315 static reg_errcode_t
2316 extend_buffers (mctx
)
2317 re_match_context_t
*mctx
;
2320 re_string_t
*pstr
= mctx
->input
;
2322 /* Double the lengthes of the buffers. */
2323 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
2324 if (BE (ret
!= REG_NOERROR
, 0))
2327 if (mctx
->state_log
!= NULL
)
2329 /* And double the length of state_log. */
2330 mctx
->state_log
= re_realloc (mctx
->state_log
, re_dfastate_t
*,
2331 pstr
->bufs_len
* 2);
2332 if (BE (mctx
->state_log
== NULL
, 0))
2336 /* Then reconstruct the buffers. */
2339 #ifdef RE_ENABLE_I18N
2341 build_wcs_upper_buffer (pstr
);
2343 #endif /* RE_ENABLE_I18N */
2344 build_upper_buffer (pstr
);
2348 #ifdef RE_ENABLE_I18N
2350 build_wcs_buffer (pstr
);
2352 #endif /* RE_ENABLE_I18N */
2354 if (pstr
->trans
!= NULL
)
2355 re_string_translate_buffer (pstr
);
2357 pstr
->valid_len
= pstr
->bufs_len
;
2364 /* Functions for matching context. */
2366 static reg_errcode_t
2367 match_ctx_init (mctx
, eflags
, input
, n
)
2368 re_match_context_t
*mctx
;
2372 mctx
->eflags
= eflags
;
2373 mctx
->input
= input
;
2374 mctx
->match_last
= -1;
2377 mctx
->bkref_ents
= re_malloc (struct re_backref_cache_entry
, n
);
2378 if (BE (mctx
->bkref_ents
== NULL
, 0))
2382 mctx
->bkref_ents
= NULL
;
2383 mctx
->nbkref_ents
= 0;
2384 mctx
->abkref_ents
= n
;
2385 mctx
->max_bkref_len
= 0;
2390 match_ctx_free (mctx
)
2391 re_match_context_t
*mctx
;
2393 re_free (mctx
->bkref_ents
);
2396 /* Add a new backreference entry to the cache. */
2398 static reg_errcode_t
2399 match_ctx_add_entry (mctx
, node
, from
, to
)
2400 re_match_context_t
*mctx
;
2403 if (mctx
->nbkref_ents
>= mctx
->abkref_ents
)
2405 mctx
->bkref_ents
= re_realloc (mctx
->bkref_ents
,
2406 struct re_backref_cache_entry
,
2407 mctx
->abkref_ents
* 2);
2408 if (BE (mctx
->bkref_ents
== NULL
, 0))
2410 memset (mctx
->bkref_ents
+ mctx
->nbkref_ents
, '\0',
2411 sizeof (struct re_backref_cache_entry
) * mctx
->abkref_ents
);
2412 mctx
->abkref_ents
*= 2;
2414 mctx
->bkref_ents
[mctx
->nbkref_ents
].node
= node
;
2415 mctx
->bkref_ents
[mctx
->nbkref_ents
].from
= from
;
2416 mctx
->bkref_ents
[mctx
->nbkref_ents
++].to
= to
;
2417 if (mctx
->max_bkref_len
< to
- from
)
2418 mctx
->max_bkref_len
= to
- from
;