1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str
, int len
,
23 RE_TRANSLATE_TYPE trans
, int icase
,
24 const re_dfa_t
*dfa
) internal_function
;
25 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
26 const re_node_set
*nodes
,
27 unsigned int hash
) internal_function
;
28 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
29 const re_node_set
*nodes
,
31 unsigned int hash
) internal_function
;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
41 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len
< dfa
->mb_cur_max
)
48 init_len
= dfa
->mb_cur_max
;
49 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
50 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
52 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
53 if (BE (ret
!= REG_NOERROR
, 0))
56 pstr
->word_char
= dfa
->word_char
;
57 pstr
->word_ops_used
= dfa
->word_ops_used
;
58 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
59 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
60 pstr
->valid_raw_len
= pstr
->valid_len
;
64 /* This function allocate the buffers, and initialize them. */
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
69 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
72 memset (pstr
, '\0', sizeof (re_string_t
));
73 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
77 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
78 if (BE (ret
!= REG_NOERROR
, 0))
81 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
86 if (dfa
->mb_cur_max
> 1)
90 ret
= build_wcs_upper_buffer (pstr
);
91 if (BE (ret
!= REG_NOERROR
, 0))
93 if (pstr
->valid_raw_len
>= len
)
95 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
97 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
98 if (BE (ret
!= REG_NOERROR
, 0))
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr
);
108 #ifdef RE_ENABLE_I18N
109 if (dfa
->mb_cur_max
> 1)
110 build_wcs_buffer (pstr
);
112 #endif /* RE_ENABLE_I18N */
115 re_string_translate_buffer (pstr
);
118 pstr
->valid_len
= pstr
->bufs_len
;
119 pstr
->valid_raw_len
= pstr
->bufs_len
;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
133 #ifdef RE_ENABLE_I18N
134 if (pstr
->mb_cur_max
> 1)
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
143 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
144 if (BE (new_wcs
== NULL
, 0))
147 if (pstr
->offsets
!= NULL
)
149 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
150 if (BE (new_offsets
== NULL
, 0))
152 pstr
->offsets
= new_offsets
;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr
->mbs_allocated
)
158 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
160 if (BE (new_mbs
== NULL
, 0))
164 pstr
->bufs_len
= new_buf_len
;
171 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
172 RE_TRANSLATE_TYPE trans
, int icase
,
175 pstr
->raw_mbs
= (const unsigned char *) str
;
179 pstr
->icase
= icase
? 1 : 0;
180 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
181 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
182 pstr
->is_utf8
= dfa
->is_utf8
;
183 pstr
->map_notascii
= dfa
->map_notascii
;
184 pstr
->stop
= pstr
->len
;
185 pstr
->raw_stop
= pstr
->stop
;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
203 build_wcs_buffer (re_string_t
*pstr
)
206 unsigned char buf
[MB_LEN_MAX
];
207 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
209 unsigned char buf
[64];
212 int byte_idx
, end_idx
, remain_len
;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
217 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
218 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
223 remain_len
= end_idx
- byte_idx
;
224 prev_st
= pstr
->cur_state
;
225 /* Apply the translation if we need. */
226 if (BE (pstr
->trans
!= NULL
, 0))
230 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
232 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
233 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
235 p
= (const char *) buf
;
238 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
239 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
240 if (BE (mbclen
== (size_t) -2, 0))
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr
->cur_state
= prev_st
;
246 else if (BE (mbclen
== (size_t) -1 || mbclen
== 0, 0))
248 /* We treat these cases as a singlebyte character. */
250 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
251 if (BE (pstr
->trans
!= NULL
, 0))
252 wc
= pstr
->trans
[wc
];
253 pstr
->cur_state
= prev_st
;
256 /* Write wide character and padding. */
257 pstr
->wcs
[byte_idx
++] = wc
;
258 /* Write paddings. */
259 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
260 pstr
->wcs
[byte_idx
++] = WEOF
;
262 pstr
->valid_len
= byte_idx
;
263 pstr
->valid_raw_len
= byte_idx
;
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
270 internal_function __attribute_warn_unused_result__
271 build_wcs_upper_buffer (re_string_t
*pstr
)
274 int src_idx
, byte_idx
, end_idx
, remain_len
;
277 char buf
[MB_LEN_MAX
];
278 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
283 byte_idx
= pstr
->valid_len
;
284 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
290 while (byte_idx
< end_idx
)
294 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
295 && mbsinit (&pstr
->cur_state
))
297 /* In case of a singlebyte character. */
299 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
307 remain_len
= end_idx
- byte_idx
;
308 prev_st
= pstr
->cur_state
;
309 mbclen
= __mbrtowc (&wc
,
310 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
311 + byte_idx
), remain_len
, &pstr
->cur_state
);
312 if (BE (mbclen
+ 2 > 2, 1))
320 mbcdlen
= wcrtomb (buf
, wcu
, &prev_st
);
321 if (BE (mbclen
== mbcdlen
, 1))
322 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
330 memcpy (pstr
->mbs
+ byte_idx
,
331 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
332 pstr
->wcs
[byte_idx
++] = wcu
;
333 /* Write paddings. */
334 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
335 pstr
->wcs
[byte_idx
++] = WEOF
;
337 else if (mbclen
== (size_t) -1 || mbclen
== 0)
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
341 pstr
->mbs
[byte_idx
] = ch
;
342 /* And also cast it to wide char. */
343 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
344 if (BE (mbclen
== (size_t) -1, 0))
345 pstr
->cur_state
= prev_st
;
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr
->cur_state
= prev_st
;
354 pstr
->valid_len
= byte_idx
;
355 pstr
->valid_raw_len
= byte_idx
;
359 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
364 remain_len
= end_idx
- byte_idx
;
365 prev_st
= pstr
->cur_state
;
366 if (BE (pstr
->trans
!= NULL
, 0))
370 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
372 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
373 buf
[i
] = pstr
->trans
[ch
];
375 p
= (const char *) buf
;
378 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
379 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
380 if (BE (mbclen
+ 2 > 2, 1))
388 mbcdlen
= wcrtomb ((char *) buf
, wcu
, &prev_st
);
389 if (BE (mbclen
== mbcdlen
, 1))
390 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
391 else if (mbcdlen
!= (size_t) -1)
395 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
397 pstr
->cur_state
= prev_st
;
401 if (pstr
->offsets
== NULL
)
403 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
405 if (pstr
->offsets
== NULL
)
408 if (!pstr
->offsets_needed
)
410 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
411 pstr
->offsets
[i
] = i
;
412 pstr
->offsets_needed
= 1;
415 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
416 pstr
->wcs
[byte_idx
] = wcu
;
417 pstr
->offsets
[byte_idx
] = src_idx
;
418 for (i
= 1; i
< mbcdlen
; ++i
)
420 pstr
->offsets
[byte_idx
+ i
]
421 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
422 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
424 pstr
->len
+= mbcdlen
- mbclen
;
425 if (pstr
->raw_stop
> src_idx
)
426 pstr
->stop
+= mbcdlen
- mbclen
;
427 end_idx
= (pstr
->bufs_len
> pstr
->len
)
428 ? pstr
->len
: pstr
->bufs_len
;
434 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
437 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
439 if (BE (pstr
->offsets_needed
!= 0, 0))
442 for (i
= 0; i
< mbclen
; ++i
)
443 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
447 pstr
->wcs
[byte_idx
++] = wcu
;
448 /* Write paddings. */
449 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
450 pstr
->wcs
[byte_idx
++] = WEOF
;
452 else if (mbclen
== (size_t) -1 || mbclen
== 0)
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
457 if (BE (pstr
->trans
!= NULL
, 0))
458 ch
= pstr
->trans
[ch
];
459 pstr
->mbs
[byte_idx
] = ch
;
461 if (BE (pstr
->offsets_needed
!= 0, 0))
462 pstr
->offsets
[byte_idx
] = src_idx
;
465 /* And also cast it to wide char. */
466 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
467 if (BE (mbclen
== (size_t) -1, 0))
468 pstr
->cur_state
= prev_st
;
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr
->cur_state
= prev_st
;
477 pstr
->valid_len
= byte_idx
;
478 pstr
->valid_raw_len
= src_idx
;
482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
496 rawbuf_idx
< new_raw_idx
;)
499 int remain_len
= pstr
->len
- rawbuf_idx
;
500 prev_st
= pstr
->cur_state
;
501 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
502 remain_len
, &pstr
->cur_state
);
503 if (BE ((ssize_t
) mbclen
<= 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen
== 0 || remain_len
== 0)
509 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
511 pstr
->cur_state
= prev_st
;
515 /* Then proceed the next character. */
516 rawbuf_idx
+= mbclen
;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
528 build_upper_buffer (re_string_t
*pstr
)
530 int char_idx
, end_idx
;
531 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
533 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
535 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
536 if (BE (pstr
->trans
!= NULL
, 0))
537 ch
= pstr
->trans
[ch
];
539 pstr
->mbs
[char_idx
] = toupper (ch
);
541 pstr
->mbs
[char_idx
] = ch
;
543 pstr
->valid_len
= char_idx
;
544 pstr
->valid_raw_len
= char_idx
;
547 /* Apply TRANS to the buffer in PSTR. */
551 re_string_translate_buffer (re_string_t
*pstr
)
553 int buf_idx
, end_idx
;
554 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
556 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
558 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
559 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
562 pstr
->valid_len
= buf_idx
;
563 pstr
->valid_raw_len
= buf_idx
;
566 /* This function re-construct the buffers.
567 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
568 convert to upper case in case of REG_ICASE, apply translation. */
571 internal_function __attribute_warn_unused_result__
572 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
574 int offset
= idx
- pstr
->raw_mbs_idx
;
575 if (BE (offset
< 0, 0))
578 #ifdef RE_ENABLE_I18N
579 if (pstr
->mb_cur_max
> 1)
580 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
581 #endif /* RE_ENABLE_I18N */
582 pstr
->len
= pstr
->raw_len
;
583 pstr
->stop
= pstr
->raw_stop
;
585 pstr
->raw_mbs_idx
= 0;
586 pstr
->valid_raw_len
= 0;
587 pstr
->offsets_needed
= 0;
588 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
589 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
590 if (!pstr
->mbs_allocated
)
591 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
595 if (BE (offset
!= 0, 1))
597 /* Should the already checked characters be kept? */
598 if (BE (offset
< pstr
->valid_raw_len
, 1))
600 /* Yes, move them to the front of the buffer. */
601 #ifdef RE_ENABLE_I18N
602 if (BE (pstr
->offsets_needed
, 0))
604 int low
= 0, high
= pstr
->valid_len
, mid
;
607 mid
= (high
+ low
) / 2;
608 if (pstr
->offsets
[mid
] > offset
)
610 else if (pstr
->offsets
[mid
] < offset
)
616 if (pstr
->offsets
[mid
] < offset
)
618 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
620 /* This can be quite complicated, so handle specially
621 only the common and easy case where the character with
622 different length representation of lower and upper
623 case is present at or after offset. */
624 if (pstr
->valid_len
> offset
625 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
627 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
628 (pstr
->valid_len
- offset
) * sizeof (wint_t));
629 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
630 pstr
->valid_len
-= offset
;
631 pstr
->valid_raw_len
-= offset
;
632 for (low
= 0; low
< pstr
->valid_len
; low
++)
633 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
637 /* Otherwise, just find out how long the partial multibyte
638 character at offset is and fill it with WEOF/255. */
639 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
640 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
641 pstr
->offsets_needed
= 0;
642 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
644 while (mid
< pstr
->valid_len
)
645 if (pstr
->wcs
[mid
] != WEOF
)
649 if (mid
== pstr
->valid_len
)
653 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
656 for (low
= 0; low
< pstr
->valid_len
; ++low
)
657 pstr
->wcs
[low
] = WEOF
;
658 memset (pstr
->mbs
, 255, pstr
->valid_len
);
661 pstr
->valid_raw_len
= pstr
->valid_len
;
667 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
669 #ifdef RE_ENABLE_I18N
670 if (pstr
->mb_cur_max
> 1)
671 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
672 (pstr
->valid_len
- offset
) * sizeof (wint_t));
673 #endif /* RE_ENABLE_I18N */
674 if (BE (pstr
->mbs_allocated
, 0))
675 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
676 pstr
->valid_len
- offset
);
677 pstr
->valid_len
-= offset
;
678 pstr
->valid_raw_len
-= offset
;
680 assert (pstr
->valid_len
> 0);
686 /* No, skip all characters until IDX. */
687 int prev_valid_len
= pstr
->valid_len
;
689 #ifdef RE_ENABLE_I18N
690 if (BE (pstr
->offsets_needed
, 0))
692 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
693 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
694 pstr
->offsets_needed
= 0;
698 #ifdef RE_ENABLE_I18N
699 if (pstr
->mb_cur_max
> 1)
706 const unsigned char *raw
, *p
, *end
;
708 /* Special case UTF-8. Multi-byte chars start with any
709 byte other than 0x80 - 0xbf. */
710 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
711 end
= raw
+ (offset
- pstr
->mb_cur_max
);
712 if (end
< pstr
->raw_mbs
)
714 p
= raw
+ offset
- 1;
716 /* We know the wchar_t encoding is UCS4, so for the simple
717 case, ASCII characters, skip the conversion step. */
718 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
720 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
721 /* pstr->valid_len = 0; */
726 for (; p
>= end
; --p
)
727 if ((*p
& 0xc0) != 0x80)
731 int mlen
= raw
+ pstr
->len
- p
;
732 unsigned char buf
[6];
735 if (BE (pstr
->trans
!= NULL
, 0))
737 int i
= mlen
< 6 ? mlen
: 6;
739 buf
[i
] = pstr
->trans
[p
[i
]];
741 /* XXX Don't use mbrtowc, we know which conversion
742 to use (UTF-8 -> UCS4). */
743 memset (&cur_state
, 0, sizeof (cur_state
));
744 mbclen
= __mbrtowc (&wc2
, (const char *) p
, mlen
,
746 if (raw
+ offset
- p
<= mbclen
747 && mbclen
< (size_t) -2)
749 memset (&pstr
->cur_state
, '\0',
751 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
759 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
762 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
764 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
765 && IS_WIDE_WORD_CHAR (wc
))
767 : ((IS_WIDE_NEWLINE (wc
)
768 && pstr
->newline_anchor
)
769 ? CONTEXT_NEWLINE
: 0));
770 if (BE (pstr
->valid_len
, 0))
772 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
773 pstr
->wcs
[wcs_idx
] = WEOF
;
774 if (pstr
->mbs_allocated
)
775 memset (pstr
->mbs
, 255, pstr
->valid_len
);
777 pstr
->valid_raw_len
= pstr
->valid_len
;
780 #endif /* RE_ENABLE_I18N */
782 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
783 pstr
->valid_raw_len
= 0;
786 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
788 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
789 ? CONTEXT_NEWLINE
: 0));
792 if (!BE (pstr
->mbs_allocated
, 0))
795 pstr
->raw_mbs_idx
= idx
;
797 pstr
->stop
-= offset
;
799 /* Then build the buffers. */
800 #ifdef RE_ENABLE_I18N
801 if (pstr
->mb_cur_max
> 1)
805 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
806 if (BE (ret
!= REG_NOERROR
, 0))
810 build_wcs_buffer (pstr
);
813 #endif /* RE_ENABLE_I18N */
814 if (BE (pstr
->mbs_allocated
, 0))
817 build_upper_buffer (pstr
);
818 else if (pstr
->trans
!= NULL
)
819 re_string_translate_buffer (pstr
);
822 pstr
->valid_len
= pstr
->len
;
829 internal_function
__attribute ((pure
))
830 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
834 /* Handle the common (easiest) cases first. */
835 if (BE (!pstr
->mbs_allocated
, 1))
836 return re_string_peek_byte (pstr
, idx
);
838 #ifdef RE_ENABLE_I18N
839 if (pstr
->mb_cur_max
> 1
840 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
841 return re_string_peek_byte (pstr
, idx
);
844 off
= pstr
->cur_idx
+ idx
;
845 #ifdef RE_ENABLE_I18N
846 if (pstr
->offsets_needed
)
847 off
= pstr
->offsets
[off
];
850 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
852 #ifdef RE_ENABLE_I18N
853 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
854 this function returns CAPITAL LETTER I instead of first byte of
855 DOTLESS SMALL LETTER I. The latter would confuse the parser,
856 since peek_byte_case doesn't advance cur_idx in any way. */
857 if (pstr
->offsets_needed
&& !isascii (ch
))
858 return re_string_peek_byte (pstr
, idx
);
865 internal_function
__attribute ((pure
))
866 re_string_fetch_byte_case (re_string_t
*pstr
)
868 if (BE (!pstr
->mbs_allocated
, 1))
869 return re_string_fetch_byte (pstr
);
871 #ifdef RE_ENABLE_I18N
872 if (pstr
->offsets_needed
)
876 /* For tr_TR.UTF-8 [[:islower:]] there is
877 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
878 in that case the whole multi-byte character and return
879 the original letter. On the other side, with
880 [[: DOTLESS SMALL LETTER I return [[:I, as doing
881 anything else would complicate things too much. */
883 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
884 return re_string_fetch_byte (pstr
);
886 off
= pstr
->offsets
[pstr
->cur_idx
];
887 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
890 return re_string_fetch_byte (pstr
);
892 re_string_skip_bytes (pstr
,
893 re_string_char_size_at (pstr
, pstr
->cur_idx
));
898 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
903 re_string_destruct (re_string_t
*pstr
)
905 #ifdef RE_ENABLE_I18N
907 re_free (pstr
->offsets
);
908 #endif /* RE_ENABLE_I18N */
909 if (pstr
->mbs_allocated
)
913 /* Return the context at IDX in INPUT. */
917 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
921 /* In this case, we use the value stored in input->tip_context,
922 since we can't know the character in input->mbs[-1] here. */
923 return input
->tip_context
;
924 if (BE (idx
== input
->len
, 0))
925 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
926 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
927 #ifdef RE_ENABLE_I18N
928 if (input
->mb_cur_max
> 1)
932 while(input
->wcs
[wc_idx
] == WEOF
)
935 /* It must not happen. */
936 assert (wc_idx
>= 0);
940 return input
->tip_context
;
942 wc
= input
->wcs
[wc_idx
];
943 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
945 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
946 ? CONTEXT_NEWLINE
: 0);
951 c
= re_string_byte_at (input
, idx
);
952 if (bitset_contain (input
->word_char
, c
))
954 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
958 /* Functions for set operation. */
961 internal_function __attribute_warn_unused_result__
962 re_node_set_alloc (re_node_set
*set
, int size
)
966 set
->elems
= re_malloc (int, size
);
967 if (BE (set
->elems
== NULL
, 0))
973 internal_function __attribute_warn_unused_result__
974 re_node_set_init_1 (re_node_set
*set
, int elem
)
978 set
->elems
= re_malloc (int, 1);
979 if (BE (set
->elems
== NULL
, 0))
981 set
->alloc
= set
->nelem
= 0;
984 set
->elems
[0] = elem
;
989 internal_function __attribute_warn_unused_result__
990 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
993 set
->elems
= re_malloc (int, 2);
994 if (BE (set
->elems
== NULL
, 0))
999 set
->elems
[0] = elem1
;
1006 set
->elems
[0] = elem1
;
1007 set
->elems
[1] = elem2
;
1011 set
->elems
[0] = elem2
;
1012 set
->elems
[1] = elem1
;
1018 static reg_errcode_t
1019 internal_function __attribute_warn_unused_result__
1020 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1022 dest
->nelem
= src
->nelem
;
1025 dest
->alloc
= dest
->nelem
;
1026 dest
->elems
= re_malloc (int, dest
->alloc
);
1027 if (BE (dest
->elems
== NULL
, 0))
1029 dest
->alloc
= dest
->nelem
= 0;
1032 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1035 re_node_set_init_empty (dest
);
1039 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1040 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1041 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1043 static reg_errcode_t
1044 internal_function __attribute_warn_unused_result__
1045 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1046 const re_node_set
*src2
)
1048 int i1
, i2
, is
, id
, delta
, sbase
;
1049 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1052 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1053 conservative estimate. */
1054 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1056 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1057 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1058 if (BE (new_elems
== NULL
, 0))
1060 dest
->elems
= new_elems
;
1061 dest
->alloc
= new_alloc
;
1064 /* Find the items in the intersection of SRC1 and SRC2, and copy
1065 into the top of DEST those that are not already in DEST itself. */
1066 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1067 i1
= src1
->nelem
- 1;
1068 i2
= src2
->nelem
- 1;
1069 id
= dest
->nelem
- 1;
1072 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1074 /* Try to find the item in DEST. Maybe we could binary search? */
1075 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1078 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1079 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1081 if (--i1
< 0 || --i2
< 0)
1085 /* Lower the highest of the two items. */
1086 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1098 id
= dest
->nelem
- 1;
1099 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1100 delta
= is
- sbase
+ 1;
1102 /* Now copy. When DELTA becomes zero, the remaining
1103 DEST elements are already in place; this is more or
1104 less the same loop that is in re_node_set_merge. */
1105 dest
->nelem
+= delta
;
1106 if (delta
> 0 && id
>= 0)
1109 if (dest
->elems
[is
] > dest
->elems
[id
])
1111 /* Copy from the top. */
1112 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1118 /* Slide from the bottom. */
1119 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1125 /* Copy remaining SRC elements. */
1126 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1131 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1132 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1134 static reg_errcode_t
1135 internal_function __attribute_warn_unused_result__
1136 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1137 const re_node_set
*src2
)
1140 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1142 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1143 dest
->elems
= re_malloc (int, dest
->alloc
);
1144 if (BE (dest
->elems
== NULL
, 0))
1149 if (src1
!= NULL
&& src1
->nelem
> 0)
1150 return re_node_set_init_copy (dest
, src1
);
1151 else if (src2
!= NULL
&& src2
->nelem
> 0)
1152 return re_node_set_init_copy (dest
, src2
);
1154 re_node_set_init_empty (dest
);
1157 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1159 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1161 dest
->elems
[id
++] = src2
->elems
[i2
++];
1164 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1166 dest
->elems
[id
++] = src1
->elems
[i1
++];
1168 if (i1
< src1
->nelem
)
1170 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1171 (src1
->nelem
- i1
) * sizeof (int));
1172 id
+= src1
->nelem
- i1
;
1174 else if (i2
< src2
->nelem
)
1176 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1177 (src2
->nelem
- i2
) * sizeof (int));
1178 id
+= src2
->nelem
- i2
;
1184 /* Calculate the union set of the sets DEST and SRC. And store it to
1185 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1187 static reg_errcode_t
1188 internal_function __attribute_warn_unused_result__
1189 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1191 int is
, id
, sbase
, delta
;
1192 if (src
== NULL
|| src
->nelem
== 0)
1194 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1196 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1197 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1198 if (BE (new_buffer
== NULL
, 0))
1200 dest
->elems
= new_buffer
;
1201 dest
->alloc
= new_alloc
;
1204 if (BE (dest
->nelem
== 0, 0))
1206 dest
->nelem
= src
->nelem
;
1207 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1211 /* Copy into the top of DEST the items of SRC that are not
1212 found in DEST. Maybe we could binary search in DEST? */
1213 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1214 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1216 if (dest
->elems
[id
] == src
->elems
[is
])
1218 else if (dest
->elems
[id
] < src
->elems
[is
])
1219 dest
->elems
[--sbase
] = src
->elems
[is
--];
1220 else /* if (dest->elems[id] > src->elems[is]) */
1226 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1228 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1231 id
= dest
->nelem
- 1;
1232 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1233 delta
= is
- sbase
+ 1;
1237 /* Now copy. When DELTA becomes zero, the remaining
1238 DEST elements are already in place. */
1239 dest
->nelem
+= delta
;
1242 if (dest
->elems
[is
] > dest
->elems
[id
])
1244 /* Copy from the top. */
1245 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1251 /* Slide from the bottom. */
1252 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1255 /* Copy remaining SRC elements. */
1256 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1257 delta
* sizeof (int));
1266 /* Insert the new element ELEM to the re_node_set* SET.
1267 SET should not already have ELEM.
1268 return -1 if an error is occured, return 1 otherwise. */
1271 internal_function __attribute_warn_unused_result__
1272 re_node_set_insert (re_node_set
*set
, int elem
)
1275 /* In case the set is empty. */
1276 if (set
->alloc
== 0)
1278 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1284 if (BE (set
->nelem
, 0) == 0)
1286 /* We already guaranteed above that set->alloc != 0. */
1287 set
->elems
[0] = elem
;
1292 /* Realloc if we need. */
1293 if (set
->alloc
== set
->nelem
)
1296 set
->alloc
= set
->alloc
* 2;
1297 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1298 if (BE (new_elems
== NULL
, 0))
1300 set
->elems
= new_elems
;
1303 /* Move the elements which follows the new element. Test the
1304 first element separately to skip a check in the inner loop. */
1305 if (elem
< set
->elems
[0])
1308 for (idx
= set
->nelem
; idx
> 0; idx
--)
1309 set
->elems
[idx
] = set
->elems
[idx
- 1];
1313 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1314 set
->elems
[idx
] = set
->elems
[idx
- 1];
1317 /* Insert the new element. */
1318 set
->elems
[idx
] = elem
;
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324 SET should not already have any element greater than or equal to ELEM.
1325 Return -1 if an error is occured, return 1 otherwise. */
1328 internal_function __attribute_warn_unused_result__
1329 re_node_set_insert_last (re_node_set
*set
, int elem
)
1331 /* Realloc if we need. */
1332 if (set
->alloc
== set
->nelem
)
1335 set
->alloc
= (set
->alloc
+ 1) * 2;
1336 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1337 if (BE (new_elems
== NULL
, 0))
1339 set
->elems
= new_elems
;
1342 /* Insert the new element. */
1343 set
->elems
[set
->nelem
++] = elem
;
1347 /* Compare two node sets SET1 and SET2.
1348 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1351 internal_function
__attribute ((pure
))
1352 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1355 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1357 for (i
= set1
->nelem
; --i
>= 0 ; )
1358 if (set1
->elems
[i
] != set2
->elems
[i
])
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1366 internal_function
__attribute ((pure
))
1367 re_node_set_contains (const re_node_set
*set
, int elem
)
1369 unsigned int idx
, right
, mid
;
1370 if (set
->nelem
<= 0)
1373 /* Binary search the element. */
1375 right
= set
->nelem
- 1;
1378 mid
= (idx
+ right
) / 2;
1379 if (set
->elems
[mid
] < elem
)
1384 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1389 re_node_set_remove_at (re_node_set
*set
, int idx
)
1391 if (idx
< 0 || idx
>= set
->nelem
)
1394 for (; idx
< set
->nelem
; idx
++)
1395 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400 Or return -1, if an error will be occured. */
1404 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1406 int type
= token
.type
;
1407 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1409 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1410 int *new_nexts
, *new_indices
;
1411 re_node_set
*new_edests
, *new_eclosures
;
1412 re_token_t
*new_nodes
;
1414 /* Avoid overflows in realloc. */
1415 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1416 MAX (sizeof (re_node_set
),
1418 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1421 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1422 if (BE (new_nodes
== NULL
, 0))
1424 dfa
->nodes
= new_nodes
;
1425 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1426 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1427 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1428 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1429 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1430 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1432 dfa
->nexts
= new_nexts
;
1433 dfa
->org_indices
= new_indices
;
1434 dfa
->edests
= new_edests
;
1435 dfa
->eclosures
= new_eclosures
;
1436 dfa
->nodes_alloc
= new_nodes_alloc
;
1438 dfa
->nodes
[dfa
->nodes_len
] = token
;
1439 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1440 #ifdef RE_ENABLE_I18N
1441 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1442 (type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || type
== COMPLEX_BRACKET
;
1444 dfa
->nexts
[dfa
->nodes_len
] = -1;
1445 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1446 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1447 return dfa
->nodes_len
++;
1450 static inline unsigned int
1452 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1454 unsigned int hash
= nodes
->nelem
+ context
;
1456 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1457 hash
+= nodes
->elems
[i
];
1461 /* Search for the state whose node_set is equivalent to NODES.
1462 Return the pointer to the state, if we found it in the DFA.
1463 Otherwise create the new one and return it. In case of an error
1464 return NULL and set the error code in ERR.
1465 Note: - We assume NULL as the invalid state, then it is possible that
1466 return value is NULL and ERR is REG_NOERROR.
1467 - We never return non-NULL value in case of any errors, it is for
1470 static re_dfastate_t
*
1471 internal_function __attribute_warn_unused_result__
1472 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1473 const re_node_set
*nodes
)
1476 re_dfastate_t
*new_state
;
1477 struct re_state_table_entry
*spot
;
1479 if (BE (nodes
->nelem
== 0, 0))
1484 hash
= calc_state_hash (nodes
, 0);
1485 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1487 for (i
= 0 ; i
< spot
->num
; i
++)
1489 re_dfastate_t
*state
= spot
->array
[i
];
1490 if (hash
!= state
->hash
)
1492 if (re_node_set_compare (&state
->nodes
, nodes
))
1496 /* There are no appropriate state in the dfa, create the new one. */
1497 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1498 if (BE (new_state
== NULL
, 0))
1504 /* Search for the state whose node_set is equivalent to NODES and
1505 whose context is equivalent to CONTEXT.
1506 Return the pointer to the state, if we found it in the DFA.
1507 Otherwise create the new one and return it. In case of an error
1508 return NULL and set the error code in ERR.
1509 Note: - We assume NULL as the invalid state, then it is possible that
1510 return value is NULL and ERR is REG_NOERROR.
1511 - We never return non-NULL value in case of any errors, it is for
1514 static re_dfastate_t
*
1515 internal_function __attribute_warn_unused_result__
1516 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1517 const re_node_set
*nodes
, unsigned int context
)
1520 re_dfastate_t
*new_state
;
1521 struct re_state_table_entry
*spot
;
1523 if (nodes
->nelem
== 0)
1528 hash
= calc_state_hash (nodes
, context
);
1529 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1531 for (i
= 0 ; i
< spot
->num
; i
++)
1533 re_dfastate_t
*state
= spot
->array
[i
];
1534 if (state
->hash
== hash
1535 && state
->context
== context
1536 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1539 /* There are no appropriate state in `dfa', create the new one. */
1540 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1541 if (BE (new_state
== NULL
, 0))
1547 /* Finish initialization of the new state NEWSTATE, and using its hash value
1548 HASH put in the appropriate bucket of DFA's state table. Return value
1549 indicates the error code if failed. */
1551 static reg_errcode_t
1552 __attribute_warn_unused_result__
1553 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1556 struct re_state_table_entry
*spot
;
1560 newstate
->hash
= hash
;
1561 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1562 if (BE (err
!= REG_NOERROR
, 0))
1564 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1566 int elem
= newstate
->nodes
.elems
[i
];
1567 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1568 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1572 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1573 if (BE (spot
->alloc
<= spot
->num
, 0))
1575 int new_alloc
= 2 * spot
->num
+ 2;
1576 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1578 if (BE (new_array
== NULL
, 0))
1580 spot
->array
= new_array
;
1581 spot
->alloc
= new_alloc
;
1583 spot
->array
[spot
->num
++] = newstate
;
1588 free_state (re_dfastate_t
*state
)
1590 re_node_set_free (&state
->non_eps_nodes
);
1591 re_node_set_free (&state
->inveclosure
);
1592 if (state
->entrance_nodes
!= &state
->nodes
)
1594 re_node_set_free (state
->entrance_nodes
);
1595 re_free (state
->entrance_nodes
);
1597 re_node_set_free (&state
->nodes
);
1598 re_free (state
->word_trtable
);
1599 re_free (state
->trtable
);
1603 /* Create the new state which is independ of contexts.
1604 Return the new state if succeeded, otherwise return NULL. */
1606 static re_dfastate_t
*
1607 internal_function __attribute_warn_unused_result__
1608 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1613 re_dfastate_t
*newstate
;
1615 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1616 if (BE (newstate
== NULL
, 0))
1618 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1619 if (BE (err
!= REG_NOERROR
, 0))
1625 newstate
->entrance_nodes
= &newstate
->nodes
;
1626 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1628 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1629 re_token_type_t type
= node
->type
;
1630 if (type
== CHARACTER
&& !node
->constraint
)
1632 #ifdef RE_ENABLE_I18N
1633 newstate
->accept_mb
|= node
->accept_mb
;
1634 #endif /* RE_ENABLE_I18N */
1636 /* If the state has the halt node, the state is a halt state. */
1637 if (type
== END_OF_RE
)
1639 else if (type
== OP_BACK_REF
)
1640 newstate
->has_backref
= 1;
1641 else if (type
== ANCHOR
|| node
->constraint
)
1642 newstate
->has_constraint
= 1;
1644 err
= register_state (dfa
, newstate
, hash
);
1645 if (BE (err
!= REG_NOERROR
, 0))
1647 free_state (newstate
);
1653 /* Create the new state which is depend on the context CONTEXT.
1654 Return the new state if succeeded, otherwise return NULL. */
1656 static re_dfastate_t
*
1657 internal_function __attribute_warn_unused_result__
1658 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1659 unsigned int context
, unsigned int hash
)
1661 int i
, nctx_nodes
= 0;
1663 re_dfastate_t
*newstate
;
1665 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1666 if (BE (newstate
== NULL
, 0))
1668 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1669 if (BE (err
!= REG_NOERROR
, 0))
1675 newstate
->context
= context
;
1676 newstate
->entrance_nodes
= &newstate
->nodes
;
1678 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1680 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1681 re_token_type_t type
= node
->type
;
1682 unsigned int constraint
= node
->constraint
;
1684 if (type
== CHARACTER
&& !constraint
)
1686 #ifdef RE_ENABLE_I18N
1687 newstate
->accept_mb
|= node
->accept_mb
;
1688 #endif /* RE_ENABLE_I18N */
1690 /* If the state has the halt node, the state is a halt state. */
1691 if (type
== END_OF_RE
)
1693 else if (type
== OP_BACK_REF
)
1694 newstate
->has_backref
= 1;
1698 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1700 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1701 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1703 free_state (newstate
);
1706 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1710 newstate
->has_constraint
= 1;
1713 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1715 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1720 err
= register_state (dfa
, newstate
, hash
);
1721 if (BE (err
!= REG_NOERROR
, 0))
1723 free_state (newstate
);