1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str
, int len
,
23 RE_TRANSLATE_TYPE trans
, int icase
,
24 const re_dfa_t
*dfa
) internal_function
;
25 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
26 const re_node_set
*nodes
,
27 unsigned int hash
) internal_function
;
28 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
29 const re_node_set
*nodes
,
31 unsigned int hash
) internal_function
;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
41 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len
< dfa
->mb_cur_max
)
48 init_len
= dfa
->mb_cur_max
;
49 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
50 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
52 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
53 if (BE (ret
!= REG_NOERROR
, 0))
56 pstr
->word_char
= dfa
->word_char
;
57 pstr
->word_ops_used
= dfa
->word_ops_used
;
58 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
59 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
60 pstr
->valid_raw_len
= pstr
->valid_len
;
64 /* This function allocate the buffers, and initialize them. */
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
69 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
72 memset (pstr
, '\0', sizeof (re_string_t
));
73 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
77 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
78 if (BE (ret
!= REG_NOERROR
, 0))
81 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
86 if (dfa
->mb_cur_max
> 1)
90 ret
= build_wcs_upper_buffer (pstr
);
91 if (BE (ret
!= REG_NOERROR
, 0))
93 if (pstr
->valid_raw_len
>= len
)
95 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
97 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
98 if (BE (ret
!= REG_NOERROR
, 0))
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr
);
108 #ifdef RE_ENABLE_I18N
109 if (dfa
->mb_cur_max
> 1)
110 build_wcs_buffer (pstr
);
112 #endif /* RE_ENABLE_I18N */
115 re_string_translate_buffer (pstr
);
118 pstr
->valid_len
= pstr
->bufs_len
;
119 pstr
->valid_raw_len
= pstr
->bufs_len
;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
133 #ifdef RE_ENABLE_I18N
134 if (pstr
->mb_cur_max
> 1)
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
143 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
144 if (BE (new_wcs
== NULL
, 0))
147 if (pstr
->offsets
!= NULL
)
149 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
150 if (BE (new_offsets
== NULL
, 0))
152 pstr
->offsets
= new_offsets
;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr
->mbs_allocated
)
158 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
160 if (BE (new_mbs
== NULL
, 0))
164 pstr
->bufs_len
= new_buf_len
;
171 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
172 RE_TRANSLATE_TYPE trans
, int icase
,
175 pstr
->raw_mbs
= (const unsigned char *) str
;
179 pstr
->icase
= icase
? 1 : 0;
180 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
181 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
182 pstr
->is_utf8
= dfa
->is_utf8
;
183 pstr
->map_notascii
= dfa
->map_notascii
;
184 pstr
->stop
= pstr
->len
;
185 pstr
->raw_stop
= pstr
->stop
;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
203 build_wcs_buffer (re_string_t
*pstr
)
206 unsigned char buf
[MB_LEN_MAX
];
207 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
209 unsigned char buf
[64];
212 int byte_idx
, end_idx
, remain_len
;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
217 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
218 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
223 remain_len
= end_idx
- byte_idx
;
224 prev_st
= pstr
->cur_state
;
225 /* Apply the translation if we need. */
226 if (BE (pstr
->trans
!= NULL
, 0))
230 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
232 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
233 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
235 p
= (const char *) buf
;
238 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
239 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
240 if (BE (mbclen
== (size_t) -1 || mbclen
== 0
241 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
), 0))
243 /* We treat these cases as a singlebyte character. */
245 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
246 if (BE (pstr
->trans
!= NULL
, 0))
247 wc
= pstr
->trans
[wc
];
248 pstr
->cur_state
= prev_st
;
250 else if (BE (mbclen
== (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr
->cur_state
= prev_st
;
257 /* Write wide character and padding. */
258 pstr
->wcs
[byte_idx
++] = wc
;
259 /* Write paddings. */
260 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
261 pstr
->wcs
[byte_idx
++] = WEOF
;
263 pstr
->valid_len
= byte_idx
;
264 pstr
->valid_raw_len
= byte_idx
;
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
271 internal_function __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t
*pstr
)
275 int src_idx
, byte_idx
, end_idx
, remain_len
;
278 char buf
[MB_LEN_MAX
];
279 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
284 byte_idx
= pstr
->valid_len
;
285 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
291 while (byte_idx
< end_idx
)
295 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
296 && mbsinit (&pstr
->cur_state
))
298 /* In case of a singlebyte character. */
300 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
301 /* The next step uses the assumption that wchar_t is encoded
302 ASCII-safe: all ASCII values can be converted like this. */
303 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
308 remain_len
= end_idx
- byte_idx
;
309 prev_st
= pstr
->cur_state
;
310 mbclen
= __mbrtowc (&wc
,
311 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
312 + byte_idx
), remain_len
, &pstr
->cur_state
);
313 if (BE (mbclen
+ 2 > 2, 1))
321 mbcdlen
= wcrtomb (buf
, wcu
, &prev_st
);
322 if (BE (mbclen
== mbcdlen
, 1))
323 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
331 memcpy (pstr
->mbs
+ byte_idx
,
332 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
333 pstr
->wcs
[byte_idx
++] = wcu
;
334 /* Write paddings. */
335 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
336 pstr
->wcs
[byte_idx
++] = WEOF
;
338 else if (mbclen
== (size_t) -1 || mbclen
== 0
339 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
341 /* It is an invalid character, an incomplete character
342 at the end of the string, or '\0'. Just use the byte. */
343 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
344 pstr
->mbs
[byte_idx
] = ch
;
345 /* And also cast it to wide char. */
346 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
347 if (BE (mbclen
== (size_t) -1, 0))
348 pstr
->cur_state
= prev_st
;
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr
->cur_state
= prev_st
;
357 pstr
->valid_len
= byte_idx
;
358 pstr
->valid_raw_len
= byte_idx
;
362 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
367 remain_len
= end_idx
- byte_idx
;
368 prev_st
= pstr
->cur_state
;
369 if (BE (pstr
->trans
!= NULL
, 0))
373 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
375 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
376 buf
[i
] = pstr
->trans
[ch
];
378 p
= (const char *) buf
;
381 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
382 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
383 if (BE (mbclen
+ 2 > 2, 1))
391 mbcdlen
= wcrtomb ((char *) buf
, wcu
, &prev_st
);
392 if (BE (mbclen
== mbcdlen
, 1))
393 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
394 else if (mbcdlen
!= (size_t) -1)
398 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
400 pstr
->cur_state
= prev_st
;
404 if (pstr
->offsets
== NULL
)
406 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
408 if (pstr
->offsets
== NULL
)
411 if (!pstr
->offsets_needed
)
413 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
414 pstr
->offsets
[i
] = i
;
415 pstr
->offsets_needed
= 1;
418 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
419 pstr
->wcs
[byte_idx
] = wcu
;
420 pstr
->offsets
[byte_idx
] = src_idx
;
421 for (i
= 1; i
< mbcdlen
; ++i
)
423 pstr
->offsets
[byte_idx
+ i
]
424 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
425 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
427 pstr
->len
+= mbcdlen
- mbclen
;
428 if (pstr
->raw_stop
> src_idx
)
429 pstr
->stop
+= mbcdlen
- mbclen
;
430 end_idx
= (pstr
->bufs_len
> pstr
->len
)
431 ? pstr
->len
: pstr
->bufs_len
;
437 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
440 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
442 if (BE (pstr
->offsets_needed
!= 0, 0))
445 for (i
= 0; i
< mbclen
; ++i
)
446 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
450 pstr
->wcs
[byte_idx
++] = wcu
;
451 /* Write paddings. */
452 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
453 pstr
->wcs
[byte_idx
++] = WEOF
;
455 else if (mbclen
== (size_t) -1 || mbclen
== 0
456 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
461 if (BE (pstr
->trans
!= NULL
, 0))
462 ch
= pstr
->trans
[ch
];
463 pstr
->mbs
[byte_idx
] = ch
;
465 if (BE (pstr
->offsets_needed
!= 0, 0))
466 pstr
->offsets
[byte_idx
] = src_idx
;
469 /* And also cast it to wide char. */
470 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
471 if (BE (mbclen
== (size_t) -1, 0))
472 pstr
->cur_state
= prev_st
;
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr
->cur_state
= prev_st
;
481 pstr
->valid_len
= byte_idx
;
482 pstr
->valid_raw_len
= src_idx
;
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
491 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
498 /* Skip the characters which are not necessary to check. */
499 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
500 rawbuf_idx
< new_raw_idx
;)
503 int remain_len
= pstr
->len
- rawbuf_idx
;
504 prev_st
= pstr
->cur_state
;
505 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
506 remain_len
, &pstr
->cur_state
);
507 if (BE ((ssize_t
) mbclen
<= 0, 0))
509 /* We treat these cases as a single byte character. */
510 if (mbclen
== 0 || remain_len
== 0)
513 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
515 pstr
->cur_state
= prev_st
;
519 /* Then proceed the next character. */
520 rawbuf_idx
+= mbclen
;
525 #endif /* RE_ENABLE_I18N */
527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
528 This function is used in case of REG_ICASE. */
532 build_upper_buffer (re_string_t
*pstr
)
534 int char_idx
, end_idx
;
535 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
537 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
539 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
540 if (BE (pstr
->trans
!= NULL
, 0))
541 ch
= pstr
->trans
[ch
];
543 pstr
->mbs
[char_idx
] = toupper (ch
);
545 pstr
->mbs
[char_idx
] = ch
;
547 pstr
->valid_len
= char_idx
;
548 pstr
->valid_raw_len
= char_idx
;
551 /* Apply TRANS to the buffer in PSTR. */
555 re_string_translate_buffer (re_string_t
*pstr
)
557 int buf_idx
, end_idx
;
558 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
560 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
562 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
563 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
566 pstr
->valid_len
= buf_idx
;
567 pstr
->valid_raw_len
= buf_idx
;
570 /* This function re-construct the buffers.
571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572 convert to upper case in case of REG_ICASE, apply translation. */
575 internal_function __attribute_warn_unused_result__
576 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
578 int offset
= idx
- pstr
->raw_mbs_idx
;
579 if (BE (offset
< 0, 0))
582 #ifdef RE_ENABLE_I18N
583 if (pstr
->mb_cur_max
> 1)
584 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr
->len
= pstr
->raw_len
;
587 pstr
->stop
= pstr
->raw_stop
;
589 pstr
->raw_mbs_idx
= 0;
590 pstr
->valid_raw_len
= 0;
591 pstr
->offsets_needed
= 0;
592 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
594 if (!pstr
->mbs_allocated
)
595 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
599 if (BE (offset
!= 0, 1))
601 /* Should the already checked characters be kept? */
602 if (BE (offset
< pstr
->valid_raw_len
, 1))
604 /* Yes, move them to the front of the buffer. */
605 #ifdef RE_ENABLE_I18N
606 if (BE (pstr
->offsets_needed
, 0))
608 int low
= 0, high
= pstr
->valid_len
, mid
;
611 mid
= (high
+ low
) / 2;
612 if (pstr
->offsets
[mid
] > offset
)
614 else if (pstr
->offsets
[mid
] < offset
)
620 if (pstr
->offsets
[mid
] < offset
)
622 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr
->valid_len
> offset
629 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
631 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
632 (pstr
->valid_len
- offset
) * sizeof (wint_t));
633 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
634 pstr
->valid_len
-= offset
;
635 pstr
->valid_raw_len
-= offset
;
636 for (low
= 0; low
< pstr
->valid_len
; low
++)
637 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
644 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
645 pstr
->offsets_needed
= 0;
646 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
648 while (mid
< pstr
->valid_len
)
649 if (pstr
->wcs
[mid
] != WEOF
)
653 if (mid
== pstr
->valid_len
)
657 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
660 for (low
= 0; low
< pstr
->valid_len
; ++low
)
661 pstr
->wcs
[low
] = WEOF
;
662 memset (pstr
->mbs
, 255, pstr
->valid_len
);
665 pstr
->valid_raw_len
= pstr
->valid_len
;
671 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
673 #ifdef RE_ENABLE_I18N
674 if (pstr
->mb_cur_max
> 1)
675 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
676 (pstr
->valid_len
- offset
) * sizeof (wint_t));
677 #endif /* RE_ENABLE_I18N */
678 if (BE (pstr
->mbs_allocated
, 0))
679 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
680 pstr
->valid_len
- offset
);
681 pstr
->valid_len
-= offset
;
682 pstr
->valid_raw_len
-= offset
;
684 assert (pstr
->valid_len
> 0);
690 /* No, skip all characters until IDX. */
691 int prev_valid_len
= pstr
->valid_len
;
693 #ifdef RE_ENABLE_I18N
694 if (BE (pstr
->offsets_needed
, 0))
696 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
697 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
698 pstr
->offsets_needed
= 0;
702 #ifdef RE_ENABLE_I18N
703 if (pstr
->mb_cur_max
> 1)
710 const unsigned char *raw
, *p
, *end
;
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
715 end
= raw
+ (offset
- pstr
->mb_cur_max
);
716 if (end
< pstr
->raw_mbs
)
718 p
= raw
+ offset
- 1;
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
724 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
730 for (; p
>= end
; --p
)
731 if ((*p
& 0xc0) != 0x80)
735 int mlen
= raw
+ pstr
->len
- p
;
736 unsigned char buf
[6];
739 const unsigned char *pp
= p
;
740 if (BE (pstr
->trans
!= NULL
, 0))
742 int i
= mlen
< 6 ? mlen
: 6;
744 buf
[i
] = pstr
->trans
[p
[i
]];
747 /* XXX Don't use mbrtowc, we know which conversion
748 to use (UTF-8 -> UCS4). */
749 memset (&cur_state
, 0, sizeof (cur_state
));
750 mbclen
= __mbrtowc (&wc2
, (const char *) pp
, mlen
,
752 if (raw
+ offset
- p
<= mbclen
753 && mbclen
< (size_t) -2)
755 memset (&pstr
->cur_state
, '\0',
757 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
765 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
768 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
770 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
771 && IS_WIDE_WORD_CHAR (wc
))
773 : ((IS_WIDE_NEWLINE (wc
)
774 && pstr
->newline_anchor
)
775 ? CONTEXT_NEWLINE
: 0));
776 if (BE (pstr
->valid_len
, 0))
778 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
779 pstr
->wcs
[wcs_idx
] = WEOF
;
780 if (pstr
->mbs_allocated
)
781 memset (pstr
->mbs
, 255, pstr
->valid_len
);
783 pstr
->valid_raw_len
= pstr
->valid_len
;
786 #endif /* RE_ENABLE_I18N */
788 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
789 pstr
->valid_raw_len
= 0;
792 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
794 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
795 ? CONTEXT_NEWLINE
: 0));
798 if (!BE (pstr
->mbs_allocated
, 0))
801 pstr
->raw_mbs_idx
= idx
;
803 pstr
->stop
-= offset
;
805 /* Then build the buffers. */
806 #ifdef RE_ENABLE_I18N
807 if (pstr
->mb_cur_max
> 1)
811 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
812 if (BE (ret
!= REG_NOERROR
, 0))
816 build_wcs_buffer (pstr
);
819 #endif /* RE_ENABLE_I18N */
820 if (BE (pstr
->mbs_allocated
, 0))
823 build_upper_buffer (pstr
);
824 else if (pstr
->trans
!= NULL
)
825 re_string_translate_buffer (pstr
);
828 pstr
->valid_len
= pstr
->len
;
835 internal_function
__attribute ((pure
))
836 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
840 /* Handle the common (easiest) cases first. */
841 if (BE (!pstr
->mbs_allocated
, 1))
842 return re_string_peek_byte (pstr
, idx
);
844 #ifdef RE_ENABLE_I18N
845 if (pstr
->mb_cur_max
> 1
846 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
847 return re_string_peek_byte (pstr
, idx
);
850 off
= pstr
->cur_idx
+ idx
;
851 #ifdef RE_ENABLE_I18N
852 if (pstr
->offsets_needed
)
853 off
= pstr
->offsets
[off
];
856 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
858 #ifdef RE_ENABLE_I18N
859 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
860 this function returns CAPITAL LETTER I instead of first byte of
861 DOTLESS SMALL LETTER I. The latter would confuse the parser,
862 since peek_byte_case doesn't advance cur_idx in any way. */
863 if (pstr
->offsets_needed
&& !isascii (ch
))
864 return re_string_peek_byte (pstr
, idx
);
872 re_string_fetch_byte_case (re_string_t
*pstr
)
874 if (BE (!pstr
->mbs_allocated
, 1))
875 return re_string_fetch_byte (pstr
);
877 #ifdef RE_ENABLE_I18N
878 if (pstr
->offsets_needed
)
882 /* For tr_TR.UTF-8 [[:islower:]] there is
883 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
884 in that case the whole multi-byte character and return
885 the original letter. On the other side, with
886 [[: DOTLESS SMALL LETTER I return [[:I, as doing
887 anything else would complicate things too much. */
889 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
890 return re_string_fetch_byte (pstr
);
892 off
= pstr
->offsets
[pstr
->cur_idx
];
893 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
896 return re_string_fetch_byte (pstr
);
898 re_string_skip_bytes (pstr
,
899 re_string_char_size_at (pstr
, pstr
->cur_idx
));
904 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
909 re_string_destruct (re_string_t
*pstr
)
911 #ifdef RE_ENABLE_I18N
913 re_free (pstr
->offsets
);
914 #endif /* RE_ENABLE_I18N */
915 if (pstr
->mbs_allocated
)
919 /* Return the context at IDX in INPUT. */
923 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
927 /* In this case, we use the value stored in input->tip_context,
928 since we can't know the character in input->mbs[-1] here. */
929 return input
->tip_context
;
930 if (BE (idx
== input
->len
, 0))
931 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
932 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
933 #ifdef RE_ENABLE_I18N
934 if (input
->mb_cur_max
> 1)
938 while(input
->wcs
[wc_idx
] == WEOF
)
941 /* It must not happen. */
942 assert (wc_idx
>= 0);
946 return input
->tip_context
;
948 wc
= input
->wcs
[wc_idx
];
949 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
951 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
952 ? CONTEXT_NEWLINE
: 0);
957 c
= re_string_byte_at (input
, idx
);
958 if (bitset_contain (input
->word_char
, c
))
960 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
964 /* Functions for set operation. */
967 internal_function __attribute_warn_unused_result__
968 re_node_set_alloc (re_node_set
*set
, int size
)
972 set
->elems
= re_malloc (int, size
);
973 if (BE (set
->elems
== NULL
, 0))
979 internal_function __attribute_warn_unused_result__
980 re_node_set_init_1 (re_node_set
*set
, int elem
)
984 set
->elems
= re_malloc (int, 1);
985 if (BE (set
->elems
== NULL
, 0))
987 set
->alloc
= set
->nelem
= 0;
990 set
->elems
[0] = elem
;
995 internal_function __attribute_warn_unused_result__
996 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
999 set
->elems
= re_malloc (int, 2);
1000 if (BE (set
->elems
== NULL
, 0))
1005 set
->elems
[0] = elem1
;
1012 set
->elems
[0] = elem1
;
1013 set
->elems
[1] = elem2
;
1017 set
->elems
[0] = elem2
;
1018 set
->elems
[1] = elem1
;
1024 static reg_errcode_t
1025 internal_function __attribute_warn_unused_result__
1026 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1028 dest
->nelem
= src
->nelem
;
1031 dest
->alloc
= dest
->nelem
;
1032 dest
->elems
= re_malloc (int, dest
->alloc
);
1033 if (BE (dest
->elems
== NULL
, 0))
1035 dest
->alloc
= dest
->nelem
= 0;
1038 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1041 re_node_set_init_empty (dest
);
1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049 static reg_errcode_t
1050 internal_function __attribute_warn_unused_result__
1051 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1052 const re_node_set
*src2
)
1054 int i1
, i2
, is
, id
, delta
, sbase
;
1055 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1058 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059 conservative estimate. */
1060 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1062 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1063 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1064 if (BE (new_elems
== NULL
, 0))
1066 dest
->elems
= new_elems
;
1067 dest
->alloc
= new_alloc
;
1070 /* Find the items in the intersection of SRC1 and SRC2, and copy
1071 into the top of DEST those that are not already in DEST itself. */
1072 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1073 i1
= src1
->nelem
- 1;
1074 i2
= src2
->nelem
- 1;
1075 id
= dest
->nelem
- 1;
1078 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1080 /* Try to find the item in DEST. Maybe we could binary search? */
1081 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1084 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1085 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1087 if (--i1
< 0 || --i2
< 0)
1091 /* Lower the highest of the two items. */
1092 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1104 id
= dest
->nelem
- 1;
1105 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1106 delta
= is
- sbase
+ 1;
1108 /* Now copy. When DELTA becomes zero, the remaining
1109 DEST elements are already in place; this is more or
1110 less the same loop that is in re_node_set_merge. */
1111 dest
->nelem
+= delta
;
1112 if (delta
> 0 && id
>= 0)
1115 if (dest
->elems
[is
] > dest
->elems
[id
])
1117 /* Copy from the top. */
1118 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1124 /* Slide from the bottom. */
1125 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1131 /* Copy remaining SRC elements. */
1132 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140 static reg_errcode_t
1141 internal_function __attribute_warn_unused_result__
1142 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1143 const re_node_set
*src2
)
1146 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1148 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1149 dest
->elems
= re_malloc (int, dest
->alloc
);
1150 if (BE (dest
->elems
== NULL
, 0))
1155 if (src1
!= NULL
&& src1
->nelem
> 0)
1156 return re_node_set_init_copy (dest
, src1
);
1157 else if (src2
!= NULL
&& src2
->nelem
> 0)
1158 return re_node_set_init_copy (dest
, src2
);
1160 re_node_set_init_empty (dest
);
1163 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1165 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1167 dest
->elems
[id
++] = src2
->elems
[i2
++];
1170 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1172 dest
->elems
[id
++] = src1
->elems
[i1
++];
1174 if (i1
< src1
->nelem
)
1176 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1177 (src1
->nelem
- i1
) * sizeof (int));
1178 id
+= src1
->nelem
- i1
;
1180 else if (i2
< src2
->nelem
)
1182 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1183 (src2
->nelem
- i2
) * sizeof (int));
1184 id
+= src2
->nelem
- i2
;
1190 /* Calculate the union set of the sets DEST and SRC. And store it to
1191 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193 static reg_errcode_t
1194 internal_function __attribute_warn_unused_result__
1195 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1197 int is
, id
, sbase
, delta
;
1198 if (src
== NULL
|| src
->nelem
== 0)
1200 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1202 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1203 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1204 if (BE (new_buffer
== NULL
, 0))
1206 dest
->elems
= new_buffer
;
1207 dest
->alloc
= new_alloc
;
1210 if (BE (dest
->nelem
== 0, 0))
1212 dest
->nelem
= src
->nelem
;
1213 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1217 /* Copy into the top of DEST the items of SRC that are not
1218 found in DEST. Maybe we could binary search in DEST? */
1219 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1220 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1222 if (dest
->elems
[id
] == src
->elems
[is
])
1224 else if (dest
->elems
[id
] < src
->elems
[is
])
1225 dest
->elems
[--sbase
] = src
->elems
[is
--];
1226 else /* if (dest->elems[id] > src->elems[is]) */
1232 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1234 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1237 id
= dest
->nelem
- 1;
1238 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1239 delta
= is
- sbase
+ 1;
1243 /* Now copy. When DELTA becomes zero, the remaining
1244 DEST elements are already in place. */
1245 dest
->nelem
+= delta
;
1248 if (dest
->elems
[is
] > dest
->elems
[id
])
1250 /* Copy from the top. */
1251 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1257 /* Slide from the bottom. */
1258 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1261 /* Copy remaining SRC elements. */
1262 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1263 delta
* sizeof (int));
1272 /* Insert the new element ELEM to the re_node_set* SET.
1273 SET should not already have ELEM.
1274 return -1 if an error is occured, return 1 otherwise. */
1277 internal_function __attribute_warn_unused_result__
1278 re_node_set_insert (re_node_set
*set
, int elem
)
1281 /* In case the set is empty. */
1282 if (set
->alloc
== 0)
1284 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1290 if (BE (set
->nelem
, 0) == 0)
1292 /* We already guaranteed above that set->alloc != 0. */
1293 set
->elems
[0] = elem
;
1298 /* Realloc if we need. */
1299 if (set
->alloc
== set
->nelem
)
1302 set
->alloc
= set
->alloc
* 2;
1303 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1304 if (BE (new_elems
== NULL
, 0))
1306 set
->elems
= new_elems
;
1309 /* Move the elements which follows the new element. Test the
1310 first element separately to skip a check in the inner loop. */
1311 if (elem
< set
->elems
[0])
1314 for (idx
= set
->nelem
; idx
> 0; idx
--)
1315 set
->elems
[idx
] = set
->elems
[idx
- 1];
1319 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1320 set
->elems
[idx
] = set
->elems
[idx
- 1];
1323 /* Insert the new element. */
1324 set
->elems
[idx
] = elem
;
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330 SET should not already have any element greater than or equal to ELEM.
1331 Return -1 if an error is occured, return 1 otherwise. */
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set
*set
, int elem
)
1337 /* Realloc if we need. */
1338 if (set
->alloc
== set
->nelem
)
1341 set
->alloc
= (set
->alloc
+ 1) * 2;
1342 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1343 if (BE (new_elems
== NULL
, 0))
1345 set
->elems
= new_elems
;
1348 /* Insert the new element. */
1349 set
->elems
[set
->nelem
++] = elem
;
1353 /* Compare two node sets SET1 and SET2.
1354 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1357 internal_function
__attribute ((pure
))
1358 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1361 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1363 for (i
= set1
->nelem
; --i
>= 0 ; )
1364 if (set1
->elems
[i
] != set2
->elems
[i
])
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1372 internal_function
__attribute ((pure
))
1373 re_node_set_contains (const re_node_set
*set
, int elem
)
1375 unsigned int idx
, right
, mid
;
1376 if (set
->nelem
<= 0)
1379 /* Binary search the element. */
1381 right
= set
->nelem
- 1;
1384 mid
= (idx
+ right
) / 2;
1385 if (set
->elems
[mid
] < elem
)
1390 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1395 re_node_set_remove_at (re_node_set
*set
, int idx
)
1397 if (idx
< 0 || idx
>= set
->nelem
)
1400 for (; idx
< set
->nelem
; idx
++)
1401 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406 Or return -1, if an error will be occured. */
1410 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1412 int type
= token
.type
;
1413 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1415 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1416 int *new_nexts
, *new_indices
;
1417 re_node_set
*new_edests
, *new_eclosures
;
1418 re_token_t
*new_nodes
;
1420 /* Avoid overflows in realloc. */
1421 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1422 MAX (sizeof (re_node_set
),
1424 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1427 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1428 if (BE (new_nodes
== NULL
, 0))
1430 dfa
->nodes
= new_nodes
;
1431 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1432 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1433 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1434 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1435 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1436 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1438 dfa
->nexts
= new_nexts
;
1439 dfa
->org_indices
= new_indices
;
1440 dfa
->edests
= new_edests
;
1441 dfa
->eclosures
= new_eclosures
;
1442 dfa
->nodes_alloc
= new_nodes_alloc
;
1444 dfa
->nodes
[dfa
->nodes_len
] = token
;
1445 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1446 #ifdef RE_ENABLE_I18N
1447 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1448 (type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || type
== COMPLEX_BRACKET
;
1450 dfa
->nexts
[dfa
->nodes_len
] = -1;
1451 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1452 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1453 return dfa
->nodes_len
++;
1456 static inline unsigned int
1458 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1460 unsigned int hash
= nodes
->nelem
+ context
;
1462 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1463 hash
+= nodes
->elems
[i
];
1467 /* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1476 static re_dfastate_t
*
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1479 const re_node_set
*nodes
)
1482 re_dfastate_t
*new_state
;
1483 struct re_state_table_entry
*spot
;
1485 if (BE (nodes
->nelem
== 0, 0))
1490 hash
= calc_state_hash (nodes
, 0);
1491 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1493 for (i
= 0 ; i
< spot
->num
; i
++)
1495 re_dfastate_t
*state
= spot
->array
[i
];
1496 if (hash
!= state
->hash
)
1498 if (re_node_set_compare (&state
->nodes
, nodes
))
1502 /* There are no appropriate state in the dfa, create the new one. */
1503 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1504 if (BE (new_state
== NULL
, 0))
1510 /* Search for the state whose node_set is equivalent to NODES and
1511 whose context is equivalent to CONTEXT.
1512 Return the pointer to the state, if we found it in the DFA.
1513 Otherwise create the new one and return it. In case of an error
1514 return NULL and set the error code in ERR.
1515 Note: - We assume NULL as the invalid state, then it is possible that
1516 return value is NULL and ERR is REG_NOERROR.
1517 - We never return non-NULL value in case of any errors, it is for
1520 static re_dfastate_t
*
1521 internal_function __attribute_warn_unused_result__
1522 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1523 const re_node_set
*nodes
, unsigned int context
)
1526 re_dfastate_t
*new_state
;
1527 struct re_state_table_entry
*spot
;
1529 if (nodes
->nelem
== 0)
1534 hash
= calc_state_hash (nodes
, context
);
1535 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1537 for (i
= 0 ; i
< spot
->num
; i
++)
1539 re_dfastate_t
*state
= spot
->array
[i
];
1540 if (state
->hash
== hash
1541 && state
->context
== context
1542 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1545 /* There are no appropriate state in `dfa', create the new one. */
1546 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1547 if (BE (new_state
== NULL
, 0))
1553 /* Finish initialization of the new state NEWSTATE, and using its hash value
1554 HASH put in the appropriate bucket of DFA's state table. Return value
1555 indicates the error code if failed. */
1557 static reg_errcode_t
1558 __attribute_warn_unused_result__
1559 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1562 struct re_state_table_entry
*spot
;
1566 newstate
->hash
= hash
;
1567 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1568 if (BE (err
!= REG_NOERROR
, 0))
1570 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1572 int elem
= newstate
->nodes
.elems
[i
];
1573 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1574 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1578 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1579 if (BE (spot
->alloc
<= spot
->num
, 0))
1581 int new_alloc
= 2 * spot
->num
+ 2;
1582 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1584 if (BE (new_array
== NULL
, 0))
1586 spot
->array
= new_array
;
1587 spot
->alloc
= new_alloc
;
1589 spot
->array
[spot
->num
++] = newstate
;
1594 free_state (re_dfastate_t
*state
)
1596 re_node_set_free (&state
->non_eps_nodes
);
1597 re_node_set_free (&state
->inveclosure
);
1598 if (state
->entrance_nodes
!= &state
->nodes
)
1600 re_node_set_free (state
->entrance_nodes
);
1601 re_free (state
->entrance_nodes
);
1603 re_node_set_free (&state
->nodes
);
1604 re_free (state
->word_trtable
);
1605 re_free (state
->trtable
);
1609 /* Create the new state which is independ of contexts.
1610 Return the new state if succeeded, otherwise return NULL. */
1612 static re_dfastate_t
*
1613 internal_function __attribute_warn_unused_result__
1614 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1619 re_dfastate_t
*newstate
;
1621 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1622 if (BE (newstate
== NULL
, 0))
1624 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1625 if (BE (err
!= REG_NOERROR
, 0))
1631 newstate
->entrance_nodes
= &newstate
->nodes
;
1632 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1634 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1635 re_token_type_t type
= node
->type
;
1636 if (type
== CHARACTER
&& !node
->constraint
)
1638 #ifdef RE_ENABLE_I18N
1639 newstate
->accept_mb
|= node
->accept_mb
;
1640 #endif /* RE_ENABLE_I18N */
1642 /* If the state has the halt node, the state is a halt state. */
1643 if (type
== END_OF_RE
)
1645 else if (type
== OP_BACK_REF
)
1646 newstate
->has_backref
= 1;
1647 else if (type
== ANCHOR
|| node
->constraint
)
1648 newstate
->has_constraint
= 1;
1650 err
= register_state (dfa
, newstate
, hash
);
1651 if (BE (err
!= REG_NOERROR
, 0))
1653 free_state (newstate
);
1659 /* Create the new state which is depend on the context CONTEXT.
1660 Return the new state if succeeded, otherwise return NULL. */
1662 static re_dfastate_t
*
1663 internal_function __attribute_warn_unused_result__
1664 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1665 unsigned int context
, unsigned int hash
)
1667 int i
, nctx_nodes
= 0;
1669 re_dfastate_t
*newstate
;
1671 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1672 if (BE (newstate
== NULL
, 0))
1674 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1675 if (BE (err
!= REG_NOERROR
, 0))
1681 newstate
->context
= context
;
1682 newstate
->entrance_nodes
= &newstate
->nodes
;
1684 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1686 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1687 re_token_type_t type
= node
->type
;
1688 unsigned int constraint
= node
->constraint
;
1690 if (type
== CHARACTER
&& !constraint
)
1692 #ifdef RE_ENABLE_I18N
1693 newstate
->accept_mb
|= node
->accept_mb
;
1694 #endif /* RE_ENABLE_I18N */
1696 /* If the state has the halt node, the state is a halt state. */
1697 if (type
== END_OF_RE
)
1699 else if (type
== OP_BACK_REF
)
1700 newstate
->has_backref
= 1;
1704 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1706 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1707 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1709 free_state (newstate
);
1712 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1716 newstate
->has_constraint
= 1;
1719 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1721 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1726 err
= register_state (dfa
, newstate
, hash
);
1727 if (BE (err
!= REG_NOERROR
, 0))
1729 free_state (newstate
);