1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str
, int len
,
23 RE_TRANSLATE_TYPE trans
, int icase
,
24 const re_dfa_t
*dfa
) internal_function
;
25 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
26 const re_node_set
*nodes
,
27 unsigned int hash
) internal_function
;
28 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
29 const re_node_set
*nodes
,
31 unsigned int hash
) internal_function
;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
41 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len
< dfa
->mb_cur_max
)
48 init_len
= dfa
->mb_cur_max
;
49 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
50 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
52 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
53 if (BE (ret
!= REG_NOERROR
, 0))
56 pstr
->word_char
= dfa
->word_char
;
57 pstr
->word_ops_used
= dfa
->word_ops_used
;
58 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
59 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
60 pstr
->valid_raw_len
= pstr
->valid_len
;
64 /* This function allocate the buffers, and initialize them. */
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
69 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
72 memset (pstr
, '\0', sizeof (re_string_t
));
73 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
77 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
78 if (BE (ret
!= REG_NOERROR
, 0))
81 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
86 if (dfa
->mb_cur_max
> 1)
90 ret
= build_wcs_upper_buffer (pstr
);
91 if (BE (ret
!= REG_NOERROR
, 0))
93 if (pstr
->valid_raw_len
>= len
)
95 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
97 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
98 if (BE (ret
!= REG_NOERROR
, 0))
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr
);
108 #ifdef RE_ENABLE_I18N
109 if (dfa
->mb_cur_max
> 1)
110 build_wcs_buffer (pstr
);
112 #endif /* RE_ENABLE_I18N */
115 re_string_translate_buffer (pstr
);
118 pstr
->valid_len
= pstr
->bufs_len
;
119 pstr
->valid_raw_len
= pstr
->bufs_len
;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
133 #ifdef RE_ENABLE_I18N
134 if (pstr
->mb_cur_max
> 1)
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
143 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
144 if (BE (new_wcs
== NULL
, 0))
147 if (pstr
->offsets
!= NULL
)
149 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
150 if (BE (new_offsets
== NULL
, 0))
152 pstr
->offsets
= new_offsets
;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr
->mbs_allocated
)
158 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
160 if (BE (new_mbs
== NULL
, 0))
164 pstr
->bufs_len
= new_buf_len
;
171 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
172 RE_TRANSLATE_TYPE trans
, int icase
,
175 pstr
->raw_mbs
= (const unsigned char *) str
;
179 pstr
->icase
= icase
? 1 : 0;
180 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
181 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
182 pstr
->is_utf8
= dfa
->is_utf8
;
183 pstr
->map_notascii
= dfa
->map_notascii
;
184 pstr
->stop
= pstr
->len
;
185 pstr
->raw_stop
= pstr
->stop
;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
203 build_wcs_buffer (re_string_t
*pstr
)
206 unsigned char buf
[MB_LEN_MAX
];
207 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
209 unsigned char buf
[64];
212 int byte_idx
, end_idx
, remain_len
;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
217 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
218 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
223 remain_len
= end_idx
- byte_idx
;
224 prev_st
= pstr
->cur_state
;
225 /* Apply the translation if we need. */
226 if (BE (pstr
->trans
!= NULL
, 0))
230 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
232 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
233 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
235 p
= (const char *) buf
;
238 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
239 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
240 if (BE (mbclen
== (size_t) -1 || mbclen
== 0
241 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
), 0))
243 /* We treat these cases as a singlebyte character. */
245 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
246 if (BE (pstr
->trans
!= NULL
, 0))
247 wc
= pstr
->trans
[wc
];
248 pstr
->cur_state
= prev_st
;
250 else if (BE (mbclen
== (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr
->cur_state
= prev_st
;
257 /* Write wide character and padding. */
258 pstr
->wcs
[byte_idx
++] = wc
;
259 /* Write paddings. */
260 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
261 pstr
->wcs
[byte_idx
++] = WEOF
;
263 pstr
->valid_len
= byte_idx
;
264 pstr
->valid_raw_len
= byte_idx
;
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
271 internal_function __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t
*pstr
)
275 int src_idx
, byte_idx
, end_idx
, remain_len
;
278 char buf
[MB_LEN_MAX
];
279 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
284 byte_idx
= pstr
->valid_len
;
285 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
291 while (byte_idx
< end_idx
)
295 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
296 && mbsinit (&pstr
->cur_state
))
298 /* In case of a singlebyte character. */
300 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
301 /* The next step uses the assumption that wchar_t is encoded
302 ASCII-safe: all ASCII values can be converted like this. */
303 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
308 remain_len
= end_idx
- byte_idx
;
309 prev_st
= pstr
->cur_state
;
310 mbclen
= __mbrtowc (&wc
,
311 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
312 + byte_idx
), remain_len
, &pstr
->cur_state
);
313 if (BE (mbclen
+ 2 > 2, 1))
321 mbcdlen
= wcrtomb (buf
, wcu
, &prev_st
);
322 if (BE (mbclen
== mbcdlen
, 1))
323 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
331 memcpy (pstr
->mbs
+ byte_idx
,
332 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
333 pstr
->wcs
[byte_idx
++] = wcu
;
334 /* Write paddings. */
335 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
336 pstr
->wcs
[byte_idx
++] = WEOF
;
338 else if (mbclen
== (size_t) -1 || mbclen
== 0
339 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
341 /* It is an invalid character, an incomplete character
342 at the end of the string, or '\0'. Just use the byte. */
343 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
344 pstr
->mbs
[byte_idx
] = ch
;
345 /* And also cast it to wide char. */
346 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
347 if (BE (mbclen
== (size_t) -1, 0))
348 pstr
->cur_state
= prev_st
;
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr
->cur_state
= prev_st
;
357 pstr
->valid_len
= byte_idx
;
358 pstr
->valid_raw_len
= byte_idx
;
362 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
367 remain_len
= end_idx
- byte_idx
;
368 prev_st
= pstr
->cur_state
;
369 if (BE (pstr
->trans
!= NULL
, 0))
373 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
375 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
376 buf
[i
] = pstr
->trans
[ch
];
378 p
= (const char *) buf
;
381 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
382 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
383 if (BE (mbclen
+ 2 > 2, 1))
391 mbcdlen
= wcrtomb ((char *) buf
, wcu
, &prev_st
);
392 if (BE (mbclen
== mbcdlen
, 1))
393 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
394 else if (mbcdlen
!= (size_t) -1)
398 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
400 pstr
->cur_state
= prev_st
;
404 if (pstr
->offsets
== NULL
)
406 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
408 if (pstr
->offsets
== NULL
)
411 if (!pstr
->offsets_needed
)
413 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
414 pstr
->offsets
[i
] = i
;
415 pstr
->offsets_needed
= 1;
418 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
419 pstr
->wcs
[byte_idx
] = wcu
;
420 pstr
->offsets
[byte_idx
] = src_idx
;
421 for (i
= 1; i
< mbcdlen
; ++i
)
423 pstr
->offsets
[byte_idx
+ i
]
424 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
425 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
427 pstr
->len
+= mbcdlen
- mbclen
;
428 if (pstr
->raw_stop
> src_idx
)
429 pstr
->stop
+= mbcdlen
- mbclen
;
430 end_idx
= (pstr
->bufs_len
> pstr
->len
)
431 ? pstr
->len
: pstr
->bufs_len
;
437 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
440 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
442 if (BE (pstr
->offsets_needed
!= 0, 0))
445 for (i
= 0; i
< mbclen
; ++i
)
446 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
450 pstr
->wcs
[byte_idx
++] = wcu
;
451 /* Write paddings. */
452 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
453 pstr
->wcs
[byte_idx
++] = WEOF
;
455 else if (mbclen
== (size_t) -1 || mbclen
== 0
456 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
461 if (BE (pstr
->trans
!= NULL
, 0))
462 ch
= pstr
->trans
[ch
];
463 pstr
->mbs
[byte_idx
] = ch
;
465 if (BE (pstr
->offsets_needed
!= 0, 0))
466 pstr
->offsets
[byte_idx
] = src_idx
;
469 /* And also cast it to wide char. */
470 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
471 if (BE (mbclen
== (size_t) -1, 0))
472 pstr
->cur_state
= prev_st
;
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr
->cur_state
= prev_st
;
481 pstr
->valid_len
= byte_idx
;
482 pstr
->valid_raw_len
= src_idx
;
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
491 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
498 /* Skip the characters which are not necessary to check. */
499 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
500 rawbuf_idx
< new_raw_idx
;)
503 int remain_len
= pstr
->len
- rawbuf_idx
;
504 prev_st
= pstr
->cur_state
;
505 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
506 remain_len
, &pstr
->cur_state
);
507 if (BE ((ssize_t
) mbclen
<= 0, 0))
509 /* We treat these cases as a single byte character. */
510 if (mbclen
== 0 || remain_len
== 0)
513 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
515 pstr
->cur_state
= prev_st
;
519 /* Then proceed the next character. */
520 rawbuf_idx
+= mbclen
;
525 #endif /* RE_ENABLE_I18N */
527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
528 This function is used in case of REG_ICASE. */
532 build_upper_buffer (re_string_t
*pstr
)
534 int char_idx
, end_idx
;
535 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
537 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
539 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
540 if (BE (pstr
->trans
!= NULL
, 0))
541 ch
= pstr
->trans
[ch
];
543 pstr
->mbs
[char_idx
] = toupper (ch
);
545 pstr
->mbs
[char_idx
] = ch
;
547 pstr
->valid_len
= char_idx
;
548 pstr
->valid_raw_len
= char_idx
;
551 /* Apply TRANS to the buffer in PSTR. */
555 re_string_translate_buffer (re_string_t
*pstr
)
557 int buf_idx
, end_idx
;
558 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
560 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
562 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
563 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
566 pstr
->valid_len
= buf_idx
;
567 pstr
->valid_raw_len
= buf_idx
;
570 /* This function re-construct the buffers.
571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572 convert to upper case in case of REG_ICASE, apply translation. */
575 internal_function __attribute_warn_unused_result__
576 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
578 int offset
= idx
- pstr
->raw_mbs_idx
;
579 if (BE (offset
< 0, 0))
582 #ifdef RE_ENABLE_I18N
583 if (pstr
->mb_cur_max
> 1)
584 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr
->len
= pstr
->raw_len
;
587 pstr
->stop
= pstr
->raw_stop
;
589 pstr
->raw_mbs_idx
= 0;
590 pstr
->valid_raw_len
= 0;
591 pstr
->offsets_needed
= 0;
592 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
594 if (!pstr
->mbs_allocated
)
595 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
599 if (BE (offset
!= 0, 1))
601 /* Should the already checked characters be kept? */
602 if (BE (offset
< pstr
->valid_raw_len
, 1))
604 /* Yes, move them to the front of the buffer. */
605 #ifdef RE_ENABLE_I18N
606 if (BE (pstr
->offsets_needed
, 0))
608 int low
= 0, high
= pstr
->valid_len
, mid
;
611 mid
= (high
+ low
) / 2;
612 if (pstr
->offsets
[mid
] > offset
)
614 else if (pstr
->offsets
[mid
] < offset
)
620 if (pstr
->offsets
[mid
] < offset
)
622 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr
->valid_len
> offset
629 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
631 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
632 (pstr
->valid_len
- offset
) * sizeof (wint_t));
633 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
634 pstr
->valid_len
-= offset
;
635 pstr
->valid_raw_len
-= offset
;
636 for (low
= 0; low
< pstr
->valid_len
; low
++)
637 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
644 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
645 pstr
->offsets_needed
= 0;
646 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
648 while (mid
< pstr
->valid_len
)
649 if (pstr
->wcs
[mid
] != WEOF
)
653 if (mid
== pstr
->valid_len
)
657 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
660 for (low
= 0; low
< pstr
->valid_len
; ++low
)
661 pstr
->wcs
[low
] = WEOF
;
662 memset (pstr
->mbs
, 255, pstr
->valid_len
);
665 pstr
->valid_raw_len
= pstr
->valid_len
;
671 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
673 #ifdef RE_ENABLE_I18N
674 if (pstr
->mb_cur_max
> 1)
675 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
676 (pstr
->valid_len
- offset
) * sizeof (wint_t));
677 #endif /* RE_ENABLE_I18N */
678 if (BE (pstr
->mbs_allocated
, 0))
679 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
680 pstr
->valid_len
- offset
);
681 pstr
->valid_len
-= offset
;
682 pstr
->valid_raw_len
-= offset
;
684 assert (pstr
->valid_len
> 0);
690 /* No, skip all characters until IDX. */
691 int prev_valid_len
= pstr
->valid_len
;
693 #ifdef RE_ENABLE_I18N
694 if (BE (pstr
->offsets_needed
, 0))
696 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
697 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
698 pstr
->offsets_needed
= 0;
702 #ifdef RE_ENABLE_I18N
703 if (pstr
->mb_cur_max
> 1)
710 const unsigned char *raw
, *p
, *end
;
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
715 end
= raw
+ (offset
- pstr
->mb_cur_max
);
716 if (end
< pstr
->raw_mbs
)
718 p
= raw
+ offset
- 1;
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
724 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
730 for (; p
>= end
; --p
)
731 if ((*p
& 0xc0) != 0x80)
735 int mlen
= raw
+ pstr
->len
- p
;
736 unsigned char buf
[6];
739 if (BE (pstr
->trans
!= NULL
, 0))
741 int i
= mlen
< 6 ? mlen
: 6;
743 buf
[i
] = pstr
->trans
[p
[i
]];
745 /* XXX Don't use mbrtowc, we know which conversion
746 to use (UTF-8 -> UCS4). */
747 memset (&cur_state
, 0, sizeof (cur_state
));
748 mbclen
= __mbrtowc (&wc2
, (const char *) p
, mlen
,
750 if (raw
+ offset
- p
<= mbclen
751 && mbclen
< (size_t) -2)
753 memset (&pstr
->cur_state
, '\0',
755 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
763 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
766 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
768 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
769 && IS_WIDE_WORD_CHAR (wc
))
771 : ((IS_WIDE_NEWLINE (wc
)
772 && pstr
->newline_anchor
)
773 ? CONTEXT_NEWLINE
: 0));
774 if (BE (pstr
->valid_len
, 0))
776 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
777 pstr
->wcs
[wcs_idx
] = WEOF
;
778 if (pstr
->mbs_allocated
)
779 memset (pstr
->mbs
, 255, pstr
->valid_len
);
781 pstr
->valid_raw_len
= pstr
->valid_len
;
784 #endif /* RE_ENABLE_I18N */
786 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
787 pstr
->valid_raw_len
= 0;
790 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
792 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
793 ? CONTEXT_NEWLINE
: 0));
796 if (!BE (pstr
->mbs_allocated
, 0))
799 pstr
->raw_mbs_idx
= idx
;
801 pstr
->stop
-= offset
;
803 /* Then build the buffers. */
804 #ifdef RE_ENABLE_I18N
805 if (pstr
->mb_cur_max
> 1)
809 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
810 if (BE (ret
!= REG_NOERROR
, 0))
814 build_wcs_buffer (pstr
);
817 #endif /* RE_ENABLE_I18N */
818 if (BE (pstr
->mbs_allocated
, 0))
821 build_upper_buffer (pstr
);
822 else if (pstr
->trans
!= NULL
)
823 re_string_translate_buffer (pstr
);
826 pstr
->valid_len
= pstr
->len
;
833 internal_function
__attribute ((pure
))
834 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
838 /* Handle the common (easiest) cases first. */
839 if (BE (!pstr
->mbs_allocated
, 1))
840 return re_string_peek_byte (pstr
, idx
);
842 #ifdef RE_ENABLE_I18N
843 if (pstr
->mb_cur_max
> 1
844 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
845 return re_string_peek_byte (pstr
, idx
);
848 off
= pstr
->cur_idx
+ idx
;
849 #ifdef RE_ENABLE_I18N
850 if (pstr
->offsets_needed
)
851 off
= pstr
->offsets
[off
];
854 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
856 #ifdef RE_ENABLE_I18N
857 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858 this function returns CAPITAL LETTER I instead of first byte of
859 DOTLESS SMALL LETTER I. The latter would confuse the parser,
860 since peek_byte_case doesn't advance cur_idx in any way. */
861 if (pstr
->offsets_needed
&& !isascii (ch
))
862 return re_string_peek_byte (pstr
, idx
);
869 internal_function
__attribute ((pure
))
870 re_string_fetch_byte_case (re_string_t
*pstr
)
872 if (BE (!pstr
->mbs_allocated
, 1))
873 return re_string_fetch_byte (pstr
);
875 #ifdef RE_ENABLE_I18N
876 if (pstr
->offsets_needed
)
880 /* For tr_TR.UTF-8 [[:islower:]] there is
881 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
882 in that case the whole multi-byte character and return
883 the original letter. On the other side, with
884 [[: DOTLESS SMALL LETTER I return [[:I, as doing
885 anything else would complicate things too much. */
887 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
888 return re_string_fetch_byte (pstr
);
890 off
= pstr
->offsets
[pstr
->cur_idx
];
891 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
894 return re_string_fetch_byte (pstr
);
896 re_string_skip_bytes (pstr
,
897 re_string_char_size_at (pstr
, pstr
->cur_idx
));
902 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
907 re_string_destruct (re_string_t
*pstr
)
909 #ifdef RE_ENABLE_I18N
911 re_free (pstr
->offsets
);
912 #endif /* RE_ENABLE_I18N */
913 if (pstr
->mbs_allocated
)
917 /* Return the context at IDX in INPUT. */
921 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
925 /* In this case, we use the value stored in input->tip_context,
926 since we can't know the character in input->mbs[-1] here. */
927 return input
->tip_context
;
928 if (BE (idx
== input
->len
, 0))
929 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
930 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
931 #ifdef RE_ENABLE_I18N
932 if (input
->mb_cur_max
> 1)
936 while(input
->wcs
[wc_idx
] == WEOF
)
939 /* It must not happen. */
940 assert (wc_idx
>= 0);
944 return input
->tip_context
;
946 wc
= input
->wcs
[wc_idx
];
947 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
949 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
950 ? CONTEXT_NEWLINE
: 0);
955 c
= re_string_byte_at (input
, idx
);
956 if (bitset_contain (input
->word_char
, c
))
958 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
962 /* Functions for set operation. */
965 internal_function __attribute_warn_unused_result__
966 re_node_set_alloc (re_node_set
*set
, int size
)
970 set
->elems
= re_malloc (int, size
);
971 if (BE (set
->elems
== NULL
, 0))
977 internal_function __attribute_warn_unused_result__
978 re_node_set_init_1 (re_node_set
*set
, int elem
)
982 set
->elems
= re_malloc (int, 1);
983 if (BE (set
->elems
== NULL
, 0))
985 set
->alloc
= set
->nelem
= 0;
988 set
->elems
[0] = elem
;
993 internal_function __attribute_warn_unused_result__
994 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
997 set
->elems
= re_malloc (int, 2);
998 if (BE (set
->elems
== NULL
, 0))
1003 set
->elems
[0] = elem1
;
1010 set
->elems
[0] = elem1
;
1011 set
->elems
[1] = elem2
;
1015 set
->elems
[0] = elem2
;
1016 set
->elems
[1] = elem1
;
1022 static reg_errcode_t
1023 internal_function __attribute_warn_unused_result__
1024 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1026 dest
->nelem
= src
->nelem
;
1029 dest
->alloc
= dest
->nelem
;
1030 dest
->elems
= re_malloc (int, dest
->alloc
);
1031 if (BE (dest
->elems
== NULL
, 0))
1033 dest
->alloc
= dest
->nelem
= 0;
1036 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1039 re_node_set_init_empty (dest
);
1043 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1044 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1045 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1047 static reg_errcode_t
1048 internal_function __attribute_warn_unused_result__
1049 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1050 const re_node_set
*src2
)
1052 int i1
, i2
, is
, id
, delta
, sbase
;
1053 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1056 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1057 conservative estimate. */
1058 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1060 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1061 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1062 if (BE (new_elems
== NULL
, 0))
1064 dest
->elems
= new_elems
;
1065 dest
->alloc
= new_alloc
;
1068 /* Find the items in the intersection of SRC1 and SRC2, and copy
1069 into the top of DEST those that are not already in DEST itself. */
1070 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1071 i1
= src1
->nelem
- 1;
1072 i2
= src2
->nelem
- 1;
1073 id
= dest
->nelem
- 1;
1076 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1078 /* Try to find the item in DEST. Maybe we could binary search? */
1079 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1082 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1083 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1085 if (--i1
< 0 || --i2
< 0)
1089 /* Lower the highest of the two items. */
1090 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1102 id
= dest
->nelem
- 1;
1103 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1104 delta
= is
- sbase
+ 1;
1106 /* Now copy. When DELTA becomes zero, the remaining
1107 DEST elements are already in place; this is more or
1108 less the same loop that is in re_node_set_merge. */
1109 dest
->nelem
+= delta
;
1110 if (delta
> 0 && id
>= 0)
1113 if (dest
->elems
[is
] > dest
->elems
[id
])
1115 /* Copy from the top. */
1116 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1122 /* Slide from the bottom. */
1123 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1129 /* Copy remaining SRC elements. */
1130 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1135 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1136 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1138 static reg_errcode_t
1139 internal_function __attribute_warn_unused_result__
1140 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1141 const re_node_set
*src2
)
1144 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1146 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1147 dest
->elems
= re_malloc (int, dest
->alloc
);
1148 if (BE (dest
->elems
== NULL
, 0))
1153 if (src1
!= NULL
&& src1
->nelem
> 0)
1154 return re_node_set_init_copy (dest
, src1
);
1155 else if (src2
!= NULL
&& src2
->nelem
> 0)
1156 return re_node_set_init_copy (dest
, src2
);
1158 re_node_set_init_empty (dest
);
1161 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1163 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1165 dest
->elems
[id
++] = src2
->elems
[i2
++];
1168 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1170 dest
->elems
[id
++] = src1
->elems
[i1
++];
1172 if (i1
< src1
->nelem
)
1174 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1175 (src1
->nelem
- i1
) * sizeof (int));
1176 id
+= src1
->nelem
- i1
;
1178 else if (i2
< src2
->nelem
)
1180 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1181 (src2
->nelem
- i2
) * sizeof (int));
1182 id
+= src2
->nelem
- i2
;
1188 /* Calculate the union set of the sets DEST and SRC. And store it to
1189 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1191 static reg_errcode_t
1192 internal_function __attribute_warn_unused_result__
1193 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1195 int is
, id
, sbase
, delta
;
1196 if (src
== NULL
|| src
->nelem
== 0)
1198 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1200 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1201 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1202 if (BE (new_buffer
== NULL
, 0))
1204 dest
->elems
= new_buffer
;
1205 dest
->alloc
= new_alloc
;
1208 if (BE (dest
->nelem
== 0, 0))
1210 dest
->nelem
= src
->nelem
;
1211 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1215 /* Copy into the top of DEST the items of SRC that are not
1216 found in DEST. Maybe we could binary search in DEST? */
1217 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1218 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1220 if (dest
->elems
[id
] == src
->elems
[is
])
1222 else if (dest
->elems
[id
] < src
->elems
[is
])
1223 dest
->elems
[--sbase
] = src
->elems
[is
--];
1224 else /* if (dest->elems[id] > src->elems[is]) */
1230 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1232 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1235 id
= dest
->nelem
- 1;
1236 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1237 delta
= is
- sbase
+ 1;
1241 /* Now copy. When DELTA becomes zero, the remaining
1242 DEST elements are already in place. */
1243 dest
->nelem
+= delta
;
1246 if (dest
->elems
[is
] > dest
->elems
[id
])
1248 /* Copy from the top. */
1249 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1255 /* Slide from the bottom. */
1256 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1259 /* Copy remaining SRC elements. */
1260 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1261 delta
* sizeof (int));
1270 /* Insert the new element ELEM to the re_node_set* SET.
1271 SET should not already have ELEM.
1272 return -1 if an error is occured, return 1 otherwise. */
1275 internal_function __attribute_warn_unused_result__
1276 re_node_set_insert (re_node_set
*set
, int elem
)
1279 /* In case the set is empty. */
1280 if (set
->alloc
== 0)
1282 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1288 if (BE (set
->nelem
, 0) == 0)
1290 /* We already guaranteed above that set->alloc != 0. */
1291 set
->elems
[0] = elem
;
1296 /* Realloc if we need. */
1297 if (set
->alloc
== set
->nelem
)
1300 set
->alloc
= set
->alloc
* 2;
1301 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1302 if (BE (new_elems
== NULL
, 0))
1304 set
->elems
= new_elems
;
1307 /* Move the elements which follows the new element. Test the
1308 first element separately to skip a check in the inner loop. */
1309 if (elem
< set
->elems
[0])
1312 for (idx
= set
->nelem
; idx
> 0; idx
--)
1313 set
->elems
[idx
] = set
->elems
[idx
- 1];
1317 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1318 set
->elems
[idx
] = set
->elems
[idx
- 1];
1321 /* Insert the new element. */
1322 set
->elems
[idx
] = elem
;
1327 /* Insert the new element ELEM to the re_node_set* SET.
1328 SET should not already have any element greater than or equal to ELEM.
1329 Return -1 if an error is occured, return 1 otherwise. */
1332 internal_function __attribute_warn_unused_result__
1333 re_node_set_insert_last (re_node_set
*set
, int elem
)
1335 /* Realloc if we need. */
1336 if (set
->alloc
== set
->nelem
)
1339 set
->alloc
= (set
->alloc
+ 1) * 2;
1340 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1341 if (BE (new_elems
== NULL
, 0))
1343 set
->elems
= new_elems
;
1346 /* Insert the new element. */
1347 set
->elems
[set
->nelem
++] = elem
;
1351 /* Compare two node sets SET1 and SET2.
1352 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1355 internal_function
__attribute ((pure
))
1356 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1359 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1361 for (i
= set1
->nelem
; --i
>= 0 ; )
1362 if (set1
->elems
[i
] != set2
->elems
[i
])
1367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1370 internal_function
__attribute ((pure
))
1371 re_node_set_contains (const re_node_set
*set
, int elem
)
1373 unsigned int idx
, right
, mid
;
1374 if (set
->nelem
<= 0)
1377 /* Binary search the element. */
1379 right
= set
->nelem
- 1;
1382 mid
= (idx
+ right
) / 2;
1383 if (set
->elems
[mid
] < elem
)
1388 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1393 re_node_set_remove_at (re_node_set
*set
, int idx
)
1395 if (idx
< 0 || idx
>= set
->nelem
)
1398 for (; idx
< set
->nelem
; idx
++)
1399 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404 Or return -1, if an error will be occured. */
1408 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1410 int type
= token
.type
;
1411 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1413 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1414 int *new_nexts
, *new_indices
;
1415 re_node_set
*new_edests
, *new_eclosures
;
1416 re_token_t
*new_nodes
;
1418 /* Avoid overflows in realloc. */
1419 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1420 MAX (sizeof (re_node_set
),
1422 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1425 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1426 if (BE (new_nodes
== NULL
, 0))
1428 dfa
->nodes
= new_nodes
;
1429 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1430 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1431 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1432 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1433 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1434 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1436 dfa
->nexts
= new_nexts
;
1437 dfa
->org_indices
= new_indices
;
1438 dfa
->edests
= new_edests
;
1439 dfa
->eclosures
= new_eclosures
;
1440 dfa
->nodes_alloc
= new_nodes_alloc
;
1442 dfa
->nodes
[dfa
->nodes_len
] = token
;
1443 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1444 #ifdef RE_ENABLE_I18N
1445 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1446 (type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || type
== COMPLEX_BRACKET
;
1448 dfa
->nexts
[dfa
->nodes_len
] = -1;
1449 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1450 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1451 return dfa
->nodes_len
++;
1454 static inline unsigned int
1456 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1458 unsigned int hash
= nodes
->nelem
+ context
;
1460 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1461 hash
+= nodes
->elems
[i
];
1465 /* Search for the state whose node_set is equivalent to NODES.
1466 Return the pointer to the state, if we found it in the DFA.
1467 Otherwise create the new one and return it. In case of an error
1468 return NULL and set the error code in ERR.
1469 Note: - We assume NULL as the invalid state, then it is possible that
1470 return value is NULL and ERR is REG_NOERROR.
1471 - We never return non-NULL value in case of any errors, it is for
1474 static re_dfastate_t
*
1475 internal_function __attribute_warn_unused_result__
1476 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1477 const re_node_set
*nodes
)
1480 re_dfastate_t
*new_state
;
1481 struct re_state_table_entry
*spot
;
1483 if (BE (nodes
->nelem
== 0, 0))
1488 hash
= calc_state_hash (nodes
, 0);
1489 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1491 for (i
= 0 ; i
< spot
->num
; i
++)
1493 re_dfastate_t
*state
= spot
->array
[i
];
1494 if (hash
!= state
->hash
)
1496 if (re_node_set_compare (&state
->nodes
, nodes
))
1500 /* There are no appropriate state in the dfa, create the new one. */
1501 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1502 if (BE (new_state
== NULL
, 0))
1508 /* Search for the state whose node_set is equivalent to NODES and
1509 whose context is equivalent to CONTEXT.
1510 Return the pointer to the state, if we found it in the DFA.
1511 Otherwise create the new one and return it. In case of an error
1512 return NULL and set the error code in ERR.
1513 Note: - We assume NULL as the invalid state, then it is possible that
1514 return value is NULL and ERR is REG_NOERROR.
1515 - We never return non-NULL value in case of any errors, it is for
1518 static re_dfastate_t
*
1519 internal_function __attribute_warn_unused_result__
1520 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1521 const re_node_set
*nodes
, unsigned int context
)
1524 re_dfastate_t
*new_state
;
1525 struct re_state_table_entry
*spot
;
1527 if (nodes
->nelem
== 0)
1532 hash
= calc_state_hash (nodes
, context
);
1533 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1535 for (i
= 0 ; i
< spot
->num
; i
++)
1537 re_dfastate_t
*state
= spot
->array
[i
];
1538 if (state
->hash
== hash
1539 && state
->context
== context
1540 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1543 /* There are no appropriate state in `dfa', create the new one. */
1544 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1545 if (BE (new_state
== NULL
, 0))
1551 /* Finish initialization of the new state NEWSTATE, and using its hash value
1552 HASH put in the appropriate bucket of DFA's state table. Return value
1553 indicates the error code if failed. */
1555 static reg_errcode_t
1556 __attribute_warn_unused_result__
1557 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1560 struct re_state_table_entry
*spot
;
1564 newstate
->hash
= hash
;
1565 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1566 if (BE (err
!= REG_NOERROR
, 0))
1568 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1570 int elem
= newstate
->nodes
.elems
[i
];
1571 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1572 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1576 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1577 if (BE (spot
->alloc
<= spot
->num
, 0))
1579 int new_alloc
= 2 * spot
->num
+ 2;
1580 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1582 if (BE (new_array
== NULL
, 0))
1584 spot
->array
= new_array
;
1585 spot
->alloc
= new_alloc
;
1587 spot
->array
[spot
->num
++] = newstate
;
1592 free_state (re_dfastate_t
*state
)
1594 re_node_set_free (&state
->non_eps_nodes
);
1595 re_node_set_free (&state
->inveclosure
);
1596 if (state
->entrance_nodes
!= &state
->nodes
)
1598 re_node_set_free (state
->entrance_nodes
);
1599 re_free (state
->entrance_nodes
);
1601 re_node_set_free (&state
->nodes
);
1602 re_free (state
->word_trtable
);
1603 re_free (state
->trtable
);
1607 /* Create the new state which is independ of contexts.
1608 Return the new state if succeeded, otherwise return NULL. */
1610 static re_dfastate_t
*
1611 internal_function __attribute_warn_unused_result__
1612 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1617 re_dfastate_t
*newstate
;
1619 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1620 if (BE (newstate
== NULL
, 0))
1622 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1623 if (BE (err
!= REG_NOERROR
, 0))
1629 newstate
->entrance_nodes
= &newstate
->nodes
;
1630 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1632 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1633 re_token_type_t type
= node
->type
;
1634 if (type
== CHARACTER
&& !node
->constraint
)
1636 #ifdef RE_ENABLE_I18N
1637 newstate
->accept_mb
|= node
->accept_mb
;
1638 #endif /* RE_ENABLE_I18N */
1640 /* If the state has the halt node, the state is a halt state. */
1641 if (type
== END_OF_RE
)
1643 else if (type
== OP_BACK_REF
)
1644 newstate
->has_backref
= 1;
1645 else if (type
== ANCHOR
|| node
->constraint
)
1646 newstate
->has_constraint
= 1;
1648 err
= register_state (dfa
, newstate
, hash
);
1649 if (BE (err
!= REG_NOERROR
, 0))
1651 free_state (newstate
);
1657 /* Create the new state which is depend on the context CONTEXT.
1658 Return the new state if succeeded, otherwise return NULL. */
1660 static re_dfastate_t
*
1661 internal_function __attribute_warn_unused_result__
1662 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1663 unsigned int context
, unsigned int hash
)
1665 int i
, nctx_nodes
= 0;
1667 re_dfastate_t
*newstate
;
1669 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1670 if (BE (newstate
== NULL
, 0))
1672 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1673 if (BE (err
!= REG_NOERROR
, 0))
1679 newstate
->context
= context
;
1680 newstate
->entrance_nodes
= &newstate
->nodes
;
1682 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1684 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1685 re_token_type_t type
= node
->type
;
1686 unsigned int constraint
= node
->constraint
;
1688 if (type
== CHARACTER
&& !constraint
)
1690 #ifdef RE_ENABLE_I18N
1691 newstate
->accept_mb
|= node
->accept_mb
;
1692 #endif /* RE_ENABLE_I18N */
1694 /* If the state has the halt node, the state is a halt state. */
1695 if (type
== END_OF_RE
)
1697 else if (type
== OP_BACK_REF
)
1698 newstate
->has_backref
= 1;
1702 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1704 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1705 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1707 free_state (newstate
);
1710 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1714 newstate
->has_constraint
= 1;
1717 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1719 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1724 err
= register_state (dfa
, newstate
, hash
);
1725 if (BE (err
!= REG_NOERROR
, 0))
1727 free_state (newstate
);