1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str
, Idx len
,
22 RE_TRANSLATE_TYPE trans
, bool icase
,
24 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
25 const re_node_set
*nodes
,
27 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
28 const re_node_set
*nodes
,
31 static reg_errcode_t
re_string_realloc_buffers (re_string_t
*pstr
,
33 static void build_wcs_buffer (re_string_t
*pstr
);
34 static reg_errcode_t
build_wcs_upper_buffer (re_string_t
*pstr
);
35 static void build_upper_buffer (re_string_t
*pstr
);
36 static void re_string_translate_buffer (re_string_t
*pstr
);
37 static unsigned int re_string_context_at (const re_string_t
*input
, Idx idx
,
38 int eflags
) __attribute__ ((pure
));
40 /* Functions for string operation. */
42 /* This function allocate the buffers. It is necessary to call
43 re_string_reconstruct before using the object. */
46 __attribute_warn_unused_result__
47 re_string_allocate (re_string_t
*pstr
, const char *str
, Idx len
, Idx init_len
,
48 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
53 /* Ensure at least one character fits into the buffers. */
54 if (init_len
< dfa
->mb_cur_max
)
55 init_len
= dfa
->mb_cur_max
;
56 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
57 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
59 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
60 if (__glibc_unlikely (ret
!= REG_NOERROR
))
63 pstr
->word_char
= dfa
->word_char
;
64 pstr
->word_ops_used
= dfa
->word_ops_used
;
65 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
66 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
67 pstr
->valid_raw_len
= pstr
->valid_len
;
71 /* This function allocate the buffers, and initialize them. */
74 __attribute_warn_unused_result__
75 re_string_construct (re_string_t
*pstr
, const char *str
, Idx len
,
76 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
79 memset (pstr
, '\0', sizeof (re_string_t
));
80 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
84 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
85 if (__glibc_unlikely (ret
!= REG_NOERROR
))
88 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
92 if (dfa
->mb_cur_max
> 1)
96 ret
= build_wcs_upper_buffer (pstr
);
97 if (__glibc_unlikely (ret
!= REG_NOERROR
))
99 if (pstr
->valid_raw_len
>= len
)
101 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
103 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
104 if (__glibc_unlikely (ret
!= REG_NOERROR
))
109 build_upper_buffer (pstr
);
113 if (dfa
->mb_cur_max
> 1)
114 build_wcs_buffer (pstr
);
118 re_string_translate_buffer (pstr
);
121 pstr
->valid_len
= pstr
->bufs_len
;
122 pstr
->valid_raw_len
= pstr
->bufs_len
;
130 /* Helper functions for re_string_allocate, and re_string_construct. */
133 __attribute_warn_unused_result__
134 re_string_realloc_buffers (re_string_t
*pstr
, Idx new_buf_len
)
136 if (pstr
->mb_cur_max
> 1)
140 /* Avoid overflow in realloc. */
141 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (Idx
));
142 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
)
146 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
147 if (__glibc_unlikely (new_wcs
== NULL
))
150 if (pstr
->offsets
!= NULL
)
152 Idx
*new_offsets
= re_realloc (pstr
->offsets
, Idx
, new_buf_len
);
153 if (__glibc_unlikely (new_offsets
== NULL
))
155 pstr
->offsets
= new_offsets
;
158 if (pstr
->mbs_allocated
)
160 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
162 if (__glibc_unlikely (new_mbs
== NULL
))
166 pstr
->bufs_len
= new_buf_len
;
172 re_string_construct_common (const char *str
, Idx len
, re_string_t
*pstr
,
173 RE_TRANSLATE_TYPE trans
, bool icase
,
176 pstr
->raw_mbs
= (const unsigned char *) str
;
181 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
182 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
183 pstr
->is_utf8
= dfa
->is_utf8
;
184 pstr
->map_notascii
= dfa
->map_notascii
;
185 pstr
->stop
= pstr
->len
;
186 pstr
->raw_stop
= pstr
->stop
;
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
202 build_wcs_buffer (re_string_t
*pstr
)
205 unsigned char buf
[MB_LEN_MAX
];
206 DEBUG_ASSERT (MB_LEN_MAX
>= pstr
->mb_cur_max
);
208 unsigned char buf
[64];
211 Idx byte_idx
, end_idx
, remain_len
;
214 /* Build the buffers from pstr->valid_len to either pstr->len or
216 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
217 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
222 remain_len
= end_idx
- byte_idx
;
223 prev_st
= pstr
->cur_state
;
224 /* Apply the translation if we need. */
225 if (__glibc_unlikely (pstr
->trans
!= NULL
))
229 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
231 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
232 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
234 p
= (const char *) buf
;
237 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
238 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
239 if (__glibc_unlikely (mbclen
== (size_t) -1 || mbclen
== 0
240 || (mbclen
== (size_t) -2
241 && pstr
->bufs_len
>= pstr
->len
)))
243 /* We treat these cases as a singlebyte character. */
245 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
246 if (__glibc_unlikely (pstr
->trans
!= NULL
))
247 wc
= pstr
->trans
[wc
];
248 pstr
->cur_state
= prev_st
;
250 else if (__glibc_unlikely (mbclen
== (size_t) -2))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr
->cur_state
= prev_st
;
257 /* Write wide character and padding. */
258 pstr
->wcs
[byte_idx
++] = wc
;
259 /* Write paddings. */
260 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
261 pstr
->wcs
[byte_idx
++] = WEOF
;
263 pstr
->valid_len
= byte_idx
;
264 pstr
->valid_raw_len
= byte_idx
;
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
271 __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t
*pstr
)
275 Idx src_idx
, byte_idx
, end_idx
, remain_len
;
278 char buf
[MB_LEN_MAX
];
279 DEBUG_ASSERT (pstr
->mb_cur_max
<= MB_LEN_MAX
);
284 byte_idx
= pstr
->valid_len
;
285 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
291 while (byte_idx
< end_idx
)
294 unsigned char ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
296 if (isascii (ch
) && mbsinit (&pstr
->cur_state
))
298 /* The next step uses the assumption that wchar_t is encoded
299 ASCII-safe: all ASCII values can be converted like this. */
300 wchar_t wcu
= __towupper (ch
);
303 pstr
->mbs
[byte_idx
] = wcu
;
304 pstr
->wcs
[byte_idx
] = wcu
;
310 remain_len
= end_idx
- byte_idx
;
311 prev_st
= pstr
->cur_state
;
312 mbclen
= __mbrtowc (&wc
,
313 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
314 + byte_idx
), remain_len
, &pstr
->cur_state
);
315 if (__glibc_likely (0 < mbclen
&& mbclen
< (size_t) -2))
317 wchar_t wcu
= __towupper (wc
);
322 mbcdlen
= __wcrtomb (buf
, wcu
, &prev_st
);
323 if (__glibc_likely (mbclen
== mbcdlen
))
324 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
332 memcpy (pstr
->mbs
+ byte_idx
,
333 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
334 pstr
->wcs
[byte_idx
++] = wcu
;
335 /* Write paddings. */
336 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
337 pstr
->wcs
[byte_idx
++] = WEOF
;
339 else if (mbclen
== (size_t) -1 || mbclen
== 0
340 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
342 /* It is an invalid character, an incomplete character
343 at the end of the string, or '\0'. Just use the byte. */
344 pstr
->mbs
[byte_idx
] = ch
;
345 /* And also cast it to wide char. */
346 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
347 if (__glibc_unlikely (mbclen
== (size_t) -1))
348 pstr
->cur_state
= prev_st
;
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr
->cur_state
= prev_st
;
357 pstr
->valid_len
= byte_idx
;
358 pstr
->valid_raw_len
= byte_idx
;
362 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
367 remain_len
= end_idx
- byte_idx
;
368 prev_st
= pstr
->cur_state
;
369 if (__glibc_unlikely (pstr
->trans
!= NULL
))
373 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
375 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
376 buf
[i
] = pstr
->trans
[ch
];
378 p
= (const char *) buf
;
381 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
382 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
383 if (__glibc_likely (0 < mbclen
&& mbclen
< (size_t) -2))
385 wchar_t wcu
= __towupper (wc
);
390 mbcdlen
= __wcrtomb ((char *) buf
, wcu
, &prev_st
);
391 if (__glibc_likely (mbclen
== mbcdlen
))
392 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
393 else if (mbcdlen
!= (size_t) -1)
397 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
399 pstr
->cur_state
= prev_st
;
403 if (pstr
->offsets
== NULL
)
405 pstr
->offsets
= re_malloc (Idx
, pstr
->bufs_len
);
407 if (pstr
->offsets
== NULL
)
410 if (!pstr
->offsets_needed
)
412 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
413 pstr
->offsets
[i
] = i
;
414 pstr
->offsets_needed
= 1;
417 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
418 pstr
->wcs
[byte_idx
] = wcu
;
419 pstr
->offsets
[byte_idx
] = src_idx
;
420 for (i
= 1; i
< mbcdlen
; ++i
)
422 pstr
->offsets
[byte_idx
+ i
]
423 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
424 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
426 pstr
->len
+= mbcdlen
- mbclen
;
427 if (pstr
->raw_stop
> src_idx
)
428 pstr
->stop
+= mbcdlen
- mbclen
;
429 end_idx
= (pstr
->bufs_len
> pstr
->len
)
430 ? pstr
->len
: pstr
->bufs_len
;
436 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
439 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
441 if (__glibc_unlikely (pstr
->offsets_needed
!= 0))
444 for (i
= 0; i
< mbclen
; ++i
)
445 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
449 pstr
->wcs
[byte_idx
++] = wcu
;
450 /* Write paddings. */
451 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
452 pstr
->wcs
[byte_idx
++] = WEOF
;
454 else if (mbclen
== (size_t) -1 || mbclen
== 0
455 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
457 /* It is an invalid character or '\0'. Just use the byte. */
458 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
460 if (__glibc_unlikely (pstr
->trans
!= NULL
))
461 ch
= pstr
->trans
[ch
];
462 pstr
->mbs
[byte_idx
] = ch
;
464 if (__glibc_unlikely (pstr
->offsets_needed
!= 0))
465 pstr
->offsets
[byte_idx
] = src_idx
;
468 /* And also cast it to wide char. */
469 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
470 if (__glibc_unlikely (mbclen
== (size_t) -1))
471 pstr
->cur_state
= prev_st
;
475 /* The buffer doesn't have enough space, finish to build. */
476 pstr
->cur_state
= prev_st
;
480 pstr
->valid_len
= byte_idx
;
481 pstr
->valid_raw_len
= src_idx
;
485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
489 re_string_skip_chars (re_string_t
*pstr
, Idx new_raw_idx
, wint_t *last_wc
)
496 /* Skip the characters which are not necessary to check. */
497 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
498 rawbuf_idx
< new_raw_idx
;)
501 Idx remain_len
= pstr
->raw_len
- rawbuf_idx
;
502 prev_st
= pstr
->cur_state
;
503 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
504 remain_len
, &pstr
->cur_state
);
505 if (__glibc_unlikely (mbclen
== (size_t) -2 || mbclen
== (size_t) -1
508 /* We treat these cases as a single byte character. */
509 if (mbclen
== 0 || remain_len
== 0)
512 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
514 pstr
->cur_state
= prev_st
;
518 /* Then proceed the next character. */
519 rawbuf_idx
+= mbclen
;
525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
526 This function is used in case of REG_ICASE. */
529 build_upper_buffer (re_string_t
*pstr
)
531 Idx char_idx
, end_idx
;
532 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
534 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
536 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
537 if (__glibc_unlikely (pstr
->trans
!= NULL
))
538 ch
= pstr
->trans
[ch
];
539 pstr
->mbs
[char_idx
] = toupper (ch
);
541 pstr
->valid_len
= char_idx
;
542 pstr
->valid_raw_len
= char_idx
;
545 /* Apply TRANS to the buffer in PSTR. */
548 re_string_translate_buffer (re_string_t
*pstr
)
550 Idx buf_idx
, end_idx
;
551 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
553 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
555 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
556 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
559 pstr
->valid_len
= buf_idx
;
560 pstr
->valid_raw_len
= buf_idx
;
563 /* This function re-construct the buffers.
564 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
565 convert to upper case in case of REG_ICASE, apply translation. */
568 __attribute_warn_unused_result__
569 re_string_reconstruct (re_string_t
*pstr
, Idx idx
, int eflags
)
573 if (__glibc_unlikely (pstr
->raw_mbs_idx
<= idx
))
574 offset
= idx
- pstr
->raw_mbs_idx
;
578 if (pstr
->mb_cur_max
> 1)
579 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
580 pstr
->len
= pstr
->raw_len
;
581 pstr
->stop
= pstr
->raw_stop
;
583 pstr
->raw_mbs_idx
= 0;
584 pstr
->valid_raw_len
= 0;
585 pstr
->offsets_needed
= 0;
586 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
587 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
588 if (!pstr
->mbs_allocated
)
589 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
593 if (__glibc_likely (offset
!= 0))
595 /* Should the already checked characters be kept? */
596 if (__glibc_likely (offset
< pstr
->valid_raw_len
))
598 /* Yes, move them to the front of the buffer. */
599 if (__glibc_unlikely (pstr
->offsets_needed
))
601 Idx low
= 0, high
= pstr
->valid_len
, mid
;
604 mid
= (high
+ low
) / 2;
605 if (pstr
->offsets
[mid
] > offset
)
607 else if (pstr
->offsets
[mid
] < offset
)
613 if (pstr
->offsets
[mid
] < offset
)
615 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
617 /* This can be quite complicated, so handle specially
618 only the common and easy case where the character with
619 different length representation of lower and upper
620 case is present at or after offset. */
621 if (pstr
->valid_len
> offset
622 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
624 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
625 (pstr
->valid_len
- offset
) * sizeof (wint_t));
626 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
627 pstr
->valid_len
-= offset
;
628 pstr
->valid_raw_len
-= offset
;
629 for (low
= 0; low
< pstr
->valid_len
; low
++)
630 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
634 /* Otherwise, just find out how long the partial multibyte
635 character at offset is and fill it with WEOF/255. */
636 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
637 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
638 pstr
->offsets_needed
= 0;
639 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
641 while (mid
< pstr
->valid_len
)
642 if (pstr
->wcs
[mid
] != WEOF
)
646 if (mid
== pstr
->valid_len
)
650 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
653 for (low
= 0; low
< pstr
->valid_len
; ++low
)
654 pstr
->wcs
[low
] = WEOF
;
655 memset (pstr
->mbs
, 255, pstr
->valid_len
);
658 pstr
->valid_raw_len
= pstr
->valid_len
;
663 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
665 if (pstr
->mb_cur_max
> 1)
666 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
667 (pstr
->valid_len
- offset
) * sizeof (wint_t));
668 if (__glibc_unlikely (pstr
->mbs_allocated
))
669 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
670 pstr
->valid_len
- offset
);
671 pstr
->valid_len
-= offset
;
672 pstr
->valid_raw_len
-= offset
;
673 DEBUG_ASSERT (pstr
->valid_len
> 0);
678 /* No, skip all characters until IDX. */
679 Idx prev_valid_len
= pstr
->valid_len
;
681 if (__glibc_unlikely (pstr
->offsets_needed
))
683 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
684 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
685 pstr
->offsets_needed
= 0;
688 if (pstr
->mb_cur_max
> 1)
695 const unsigned char *raw
, *p
, *end
;
697 /* Special case UTF-8. Multi-byte chars start with any
698 byte other than 0x80 - 0xbf. */
699 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
700 end
= raw
+ (offset
- pstr
->mb_cur_max
);
701 if (end
< pstr
->raw_mbs
)
703 p
= raw
+ offset
- 1;
705 /* We know the wchar_t encoding is UCS4, so for the simple
706 case, ASCII characters, skip the conversion step. */
707 if (isascii (*p
) && __glibc_likely (pstr
->trans
== NULL
))
709 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
710 /* pstr->valid_len = 0; */
715 for (; p
>= end
; --p
)
716 if ((*p
& 0xc0) != 0x80)
720 Idx mlen
= raw
+ pstr
->len
- p
;
721 unsigned char buf
[6];
724 const unsigned char *pp
= p
;
725 if (__glibc_unlikely (pstr
->trans
!= NULL
))
727 int i
= mlen
< 6 ? mlen
: 6;
729 buf
[i
] = pstr
->trans
[p
[i
]];
732 /* XXX Don't use mbrtowc, we know which conversion
733 to use (UTF-8 -> UCS4). */
734 memset (&cur_state
, 0, sizeof (cur_state
));
735 mbclen
= __mbrtowc (&wc2
, (const char *) pp
, mlen
,
737 if (raw
+ offset
- p
<= mbclen
738 && mbclen
< (size_t) -2)
740 memset (&pstr
->cur_state
, '\0',
742 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
750 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
753 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
755 pstr
->tip_context
= ((__glibc_unlikely (pstr
->word_ops_used
!= 0)
756 && IS_WIDE_WORD_CHAR (wc
))
758 : ((IS_WIDE_NEWLINE (wc
)
759 && pstr
->newline_anchor
)
760 ? CONTEXT_NEWLINE
: 0));
761 if (__glibc_unlikely (pstr
->valid_len
))
763 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
764 pstr
->wcs
[wcs_idx
] = WEOF
;
765 if (pstr
->mbs_allocated
)
766 memset (pstr
->mbs
, 255, pstr
->valid_len
);
768 pstr
->valid_raw_len
= pstr
->valid_len
;
772 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
773 pstr
->valid_raw_len
= 0;
776 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
778 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
779 ? CONTEXT_NEWLINE
: 0));
782 if (!__glibc_unlikely (pstr
->mbs_allocated
))
785 pstr
->raw_mbs_idx
= idx
;
787 pstr
->stop
-= offset
;
789 /* Then build the buffers. */
790 if (pstr
->mb_cur_max
> 1)
794 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
795 if (__glibc_unlikely (ret
!= REG_NOERROR
))
799 build_wcs_buffer (pstr
);
802 if (__glibc_unlikely (pstr
->mbs_allocated
))
805 build_upper_buffer (pstr
);
806 else if (pstr
->trans
!= NULL
)
807 re_string_translate_buffer (pstr
);
810 pstr
->valid_len
= pstr
->len
;
817 __attribute__ ((pure
))
818 re_string_peek_byte_case (const re_string_t
*pstr
, Idx idx
)
823 /* Handle the common (easiest) cases first. */
824 if (__glibc_likely (!pstr
->mbs_allocated
))
825 return re_string_peek_byte (pstr
, idx
);
827 if (pstr
->mb_cur_max
> 1
828 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
829 return re_string_peek_byte (pstr
, idx
);
831 off
= pstr
->cur_idx
+ idx
;
832 if (pstr
->offsets_needed
)
833 off
= pstr
->offsets
[off
];
835 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
837 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
838 this function returns CAPITAL LETTER I instead of first byte of
839 DOTLESS SMALL LETTER I. The latter would confuse the parser,
840 since peek_byte_case doesn't advance cur_idx in any way. */
841 if (pstr
->offsets_needed
&& !isascii (ch
))
842 return re_string_peek_byte (pstr
, idx
);
848 re_string_fetch_byte_case (re_string_t
*pstr
)
850 if (__glibc_likely (!pstr
->mbs_allocated
))
851 return re_string_fetch_byte (pstr
);
853 if (pstr
->offsets_needed
)
858 /* For tr_TR.UTF-8 [[:islower:]] there is
859 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
860 in that case the whole multi-byte character and return
861 the original letter. On the other side, with
862 [[: DOTLESS SMALL LETTER I return [[:I, as doing
863 anything else would complicate things too much. */
865 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
866 return re_string_fetch_byte (pstr
);
868 off
= pstr
->offsets
[pstr
->cur_idx
];
869 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
872 return re_string_fetch_byte (pstr
);
874 re_string_skip_bytes (pstr
,
875 re_string_char_size_at (pstr
, pstr
->cur_idx
));
879 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
883 re_string_destruct (re_string_t
*pstr
)
886 re_free (pstr
->offsets
);
887 if (pstr
->mbs_allocated
)
891 /* Return the context at IDX in INPUT. */
894 re_string_context_at (const re_string_t
*input
, Idx idx
, int eflags
)
897 if (__glibc_unlikely (idx
< 0))
898 /* In this case, we use the value stored in input->tip_context,
899 since we can't know the character in input->mbs[-1] here. */
900 return input
->tip_context
;
901 if (__glibc_unlikely (idx
== input
->len
))
902 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
903 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
904 if (input
->mb_cur_max
> 1)
908 while(input
->wcs
[wc_idx
] == WEOF
)
910 DEBUG_ASSERT (wc_idx
>= 0);
913 return input
->tip_context
;
915 wc
= input
->wcs
[wc_idx
];
916 if (__glibc_unlikely (input
->word_ops_used
!= 0)
917 && IS_WIDE_WORD_CHAR (wc
))
919 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
920 ? CONTEXT_NEWLINE
: 0);
924 c
= re_string_byte_at (input
, idx
);
925 if (bitset_contain (input
->word_char
, c
))
927 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
931 /* Functions for set operation. */
934 __attribute_warn_unused_result__
935 re_node_set_alloc (re_node_set
*set
, Idx size
)
939 set
->elems
= re_malloc (Idx
, size
);
940 if (__glibc_unlikely (set
->elems
== NULL
))
946 __attribute_warn_unused_result__
947 re_node_set_init_1 (re_node_set
*set
, Idx elem
)
951 set
->elems
= re_malloc (Idx
, 1);
952 if (__glibc_unlikely (set
->elems
== NULL
))
954 set
->alloc
= set
->nelem
= 0;
957 set
->elems
[0] = elem
;
962 __attribute_warn_unused_result__
963 re_node_set_init_2 (re_node_set
*set
, Idx elem1
, Idx elem2
)
966 set
->elems
= re_malloc (Idx
, 2);
967 if (__glibc_unlikely (set
->elems
== NULL
))
972 set
->elems
[0] = elem1
;
979 set
->elems
[0] = elem1
;
980 set
->elems
[1] = elem2
;
984 set
->elems
[0] = elem2
;
985 set
->elems
[1] = elem1
;
992 __attribute_warn_unused_result__
993 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
995 dest
->nelem
= src
->nelem
;
998 dest
->alloc
= dest
->nelem
;
999 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1000 if (__glibc_unlikely (dest
->elems
== NULL
))
1002 dest
->alloc
= dest
->nelem
= 0;
1005 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1008 re_node_set_init_empty (dest
);
1012 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1013 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1014 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1016 static reg_errcode_t
1017 __attribute_warn_unused_result__
1018 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1019 const re_node_set
*src2
)
1021 Idx i1
, i2
, is
, id
, delta
, sbase
;
1022 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1025 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1026 conservative estimate. */
1027 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1029 Idx new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1030 Idx
*new_elems
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1031 if (__glibc_unlikely (new_elems
== NULL
))
1033 dest
->elems
= new_elems
;
1034 dest
->alloc
= new_alloc
;
1037 /* Find the items in the intersection of SRC1 and SRC2, and copy
1038 into the top of DEST those that are not already in DEST itself. */
1039 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1040 i1
= src1
->nelem
- 1;
1041 i2
= src2
->nelem
- 1;
1042 id
= dest
->nelem
- 1;
1045 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1047 /* Try to find the item in DEST. Maybe we could binary search? */
1048 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1051 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1052 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1054 if (--i1
< 0 || --i2
< 0)
1058 /* Lower the highest of the two items. */
1059 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1071 id
= dest
->nelem
- 1;
1072 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1073 delta
= is
- sbase
+ 1;
1075 /* Now copy. When DELTA becomes zero, the remaining
1076 DEST elements are already in place; this is more or
1077 less the same loop that is in re_node_set_merge. */
1078 dest
->nelem
+= delta
;
1079 if (delta
> 0 && id
>= 0)
1082 if (dest
->elems
[is
] > dest
->elems
[id
])
1084 /* Copy from the top. */
1085 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1091 /* Slide from the bottom. */
1092 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1098 /* Copy remaining SRC elements. */
1099 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (Idx
));
1104 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1105 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1107 static reg_errcode_t
1108 __attribute_warn_unused_result__
1109 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1110 const re_node_set
*src2
)
1113 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1115 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1116 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1117 if (__glibc_unlikely (dest
->elems
== NULL
))
1122 if (src1
!= NULL
&& src1
->nelem
> 0)
1123 return re_node_set_init_copy (dest
, src1
);
1124 else if (src2
!= NULL
&& src2
->nelem
> 0)
1125 return re_node_set_init_copy (dest
, src2
);
1127 re_node_set_init_empty (dest
);
1130 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1132 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1134 dest
->elems
[id
++] = src2
->elems
[i2
++];
1137 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1139 dest
->elems
[id
++] = src1
->elems
[i1
++];
1141 if (i1
< src1
->nelem
)
1143 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1144 (src1
->nelem
- i1
) * sizeof (Idx
));
1145 id
+= src1
->nelem
- i1
;
1147 else if (i2
< src2
->nelem
)
1149 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1150 (src2
->nelem
- i2
) * sizeof (Idx
));
1151 id
+= src2
->nelem
- i2
;
1157 /* Calculate the union set of the sets DEST and SRC. And store it to
1158 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1160 static reg_errcode_t
1161 __attribute_warn_unused_result__
1162 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1164 Idx is
, id
, sbase
, delta
;
1165 if (src
== NULL
|| src
->nelem
== 0)
1167 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1169 Idx new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1170 Idx
*new_buffer
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1171 if (__glibc_unlikely (new_buffer
== NULL
))
1173 dest
->elems
= new_buffer
;
1174 dest
->alloc
= new_alloc
;
1177 if (__glibc_unlikely (dest
->nelem
== 0))
1179 /* Although we already guaranteed above that dest->alloc != 0 and
1180 therefore dest->elems != NULL, add a debug assertion to pacify
1181 GCC 11.2.1's -fanalyzer. */
1182 DEBUG_ASSERT (dest
->elems
);
1183 dest
->nelem
= src
->nelem
;
1184 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1188 /* Copy into the top of DEST the items of SRC that are not
1189 found in DEST. Maybe we could binary search in DEST? */
1190 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1191 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1193 if (dest
->elems
[id
] == src
->elems
[is
])
1195 else if (dest
->elems
[id
] < src
->elems
[is
])
1196 dest
->elems
[--sbase
] = src
->elems
[is
--];
1197 else /* if (dest->elems[id] > src->elems[is]) */
1203 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1205 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (Idx
));
1208 id
= dest
->nelem
- 1;
1209 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1210 delta
= is
- sbase
+ 1;
1214 /* Now copy. When DELTA becomes zero, the remaining
1215 DEST elements are already in place. */
1216 dest
->nelem
+= delta
;
1219 if (dest
->elems
[is
] > dest
->elems
[id
])
1221 /* Copy from the top. */
1222 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1228 /* Slide from the bottom. */
1229 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1232 /* Copy remaining SRC elements. */
1233 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1234 delta
* sizeof (Idx
));
1243 /* Insert the new element ELEM to the re_node_set* SET.
1244 SET should not already have ELEM.
1245 Return true if successful. */
1248 __attribute_warn_unused_result__
1249 re_node_set_insert (re_node_set
*set
, Idx elem
)
1252 /* In case the set is empty. */
1253 if (set
->alloc
== 0)
1254 return __glibc_likely (re_node_set_init_1 (set
, elem
) == REG_NOERROR
);
1256 if (__glibc_unlikely (set
->nelem
) == 0)
1258 /* Although we already guaranteed above that set->alloc != 0 and
1259 therefore set->elems != NULL, add a debug assertion to pacify
1260 GCC 11.2 -fanalyzer. */
1261 DEBUG_ASSERT (set
->elems
);
1262 set
->elems
[0] = elem
;
1267 /* Realloc if we need. */
1268 if (set
->alloc
== set
->nelem
)
1271 set
->alloc
= set
->alloc
* 2;
1272 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1273 if (__glibc_unlikely (new_elems
== NULL
))
1275 set
->elems
= new_elems
;
1278 /* Move the elements which follows the new element. Test the
1279 first element separately to skip a check in the inner loop. */
1280 if (elem
< set
->elems
[0])
1282 for (idx
= set
->nelem
; idx
> 0; idx
--)
1283 set
->elems
[idx
] = set
->elems
[idx
- 1];
1287 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1288 set
->elems
[idx
] = set
->elems
[idx
- 1];
1289 DEBUG_ASSERT (set
->elems
[idx
- 1] < elem
);
1292 /* Insert the new element. */
1293 set
->elems
[idx
] = elem
;
1298 /* Insert the new element ELEM to the re_node_set* SET.
1299 SET should not already have any element greater than or equal to ELEM.
1300 Return true if successful. */
1303 __attribute_warn_unused_result__
1304 re_node_set_insert_last (re_node_set
*set
, Idx elem
)
1306 /* Realloc if we need. */
1307 if (set
->alloc
== set
->nelem
)
1310 set
->alloc
= (set
->alloc
+ 1) * 2;
1311 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1312 if (__glibc_unlikely (new_elems
== NULL
))
1314 set
->elems
= new_elems
;
1317 /* Insert the new element. */
1318 set
->elems
[set
->nelem
++] = elem
;
1322 /* Compare two node sets SET1 and SET2.
1323 Return true if SET1 and SET2 are equivalent. */
1326 __attribute__ ((pure
))
1327 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1330 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1332 for (i
= set1
->nelem
; --i
>= 0 ; )
1333 if (set1
->elems
[i
] != set2
->elems
[i
])
1338 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1341 __attribute__ ((pure
))
1342 re_node_set_contains (const re_node_set
*set
, Idx elem
)
1344 __re_size_t idx
, right
, mid
;
1345 if (set
->nelem
<= 0)
1348 /* Binary search the element. */
1350 right
= set
->nelem
- 1;
1353 mid
= (idx
+ right
) / 2;
1354 if (set
->elems
[mid
] < elem
)
1359 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1363 re_node_set_remove_at (re_node_set
*set
, Idx idx
)
1365 if (idx
< 0 || idx
>= set
->nelem
)
1368 for (; idx
< set
->nelem
; idx
++)
1369 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1373 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1374 Or return -1 if an error occurred. */
1377 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1379 if (__glibc_unlikely (dfa
->nodes_len
>= dfa
->nodes_alloc
))
1381 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1382 Idx
*new_nexts
, *new_indices
;
1383 re_node_set
*new_edests
, *new_eclosures
;
1384 re_token_t
*new_nodes
;
1386 /* Avoid overflows in realloc. */
1387 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1388 MAX (sizeof (re_node_set
),
1390 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
)
1394 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1395 if (__glibc_unlikely (new_nodes
== NULL
))
1397 dfa
->nodes
= new_nodes
;
1398 dfa
->nodes_alloc
= new_nodes_alloc
;
1399 new_nexts
= re_realloc (dfa
->nexts
, Idx
, new_nodes_alloc
);
1400 if (new_nexts
!= NULL
)
1401 dfa
->nexts
= new_nexts
;
1402 new_indices
= re_realloc (dfa
->org_indices
, Idx
, new_nodes_alloc
);
1403 if (new_indices
!= NULL
)
1404 dfa
->org_indices
= new_indices
;
1405 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1406 if (new_edests
!= NULL
)
1407 dfa
->edests
= new_edests
;
1408 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1409 if (new_eclosures
!= NULL
)
1410 dfa
->eclosures
= new_eclosures
;
1411 if (__glibc_unlikely (new_nexts
== NULL
|| new_indices
== NULL
1412 || new_edests
== NULL
|| new_eclosures
== NULL
))
1415 dfa
->nodes
[dfa
->nodes_len
] = token
;
1416 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1417 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1418 ((token
.type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1)
1419 || token
.type
== COMPLEX_BRACKET
);
1420 dfa
->nexts
[dfa
->nodes_len
] = -1;
1421 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1422 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1423 return dfa
->nodes_len
++;
1427 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1429 re_hashval_t hash
= nodes
->nelem
+ context
;
1431 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1432 hash
+= nodes
->elems
[i
];
1436 /* Search for the state whose node_set is equivalent to NODES.
1437 Return the pointer to the state, if we found it in the DFA.
1438 Otherwise create the new one and return it. In case of an error
1439 return NULL and set the error code in ERR.
1440 Note: - We assume NULL as the invalid state, then it is possible that
1441 return value is NULL and ERR is REG_NOERROR.
1442 - We never return non-NULL value in case of any errors, it is for
1445 static re_dfastate_t
*
1446 __attribute_warn_unused_result__
1447 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1448 const re_node_set
*nodes
)
1451 re_dfastate_t
*new_state
;
1452 struct re_state_table_entry
*spot
;
1454 #if defined GCC_LINT || defined lint
1455 /* Suppress bogus uninitialized-variable warnings. */
1458 if (__glibc_unlikely (nodes
->nelem
== 0))
1463 hash
= calc_state_hash (nodes
, 0);
1464 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1466 for (i
= 0 ; i
< spot
->num
; i
++)
1468 re_dfastate_t
*state
= spot
->array
[i
];
1469 if (hash
!= state
->hash
)
1471 if (re_node_set_compare (&state
->nodes
, nodes
))
1475 /* There are no appropriate state in the dfa, create the new one. */
1476 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1477 if (__glibc_unlikely (new_state
== NULL
))
1483 /* Search for the state whose node_set is equivalent to NODES and
1484 whose context is equivalent to CONTEXT.
1485 Return the pointer to the state, if we found it in the DFA.
1486 Otherwise create the new one and return it. In case of an error
1487 return NULL and set the error code in ERR.
1488 Note: - We assume NULL as the invalid state, then it is possible that
1489 return value is NULL and ERR is REG_NOERROR.
1490 - We never return non-NULL value in case of any errors, it is for
1493 static re_dfastate_t
*
1494 __attribute_warn_unused_result__
1495 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1496 const re_node_set
*nodes
, unsigned int context
)
1499 re_dfastate_t
*new_state
;
1500 struct re_state_table_entry
*spot
;
1502 #if defined GCC_LINT || defined lint
1503 /* Suppress bogus uninitialized-variable warnings. */
1506 if (nodes
->nelem
== 0)
1511 hash
= calc_state_hash (nodes
, context
);
1512 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1514 for (i
= 0 ; i
< spot
->num
; i
++)
1516 re_dfastate_t
*state
= spot
->array
[i
];
1517 if (state
->hash
== hash
1518 && state
->context
== context
1519 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1522 /* There are no appropriate state in 'dfa', create the new one. */
1523 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1524 if (__glibc_unlikely (new_state
== NULL
))
1530 /* Finish initialization of the new state NEWSTATE, and using its hash value
1531 HASH put in the appropriate bucket of DFA's state table. Return value
1532 indicates the error code if failed. */
1534 static reg_errcode_t
1535 __attribute_warn_unused_result__
1536 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1539 struct re_state_table_entry
*spot
;
1543 newstate
->hash
= hash
;
1544 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1545 if (__glibc_unlikely (err
!= REG_NOERROR
))
1547 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1549 Idx elem
= newstate
->nodes
.elems
[i
];
1550 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1551 if (! re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
))
1555 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1556 if (__glibc_unlikely (spot
->alloc
<= spot
->num
))
1558 Idx new_alloc
= 2 * spot
->num
+ 2;
1559 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1561 if (__glibc_unlikely (new_array
== NULL
))
1563 spot
->array
= new_array
;
1564 spot
->alloc
= new_alloc
;
1566 spot
->array
[spot
->num
++] = newstate
;
1571 free_state (re_dfastate_t
*state
)
1573 re_node_set_free (&state
->non_eps_nodes
);
1574 re_node_set_free (&state
->inveclosure
);
1575 if (state
->entrance_nodes
!= &state
->nodes
)
1577 re_node_set_free (state
->entrance_nodes
);
1578 re_free (state
->entrance_nodes
);
1580 re_node_set_free (&state
->nodes
);
1581 re_free (state
->word_trtable
);
1582 re_free (state
->trtable
);
1586 /* Create the new state which is independent of contexts.
1587 Return the new state if succeeded, otherwise return NULL. */
1589 static re_dfastate_t
*
1590 __attribute_warn_unused_result__
1591 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1596 re_dfastate_t
*newstate
;
1598 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1599 if (__glibc_unlikely (newstate
== NULL
))
1601 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1602 if (__glibc_unlikely (err
!= REG_NOERROR
))
1608 newstate
->entrance_nodes
= &newstate
->nodes
;
1609 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1611 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1612 re_token_type_t type
= node
->type
;
1613 if (type
== CHARACTER
&& !node
->constraint
)
1615 newstate
->accept_mb
|= node
->accept_mb
;
1617 /* If the state has the halt node, the state is a halt state. */
1618 if (type
== END_OF_RE
)
1620 else if (type
== OP_BACK_REF
)
1621 newstate
->has_backref
= 1;
1622 else if (type
== ANCHOR
|| node
->constraint
)
1623 newstate
->has_constraint
= 1;
1625 err
= register_state (dfa
, newstate
, hash
);
1626 if (__glibc_unlikely (err
!= REG_NOERROR
))
1628 free_state (newstate
);
1634 /* Create the new state which is depend on the context CONTEXT.
1635 Return the new state if succeeded, otherwise return NULL. */
1637 static re_dfastate_t
*
1638 __attribute_warn_unused_result__
1639 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1640 unsigned int context
, re_hashval_t hash
)
1642 Idx i
, nctx_nodes
= 0;
1644 re_dfastate_t
*newstate
;
1646 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1647 if (__glibc_unlikely (newstate
== NULL
))
1649 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1650 if (__glibc_unlikely (err
!= REG_NOERROR
))
1656 newstate
->context
= context
;
1657 newstate
->entrance_nodes
= &newstate
->nodes
;
1659 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1661 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1662 re_token_type_t type
= node
->type
;
1663 unsigned int constraint
= node
->constraint
;
1665 if (type
== CHARACTER
&& !constraint
)
1667 newstate
->accept_mb
|= node
->accept_mb
;
1669 /* If the state has the halt node, the state is a halt state. */
1670 if (type
== END_OF_RE
)
1672 else if (type
== OP_BACK_REF
)
1673 newstate
->has_backref
= 1;
1677 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1679 re_node_set
*entrance_nodes
= re_malloc (re_node_set
, 1);
1680 if (__glibc_unlikely (entrance_nodes
== NULL
))
1682 free_state (newstate
);
1685 newstate
->entrance_nodes
= entrance_nodes
;
1686 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1689 free_state (newstate
);
1693 newstate
->has_constraint
= 1;
1696 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1698 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1703 err
= register_state (dfa
, newstate
, hash
);
1704 if (__glibc_unlikely (err
!= REG_NOERROR
))
1706 free_state (newstate
);