1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str
, Idx len
,
22 RE_TRANSLATE_TYPE trans
, bool icase
,
24 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
25 const re_node_set
*nodes
,
27 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
28 const re_node_set
*nodes
,
31 static reg_errcode_t
re_string_realloc_buffers (re_string_t
*pstr
,
34 static void build_wcs_buffer (re_string_t
*pstr
);
35 static reg_errcode_t
build_wcs_upper_buffer (re_string_t
*pstr
);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t
*pstr
);
38 static void re_string_translate_buffer (re_string_t
*pstr
);
39 static unsigned int re_string_context_at (const re_string_t
*input
, Idx idx
,
40 int eflags
) __attribute__ ((pure
));
42 /* Functions for string operation. */
44 /* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
48 __attribute_warn_unused_result__
49 re_string_allocate (re_string_t
*pstr
, const char *str
, Idx len
, Idx init_len
,
50 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len
< dfa
->mb_cur_max
)
57 init_len
= dfa
->mb_cur_max
;
58 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
59 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
61 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
62 if (__glibc_unlikely (ret
!= REG_NOERROR
))
65 pstr
->word_char
= dfa
->word_char
;
66 pstr
->word_ops_used
= dfa
->word_ops_used
;
67 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
68 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
69 pstr
->valid_raw_len
= pstr
->valid_len
;
73 /* This function allocate the buffers, and initialize them. */
76 __attribute_warn_unused_result__
77 re_string_construct (re_string_t
*pstr
, const char *str
, Idx len
,
78 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
81 memset (pstr
, '\0', sizeof (re_string_t
));
82 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
86 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
87 if (__glibc_unlikely (ret
!= REG_NOERROR
))
90 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
95 if (dfa
->mb_cur_max
> 1)
99 ret
= build_wcs_upper_buffer (pstr
);
100 if (__glibc_unlikely (ret
!= REG_NOERROR
))
102 if (pstr
->valid_raw_len
>= len
)
104 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
106 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
107 if (__glibc_unlikely (ret
!= REG_NOERROR
))
112 #endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr
);
117 #ifdef RE_ENABLE_I18N
118 if (dfa
->mb_cur_max
> 1)
119 build_wcs_buffer (pstr
);
121 #endif /* RE_ENABLE_I18N */
124 re_string_translate_buffer (pstr
);
127 pstr
->valid_len
= pstr
->bufs_len
;
128 pstr
->valid_raw_len
= pstr
->bufs_len
;
136 /* Helper functions for re_string_allocate, and re_string_construct. */
139 __attribute_warn_unused_result__
140 re_string_realloc_buffers (re_string_t
*pstr
, Idx new_buf_len
)
142 #ifdef RE_ENABLE_I18N
143 if (pstr
->mb_cur_max
> 1)
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (Idx
));
149 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
)
153 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
154 if (__glibc_unlikely (new_wcs
== NULL
))
157 if (pstr
->offsets
!= NULL
)
159 Idx
*new_offsets
= re_realloc (pstr
->offsets
, Idx
, new_buf_len
);
160 if (__glibc_unlikely (new_offsets
== NULL
))
162 pstr
->offsets
= new_offsets
;
165 #endif /* RE_ENABLE_I18N */
166 if (pstr
->mbs_allocated
)
168 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
170 if (__glibc_unlikely (new_mbs
== NULL
))
174 pstr
->bufs_len
= new_buf_len
;
180 re_string_construct_common (const char *str
, Idx len
, re_string_t
*pstr
,
181 RE_TRANSLATE_TYPE trans
, bool icase
,
184 pstr
->raw_mbs
= (const unsigned char *) str
;
189 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
190 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
191 pstr
->is_utf8
= dfa
->is_utf8
;
192 pstr
->map_notascii
= dfa
->map_notascii
;
193 pstr
->stop
= pstr
->len
;
194 pstr
->raw_stop
= pstr
->stop
;
197 #ifdef RE_ENABLE_I18N
199 /* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
211 build_wcs_buffer (re_string_t
*pstr
)
214 unsigned char buf
[MB_LEN_MAX
];
215 DEBUG_ASSERT (MB_LEN_MAX
>= pstr
->mb_cur_max
);
217 unsigned char buf
[64];
220 Idx byte_idx
, end_idx
, remain_len
;
223 /* Build the buffers from pstr->valid_len to either pstr->len or
225 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
226 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
231 remain_len
= end_idx
- byte_idx
;
232 prev_st
= pstr
->cur_state
;
233 /* Apply the translation if we need. */
234 if (__glibc_unlikely (pstr
->trans
!= NULL
))
238 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
240 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
241 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
243 p
= (const char *) buf
;
246 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
247 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
248 if (__glibc_unlikely (mbclen
== (size_t) -1 || mbclen
== 0
249 || (mbclen
== (size_t) -2
250 && pstr
->bufs_len
>= pstr
->len
)))
252 /* We treat these cases as a singlebyte character. */
254 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
255 if (__glibc_unlikely (pstr
->trans
!= NULL
))
256 wc
= pstr
->trans
[wc
];
257 pstr
->cur_state
= prev_st
;
259 else if (__glibc_unlikely (mbclen
== (size_t) -2))
261 /* The buffer doesn't have enough space, finish to build. */
262 pstr
->cur_state
= prev_st
;
266 /* Write wide character and padding. */
267 pstr
->wcs
[byte_idx
++] = wc
;
268 /* Write paddings. */
269 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
270 pstr
->wcs
[byte_idx
++] = WEOF
;
272 pstr
->valid_len
= byte_idx
;
273 pstr
->valid_raw_len
= byte_idx
;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
280 __attribute_warn_unused_result__
281 build_wcs_upper_buffer (re_string_t
*pstr
)
284 Idx src_idx
, byte_idx
, end_idx
, remain_len
;
287 char buf
[MB_LEN_MAX
];
288 DEBUG_ASSERT (pstr
->mb_cur_max
<= MB_LEN_MAX
);
293 byte_idx
= pstr
->valid_len
;
294 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
296 /* The following optimization assumes that ASCII characters can be
297 mapped to wide characters with a simple cast. */
298 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
300 while (byte_idx
< end_idx
)
303 unsigned char ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
305 if (isascii (ch
) && mbsinit (&pstr
->cur_state
))
307 /* The next step uses the assumption that wchar_t is encoded
308 ASCII-safe: all ASCII values can be converted like this. */
309 wchar_t wcu
= __towupper (ch
);
312 pstr
->mbs
[byte_idx
] = wcu
;
313 pstr
->wcs
[byte_idx
] = wcu
;
319 remain_len
= end_idx
- byte_idx
;
320 prev_st
= pstr
->cur_state
;
321 mbclen
= __mbrtowc (&wc
,
322 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
323 + byte_idx
), remain_len
, &pstr
->cur_state
);
324 if (__glibc_likely (0 < mbclen
&& mbclen
< (size_t) -2))
326 wchar_t wcu
= __towupper (wc
);
331 mbcdlen
= __wcrtomb (buf
, wcu
, &prev_st
);
332 if (__glibc_likely (mbclen
== mbcdlen
))
333 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
341 memcpy (pstr
->mbs
+ byte_idx
,
342 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
343 pstr
->wcs
[byte_idx
++] = wcu
;
344 /* Write paddings. */
345 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
346 pstr
->wcs
[byte_idx
++] = WEOF
;
348 else if (mbclen
== (size_t) -1 || mbclen
== 0
349 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
351 /* It is an invalid character, an incomplete character
352 at the end of the string, or '\0'. Just use the byte. */
353 pstr
->mbs
[byte_idx
] = ch
;
354 /* And also cast it to wide char. */
355 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
356 if (__glibc_unlikely (mbclen
== (size_t) -1))
357 pstr
->cur_state
= prev_st
;
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr
->cur_state
= prev_st
;
366 pstr
->valid_len
= byte_idx
;
367 pstr
->valid_raw_len
= byte_idx
;
371 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
376 remain_len
= end_idx
- byte_idx
;
377 prev_st
= pstr
->cur_state
;
378 if (__glibc_unlikely (pstr
->trans
!= NULL
))
382 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
384 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
385 buf
[i
] = pstr
->trans
[ch
];
387 p
= (const char *) buf
;
390 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
391 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
392 if (__glibc_likely (0 < mbclen
&& mbclen
< (size_t) -2))
394 wchar_t wcu
= __towupper (wc
);
399 mbcdlen
= __wcrtomb ((char *) buf
, wcu
, &prev_st
);
400 if (__glibc_likely (mbclen
== mbcdlen
))
401 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
402 else if (mbcdlen
!= (size_t) -1)
406 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
408 pstr
->cur_state
= prev_st
;
412 if (pstr
->offsets
== NULL
)
414 pstr
->offsets
= re_malloc (Idx
, pstr
->bufs_len
);
416 if (pstr
->offsets
== NULL
)
419 if (!pstr
->offsets_needed
)
421 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
422 pstr
->offsets
[i
] = i
;
423 pstr
->offsets_needed
= 1;
426 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
427 pstr
->wcs
[byte_idx
] = wcu
;
428 pstr
->offsets
[byte_idx
] = src_idx
;
429 for (i
= 1; i
< mbcdlen
; ++i
)
431 pstr
->offsets
[byte_idx
+ i
]
432 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
433 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
435 pstr
->len
+= mbcdlen
- mbclen
;
436 if (pstr
->raw_stop
> src_idx
)
437 pstr
->stop
+= mbcdlen
- mbclen
;
438 end_idx
= (pstr
->bufs_len
> pstr
->len
)
439 ? pstr
->len
: pstr
->bufs_len
;
445 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
448 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
450 if (__glibc_unlikely (pstr
->offsets_needed
!= 0))
453 for (i
= 0; i
< mbclen
; ++i
)
454 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
458 pstr
->wcs
[byte_idx
++] = wcu
;
459 /* Write paddings. */
460 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
461 pstr
->wcs
[byte_idx
++] = WEOF
;
463 else if (mbclen
== (size_t) -1 || mbclen
== 0
464 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
466 /* It is an invalid character or '\0'. Just use the byte. */
467 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
469 if (__glibc_unlikely (pstr
->trans
!= NULL
))
470 ch
= pstr
->trans
[ch
];
471 pstr
->mbs
[byte_idx
] = ch
;
473 if (__glibc_unlikely (pstr
->offsets_needed
!= 0))
474 pstr
->offsets
[byte_idx
] = src_idx
;
477 /* And also cast it to wide char. */
478 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
479 if (__glibc_unlikely (mbclen
== (size_t) -1))
480 pstr
->cur_state
= prev_st
;
484 /* The buffer doesn't have enough space, finish to build. */
485 pstr
->cur_state
= prev_st
;
489 pstr
->valid_len
= byte_idx
;
490 pstr
->valid_raw_len
= src_idx
;
494 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
498 re_string_skip_chars (re_string_t
*pstr
, Idx new_raw_idx
, wint_t *last_wc
)
505 /* Skip the characters which are not necessary to check. */
506 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
507 rawbuf_idx
< new_raw_idx
;)
510 Idx remain_len
= pstr
->raw_len
- rawbuf_idx
;
511 prev_st
= pstr
->cur_state
;
512 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
513 remain_len
, &pstr
->cur_state
);
514 if (__glibc_unlikely (mbclen
== (size_t) -2 || mbclen
== (size_t) -1
517 /* We treat these cases as a single byte character. */
518 if (mbclen
== 0 || remain_len
== 0)
521 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
523 pstr
->cur_state
= prev_st
;
527 /* Then proceed the next character. */
528 rawbuf_idx
+= mbclen
;
533 #endif /* RE_ENABLE_I18N */
535 /* Build the buffer PSTR->MBS, and apply the translation if we need.
536 This function is used in case of REG_ICASE. */
539 build_upper_buffer (re_string_t
*pstr
)
541 Idx char_idx
, end_idx
;
542 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
544 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
546 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
547 if (__glibc_unlikely (pstr
->trans
!= NULL
))
548 ch
= pstr
->trans
[ch
];
549 pstr
->mbs
[char_idx
] = toupper (ch
);
551 pstr
->valid_len
= char_idx
;
552 pstr
->valid_raw_len
= char_idx
;
555 /* Apply TRANS to the buffer in PSTR. */
558 re_string_translate_buffer (re_string_t
*pstr
)
560 Idx buf_idx
, end_idx
;
561 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
563 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
565 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
566 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
569 pstr
->valid_len
= buf_idx
;
570 pstr
->valid_raw_len
= buf_idx
;
573 /* This function re-construct the buffers.
574 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575 convert to upper case in case of REG_ICASE, apply translation. */
578 __attribute_warn_unused_result__
579 re_string_reconstruct (re_string_t
*pstr
, Idx idx
, int eflags
)
583 if (__glibc_unlikely (pstr
->raw_mbs_idx
<= idx
))
584 offset
= idx
- pstr
->raw_mbs_idx
;
588 #ifdef RE_ENABLE_I18N
589 if (pstr
->mb_cur_max
> 1)
590 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
591 #endif /* RE_ENABLE_I18N */
592 pstr
->len
= pstr
->raw_len
;
593 pstr
->stop
= pstr
->raw_stop
;
595 pstr
->raw_mbs_idx
= 0;
596 pstr
->valid_raw_len
= 0;
597 pstr
->offsets_needed
= 0;
598 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
599 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
600 if (!pstr
->mbs_allocated
)
601 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
605 if (__glibc_likely (offset
!= 0))
607 /* Should the already checked characters be kept? */
608 if (__glibc_likely (offset
< pstr
->valid_raw_len
))
610 /* Yes, move them to the front of the buffer. */
611 #ifdef RE_ENABLE_I18N
612 if (__glibc_unlikely (pstr
->offsets_needed
))
614 Idx low
= 0, high
= pstr
->valid_len
, mid
;
617 mid
= (high
+ low
) / 2;
618 if (pstr
->offsets
[mid
] > offset
)
620 else if (pstr
->offsets
[mid
] < offset
)
626 if (pstr
->offsets
[mid
] < offset
)
628 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
630 /* This can be quite complicated, so handle specially
631 only the common and easy case where the character with
632 different length representation of lower and upper
633 case is present at or after offset. */
634 if (pstr
->valid_len
> offset
635 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
637 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
638 (pstr
->valid_len
- offset
) * sizeof (wint_t));
639 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
640 pstr
->valid_len
-= offset
;
641 pstr
->valid_raw_len
-= offset
;
642 for (low
= 0; low
< pstr
->valid_len
; low
++)
643 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
647 /* Otherwise, just find out how long the partial multibyte
648 character at offset is and fill it with WEOF/255. */
649 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
650 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
651 pstr
->offsets_needed
= 0;
652 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
654 while (mid
< pstr
->valid_len
)
655 if (pstr
->wcs
[mid
] != WEOF
)
659 if (mid
== pstr
->valid_len
)
663 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
666 for (low
= 0; low
< pstr
->valid_len
; ++low
)
667 pstr
->wcs
[low
] = WEOF
;
668 memset (pstr
->mbs
, 255, pstr
->valid_len
);
671 pstr
->valid_raw_len
= pstr
->valid_len
;
677 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
679 #ifdef RE_ENABLE_I18N
680 if (pstr
->mb_cur_max
> 1)
681 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
682 (pstr
->valid_len
- offset
) * sizeof (wint_t));
683 #endif /* RE_ENABLE_I18N */
684 if (__glibc_unlikely (pstr
->mbs_allocated
))
685 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
686 pstr
->valid_len
- offset
);
687 pstr
->valid_len
-= offset
;
688 pstr
->valid_raw_len
-= offset
;
689 DEBUG_ASSERT (pstr
->valid_len
> 0);
694 #ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 Idx prev_valid_len
= pstr
->valid_len
;
698 if (__glibc_unlikely (pstr
->offsets_needed
))
700 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
701 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
702 pstr
->offsets_needed
= 0;
706 #ifdef RE_ENABLE_I18N
707 if (pstr
->mb_cur_max
> 1)
714 const unsigned char *raw
, *p
, *end
;
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
719 end
= raw
+ (offset
- pstr
->mb_cur_max
);
720 if (end
< pstr
->raw_mbs
)
722 p
= raw
+ offset
- 1;
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p
) && __glibc_likely (pstr
->trans
== NULL
))
728 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
734 for (; p
>= end
; --p
)
735 if ((*p
& 0xc0) != 0x80)
739 Idx mlen
= raw
+ pstr
->len
- p
;
740 unsigned char buf
[6];
743 const unsigned char *pp
= p
;
744 if (__glibc_unlikely (pstr
->trans
!= NULL
))
746 int i
= mlen
< 6 ? mlen
: 6;
748 buf
[i
] = pstr
->trans
[p
[i
]];
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state
, 0, sizeof (cur_state
));
754 mbclen
= __mbrtowc (&wc2
, (const char *) pp
, mlen
,
756 if (raw
+ offset
- p
<= mbclen
757 && mbclen
< (size_t) -2)
759 memset (&pstr
->cur_state
, '\0',
761 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
769 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
772 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
774 pstr
->tip_context
= ((__glibc_unlikely (pstr
->word_ops_used
!= 0)
775 && IS_WIDE_WORD_CHAR (wc
))
777 : ((IS_WIDE_NEWLINE (wc
)
778 && pstr
->newline_anchor
)
779 ? CONTEXT_NEWLINE
: 0));
780 if (__glibc_unlikely (pstr
->valid_len
))
782 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
783 pstr
->wcs
[wcs_idx
] = WEOF
;
784 if (pstr
->mbs_allocated
)
785 memset (pstr
->mbs
, 255, pstr
->valid_len
);
787 pstr
->valid_raw_len
= pstr
->valid_len
;
790 #endif /* RE_ENABLE_I18N */
792 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
793 pstr
->valid_raw_len
= 0;
796 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
798 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
799 ? CONTEXT_NEWLINE
: 0));
802 if (!__glibc_unlikely (pstr
->mbs_allocated
))
805 pstr
->raw_mbs_idx
= idx
;
807 pstr
->stop
-= offset
;
809 /* Then build the buffers. */
810 #ifdef RE_ENABLE_I18N
811 if (pstr
->mb_cur_max
> 1)
815 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
816 if (__glibc_unlikely (ret
!= REG_NOERROR
))
820 build_wcs_buffer (pstr
);
823 #endif /* RE_ENABLE_I18N */
824 if (__glibc_unlikely (pstr
->mbs_allocated
))
827 build_upper_buffer (pstr
);
828 else if (pstr
->trans
!= NULL
)
829 re_string_translate_buffer (pstr
);
832 pstr
->valid_len
= pstr
->len
;
839 __attribute__ ((pure
))
840 re_string_peek_byte_case (const re_string_t
*pstr
, Idx idx
)
845 /* Handle the common (easiest) cases first. */
846 if (__glibc_likely (!pstr
->mbs_allocated
))
847 return re_string_peek_byte (pstr
, idx
);
849 #ifdef RE_ENABLE_I18N
850 if (pstr
->mb_cur_max
> 1
851 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
852 return re_string_peek_byte (pstr
, idx
);
855 off
= pstr
->cur_idx
+ idx
;
856 #ifdef RE_ENABLE_I18N
857 if (pstr
->offsets_needed
)
858 off
= pstr
->offsets
[off
];
861 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
863 #ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr
->offsets_needed
&& !isascii (ch
))
869 return re_string_peek_byte (pstr
, idx
);
876 re_string_fetch_byte_case (re_string_t
*pstr
)
878 if (__glibc_likely (!pstr
->mbs_allocated
))
879 return re_string_fetch_byte (pstr
);
881 #ifdef RE_ENABLE_I18N
882 if (pstr
->offsets_needed
)
887 /* For tr_TR.UTF-8 [[:islower:]] there is
888 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
889 in that case the whole multi-byte character and return
890 the original letter. On the other side, with
891 [[: DOTLESS SMALL LETTER I return [[:I, as doing
892 anything else would complicate things too much. */
894 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
895 return re_string_fetch_byte (pstr
);
897 off
= pstr
->offsets
[pstr
->cur_idx
];
898 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
901 return re_string_fetch_byte (pstr
);
903 re_string_skip_bytes (pstr
,
904 re_string_char_size_at (pstr
, pstr
->cur_idx
));
909 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
913 re_string_destruct (re_string_t
*pstr
)
915 #ifdef RE_ENABLE_I18N
917 re_free (pstr
->offsets
);
918 #endif /* RE_ENABLE_I18N */
919 if (pstr
->mbs_allocated
)
923 /* Return the context at IDX in INPUT. */
926 re_string_context_at (const re_string_t
*input
, Idx idx
, int eflags
)
929 if (__glibc_unlikely (idx
< 0))
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input
->tip_context
;
933 if (__glibc_unlikely (idx
== input
->len
))
934 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
936 #ifdef RE_ENABLE_I18N
937 if (input
->mb_cur_max
> 1)
941 while(input
->wcs
[wc_idx
] == WEOF
)
943 DEBUG_ASSERT (wc_idx
>= 0);
946 return input
->tip_context
;
948 wc
= input
->wcs
[wc_idx
];
949 if (__glibc_unlikely (input
->word_ops_used
!= 0)
950 && IS_WIDE_WORD_CHAR (wc
))
952 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
953 ? CONTEXT_NEWLINE
: 0);
958 c
= re_string_byte_at (input
, idx
);
959 if (bitset_contain (input
->word_char
, c
))
961 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
965 /* Functions for set operation. */
968 __attribute_warn_unused_result__
969 re_node_set_alloc (re_node_set
*set
, Idx size
)
973 set
->elems
= re_malloc (Idx
, size
);
974 if (__glibc_unlikely (set
->elems
== NULL
)
975 && (MALLOC_0_IS_NONNULL
|| size
!= 0))
981 __attribute_warn_unused_result__
982 re_node_set_init_1 (re_node_set
*set
, Idx elem
)
986 set
->elems
= re_malloc (Idx
, 1);
987 if (__glibc_unlikely (set
->elems
== NULL
))
989 set
->alloc
= set
->nelem
= 0;
992 set
->elems
[0] = elem
;
997 __attribute_warn_unused_result__
998 re_node_set_init_2 (re_node_set
*set
, Idx elem1
, Idx elem2
)
1001 set
->elems
= re_malloc (Idx
, 2);
1002 if (__glibc_unlikely (set
->elems
== NULL
))
1007 set
->elems
[0] = elem1
;
1014 set
->elems
[0] = elem1
;
1015 set
->elems
[1] = elem2
;
1019 set
->elems
[0] = elem2
;
1020 set
->elems
[1] = elem1
;
1026 static reg_errcode_t
1027 __attribute_warn_unused_result__
1028 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1030 dest
->nelem
= src
->nelem
;
1033 dest
->alloc
= dest
->nelem
;
1034 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1035 if (__glibc_unlikely (dest
->elems
== NULL
))
1037 dest
->alloc
= dest
->nelem
= 0;
1040 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1043 re_node_set_init_empty (dest
);
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051 static reg_errcode_t
1052 __attribute_warn_unused_result__
1053 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1054 const re_node_set
*src2
)
1056 Idx i1
, i2
, is
, id
, delta
, sbase
;
1057 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1060 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061 conservative estimate. */
1062 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1064 Idx new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1065 Idx
*new_elems
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1066 if (__glibc_unlikely (new_elems
== NULL
))
1068 dest
->elems
= new_elems
;
1069 dest
->alloc
= new_alloc
;
1072 /* Find the items in the intersection of SRC1 and SRC2, and copy
1073 into the top of DEST those that are not already in DEST itself. */
1074 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1075 i1
= src1
->nelem
- 1;
1076 i2
= src2
->nelem
- 1;
1077 id
= dest
->nelem
- 1;
1080 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1082 /* Try to find the item in DEST. Maybe we could binary search? */
1083 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1086 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1087 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1089 if (--i1
< 0 || --i2
< 0)
1093 /* Lower the highest of the two items. */
1094 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1106 id
= dest
->nelem
- 1;
1107 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1108 delta
= is
- sbase
+ 1;
1110 /* Now copy. When DELTA becomes zero, the remaining
1111 DEST elements are already in place; this is more or
1112 less the same loop that is in re_node_set_merge. */
1113 dest
->nelem
+= delta
;
1114 if (delta
> 0 && id
>= 0)
1117 if (dest
->elems
[is
] > dest
->elems
[id
])
1119 /* Copy from the top. */
1120 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1126 /* Slide from the bottom. */
1127 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1133 /* Copy remaining SRC elements. */
1134 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (Idx
));
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142 static reg_errcode_t
1143 __attribute_warn_unused_result__
1144 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1145 const re_node_set
*src2
)
1148 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1150 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1151 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1152 if (__glibc_unlikely (dest
->elems
== NULL
))
1157 if (src1
!= NULL
&& src1
->nelem
> 0)
1158 return re_node_set_init_copy (dest
, src1
);
1159 else if (src2
!= NULL
&& src2
->nelem
> 0)
1160 return re_node_set_init_copy (dest
, src2
);
1162 re_node_set_init_empty (dest
);
1165 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1167 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1169 dest
->elems
[id
++] = src2
->elems
[i2
++];
1172 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1174 dest
->elems
[id
++] = src1
->elems
[i1
++];
1176 if (i1
< src1
->nelem
)
1178 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1179 (src1
->nelem
- i1
) * sizeof (Idx
));
1180 id
+= src1
->nelem
- i1
;
1182 else if (i2
< src2
->nelem
)
1184 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1185 (src2
->nelem
- i2
) * sizeof (Idx
));
1186 id
+= src2
->nelem
- i2
;
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195 static reg_errcode_t
1196 __attribute_warn_unused_result__
1197 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1199 Idx is
, id
, sbase
, delta
;
1200 if (src
== NULL
|| src
->nelem
== 0)
1202 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1204 Idx new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1205 Idx
*new_buffer
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1206 if (__glibc_unlikely (new_buffer
== NULL
))
1208 dest
->elems
= new_buffer
;
1209 dest
->alloc
= new_alloc
;
1212 if (__glibc_unlikely (dest
->nelem
== 0))
1214 /* Although we already guaranteed above that dest->alloc != 0 and
1215 therefore dest->elems != NULL, add a debug assertion to pacify
1216 GCC 11.2.1's -fanalyzer. */
1217 DEBUG_ASSERT (dest
->elems
);
1218 dest
->nelem
= src
->nelem
;
1219 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1223 /* Copy into the top of DEST the items of SRC that are not
1224 found in DEST. Maybe we could binary search in DEST? */
1225 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1226 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1228 if (dest
->elems
[id
] == src
->elems
[is
])
1230 else if (dest
->elems
[id
] < src
->elems
[is
])
1231 dest
->elems
[--sbase
] = src
->elems
[is
--];
1232 else /* if (dest->elems[id] > src->elems[is]) */
1238 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1240 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (Idx
));
1243 id
= dest
->nelem
- 1;
1244 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1245 delta
= is
- sbase
+ 1;
1249 /* Now copy. When DELTA becomes zero, the remaining
1250 DEST elements are already in place. */
1251 dest
->nelem
+= delta
;
1254 if (dest
->elems
[is
] > dest
->elems
[id
])
1256 /* Copy from the top. */
1257 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1263 /* Slide from the bottom. */
1264 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1267 /* Copy remaining SRC elements. */
1268 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1269 delta
* sizeof (Idx
));
1278 /* Insert the new element ELEM to the re_node_set* SET.
1279 SET should not already have ELEM.
1280 Return true if successful. */
1283 __attribute_warn_unused_result__
1284 re_node_set_insert (re_node_set
*set
, Idx elem
)
1287 /* In case the set is empty. */
1288 if (set
->alloc
== 0)
1289 return __glibc_likely (re_node_set_init_1 (set
, elem
) == REG_NOERROR
);
1291 if (__glibc_unlikely (set
->nelem
) == 0)
1293 /* Although we already guaranteed above that set->alloc != 0 and
1294 therefore set->elems != NULL, add a debug assertion to pacify
1295 GCC 11.2 -fanalyzer. */
1296 DEBUG_ASSERT (set
->elems
);
1297 set
->elems
[0] = elem
;
1302 /* Realloc if we need. */
1303 if (set
->alloc
== set
->nelem
)
1306 set
->alloc
= set
->alloc
* 2;
1307 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1308 if (__glibc_unlikely (new_elems
== NULL
))
1310 set
->elems
= new_elems
;
1313 /* Move the elements which follows the new element. Test the
1314 first element separately to skip a check in the inner loop. */
1315 if (elem
< set
->elems
[0])
1317 for (idx
= set
->nelem
; idx
> 0; idx
--)
1318 set
->elems
[idx
] = set
->elems
[idx
- 1];
1322 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1323 set
->elems
[idx
] = set
->elems
[idx
- 1];
1324 DEBUG_ASSERT (set
->elems
[idx
- 1] < elem
);
1327 /* Insert the new element. */
1328 set
->elems
[idx
] = elem
;
1333 /* Insert the new element ELEM to the re_node_set* SET.
1334 SET should not already have any element greater than or equal to ELEM.
1335 Return true if successful. */
1338 __attribute_warn_unused_result__
1339 re_node_set_insert_last (re_node_set
*set
, Idx elem
)
1341 /* Realloc if we need. */
1342 if (set
->alloc
== set
->nelem
)
1345 set
->alloc
= (set
->alloc
+ 1) * 2;
1346 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1347 if (__glibc_unlikely (new_elems
== NULL
))
1349 set
->elems
= new_elems
;
1352 /* Insert the new element. */
1353 set
->elems
[set
->nelem
++] = elem
;
1357 /* Compare two node sets SET1 and SET2.
1358 Return true if SET1 and SET2 are equivalent. */
1361 __attribute__ ((pure
))
1362 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1365 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1367 for (i
= set1
->nelem
; --i
>= 0 ; )
1368 if (set1
->elems
[i
] != set2
->elems
[i
])
1373 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1376 __attribute__ ((pure
))
1377 re_node_set_contains (const re_node_set
*set
, Idx elem
)
1379 __re_size_t idx
, right
, mid
;
1380 if (set
->nelem
<= 0)
1383 /* Binary search the element. */
1385 right
= set
->nelem
- 1;
1388 mid
= (idx
+ right
) / 2;
1389 if (set
->elems
[mid
] < elem
)
1394 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1398 re_node_set_remove_at (re_node_set
*set
, Idx idx
)
1400 if (idx
< 0 || idx
>= set
->nelem
)
1403 for (; idx
< set
->nelem
; idx
++)
1404 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1408 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1409 Or return -1 if an error occurred. */
1412 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1414 if (__glibc_unlikely (dfa
->nodes_len
>= dfa
->nodes_alloc
))
1416 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1417 Idx
*new_nexts
, *new_indices
;
1418 re_node_set
*new_edests
, *new_eclosures
;
1419 re_token_t
*new_nodes
;
1421 /* Avoid overflows in realloc. */
1422 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1423 MAX (sizeof (re_node_set
),
1425 if (__glibc_unlikely (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
)
1429 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1430 if (__glibc_unlikely (new_nodes
== NULL
))
1432 dfa
->nodes
= new_nodes
;
1433 new_nexts
= re_realloc (dfa
->nexts
, Idx
, new_nodes_alloc
);
1434 new_indices
= re_realloc (dfa
->org_indices
, Idx
, new_nodes_alloc
);
1435 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1436 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1437 if (__glibc_unlikely (new_nexts
== NULL
|| new_indices
== NULL
1438 || new_edests
== NULL
|| new_eclosures
== NULL
))
1440 re_free (new_nexts
);
1441 re_free (new_indices
);
1442 re_free (new_edests
);
1443 re_free (new_eclosures
);
1446 dfa
->nexts
= new_nexts
;
1447 dfa
->org_indices
= new_indices
;
1448 dfa
->edests
= new_edests
;
1449 dfa
->eclosures
= new_eclosures
;
1450 dfa
->nodes_alloc
= new_nodes_alloc
;
1452 dfa
->nodes
[dfa
->nodes_len
] = token
;
1453 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1454 #ifdef RE_ENABLE_I18N
1455 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1456 ((token
.type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1)
1457 || token
.type
== COMPLEX_BRACKET
);
1459 dfa
->nexts
[dfa
->nodes_len
] = -1;
1460 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1461 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1462 return dfa
->nodes_len
++;
1466 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1468 re_hashval_t hash
= nodes
->nelem
+ context
;
1470 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1471 hash
+= nodes
->elems
[i
];
1475 /* Search for the state whose node_set is equivalent to NODES.
1476 Return the pointer to the state, if we found it in the DFA.
1477 Otherwise create the new one and return it. In case of an error
1478 return NULL and set the error code in ERR.
1479 Note: - We assume NULL as the invalid state, then it is possible that
1480 return value is NULL and ERR is REG_NOERROR.
1481 - We never return non-NULL value in case of any errors, it is for
1484 static re_dfastate_t
*
1485 __attribute_warn_unused_result__
1486 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1487 const re_node_set
*nodes
)
1490 re_dfastate_t
*new_state
;
1491 struct re_state_table_entry
*spot
;
1493 #if defined GCC_LINT || defined lint
1494 /* Suppress bogus uninitialized-variable warnings. */
1497 if (__glibc_unlikely (nodes
->nelem
== 0))
1502 hash
= calc_state_hash (nodes
, 0);
1503 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1505 for (i
= 0 ; i
< spot
->num
; i
++)
1507 re_dfastate_t
*state
= spot
->array
[i
];
1508 if (hash
!= state
->hash
)
1510 if (re_node_set_compare (&state
->nodes
, nodes
))
1514 /* There are no appropriate state in the dfa, create the new one. */
1515 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1516 if (__glibc_unlikely (new_state
== NULL
))
1522 /* Search for the state whose node_set is equivalent to NODES and
1523 whose context is equivalent to CONTEXT.
1524 Return the pointer to the state, if we found it in the DFA.
1525 Otherwise create the new one and return it. In case of an error
1526 return NULL and set the error code in ERR.
1527 Note: - We assume NULL as the invalid state, then it is possible that
1528 return value is NULL and ERR is REG_NOERROR.
1529 - We never return non-NULL value in case of any errors, it is for
1532 static re_dfastate_t
*
1533 __attribute_warn_unused_result__
1534 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1535 const re_node_set
*nodes
, unsigned int context
)
1538 re_dfastate_t
*new_state
;
1539 struct re_state_table_entry
*spot
;
1541 #if defined GCC_LINT || defined lint
1542 /* Suppress bogus uninitialized-variable warnings. */
1545 if (nodes
->nelem
== 0)
1550 hash
= calc_state_hash (nodes
, context
);
1551 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1553 for (i
= 0 ; i
< spot
->num
; i
++)
1555 re_dfastate_t
*state
= spot
->array
[i
];
1556 if (state
->hash
== hash
1557 && state
->context
== context
1558 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1561 /* There are no appropriate state in 'dfa', create the new one. */
1562 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1563 if (__glibc_unlikely (new_state
== NULL
))
1569 /* Finish initialization of the new state NEWSTATE, and using its hash value
1570 HASH put in the appropriate bucket of DFA's state table. Return value
1571 indicates the error code if failed. */
1573 static reg_errcode_t
1574 __attribute_warn_unused_result__
1575 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1578 struct re_state_table_entry
*spot
;
1582 newstate
->hash
= hash
;
1583 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1584 if (__glibc_unlikely (err
!= REG_NOERROR
))
1586 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1588 Idx elem
= newstate
->nodes
.elems
[i
];
1589 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1590 if (! re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
))
1594 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1595 if (__glibc_unlikely (spot
->alloc
<= spot
->num
))
1597 Idx new_alloc
= 2 * spot
->num
+ 2;
1598 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1600 if (__glibc_unlikely (new_array
== NULL
))
1602 spot
->array
= new_array
;
1603 spot
->alloc
= new_alloc
;
1605 spot
->array
[spot
->num
++] = newstate
;
1610 free_state (re_dfastate_t
*state
)
1612 re_node_set_free (&state
->non_eps_nodes
);
1613 re_node_set_free (&state
->inveclosure
);
1614 if (state
->entrance_nodes
!= &state
->nodes
)
1616 re_node_set_free (state
->entrance_nodes
);
1617 re_free (state
->entrance_nodes
);
1619 re_node_set_free (&state
->nodes
);
1620 re_free (state
->word_trtable
);
1621 re_free (state
->trtable
);
1625 /* Create the new state which is independent of contexts.
1626 Return the new state if succeeded, otherwise return NULL. */
1628 static re_dfastate_t
*
1629 __attribute_warn_unused_result__
1630 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1635 re_dfastate_t
*newstate
;
1637 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1638 if (__glibc_unlikely (newstate
== NULL
))
1640 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1641 if (__glibc_unlikely (err
!= REG_NOERROR
))
1647 newstate
->entrance_nodes
= &newstate
->nodes
;
1648 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1650 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1651 re_token_type_t type
= node
->type
;
1652 if (type
== CHARACTER
&& !node
->constraint
)
1654 #ifdef RE_ENABLE_I18N
1655 newstate
->accept_mb
|= node
->accept_mb
;
1656 #endif /* RE_ENABLE_I18N */
1658 /* If the state has the halt node, the state is a halt state. */
1659 if (type
== END_OF_RE
)
1661 else if (type
== OP_BACK_REF
)
1662 newstate
->has_backref
= 1;
1663 else if (type
== ANCHOR
|| node
->constraint
)
1664 newstate
->has_constraint
= 1;
1666 err
= register_state (dfa
, newstate
, hash
);
1667 if (__glibc_unlikely (err
!= REG_NOERROR
))
1669 free_state (newstate
);
1675 /* Create the new state which is depend on the context CONTEXT.
1676 Return the new state if succeeded, otherwise return NULL. */
1678 static re_dfastate_t
*
1679 __attribute_warn_unused_result__
1680 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1681 unsigned int context
, re_hashval_t hash
)
1683 Idx i
, nctx_nodes
= 0;
1685 re_dfastate_t
*newstate
;
1687 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1688 if (__glibc_unlikely (newstate
== NULL
))
1690 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1691 if (__glibc_unlikely (err
!= REG_NOERROR
))
1697 newstate
->context
= context
;
1698 newstate
->entrance_nodes
= &newstate
->nodes
;
1700 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1702 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1703 re_token_type_t type
= node
->type
;
1704 unsigned int constraint
= node
->constraint
;
1706 if (type
== CHARACTER
&& !constraint
)
1708 #ifdef RE_ENABLE_I18N
1709 newstate
->accept_mb
|= node
->accept_mb
;
1710 #endif /* RE_ENABLE_I18N */
1712 /* If the state has the halt node, the state is a halt state. */
1713 if (type
== END_OF_RE
)
1715 else if (type
== OP_BACK_REF
)
1716 newstate
->has_backref
= 1;
1720 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1722 re_node_set
*entrance_nodes
= re_malloc (re_node_set
, 1);
1723 if (__glibc_unlikely (entrance_nodes
== NULL
))
1725 free_state (newstate
);
1728 newstate
->entrance_nodes
= entrance_nodes
;
1729 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1732 free_state (newstate
);
1736 newstate
->has_constraint
= 1;
1739 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1741 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1746 err
= register_state (dfa
, newstate
, hash
);
1747 if (__glibc_unlikely (err
!= REG_NOERROR
))
1749 free_state (newstate
);