arm: Implement memchr ifunc selection in C
[glibc.git] / posix / regex_internal.c
blob111231c198b6c6d099b39b0e2ff219439d9d65f9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str, int len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa);
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 unsigned int hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 unsigned int hash);
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
37 static reg_errcode_t
38 __attribute_warn_unused_result__
39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
40 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
42 reg_errcode_t ret;
43 int init_buf_len;
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
63 /* This function allocate the buffers, and initialize them. */
65 static reg_errcode_t
66 __attribute_warn_unused_result__
67 re_string_construct (re_string_t *pstr, const char *str, int len,
68 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
74 if (len > 0)
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82 if (icase)
84 #ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
87 while (1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
101 else
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
105 else
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111 #endif /* RE_ENABLE_I18N */
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
123 return REG_NOERROR;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
128 static reg_errcode_t
129 __attribute_warn_unused_result__
130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
135 wint_t *new_wcs;
137 /* Avoid overflow in realloc. */
138 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
139 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
140 return REG_ESPACE;
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
144 return REG_ESPACE;
145 pstr->wcs = new_wcs;
146 if (pstr->offsets != NULL)
148 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
150 return REG_ESPACE;
151 pstr->offsets = new_offsets;
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 new_buf_len);
159 if (BE (new_mbs == NULL, 0))
160 return REG_ESPACE;
161 pstr->mbs = new_mbs;
163 pstr->bufs_len = new_buf_len;
164 return REG_NOERROR;
168 static void
169 re_string_construct_common (const char *str, int len, re_string_t *pstr,
170 RE_TRANSLATE_TYPE trans, int icase,
171 const re_dfa_t *dfa)
173 pstr->raw_mbs = (const unsigned char *) str;
174 pstr->len = len;
175 pstr->raw_len = len;
176 pstr->trans = trans;
177 pstr->icase = icase ? 1 : 0;
178 pstr->mbs_allocated = (trans != NULL || icase);
179 pstr->mb_cur_max = dfa->mb_cur_max;
180 pstr->is_utf8 = dfa->is_utf8;
181 pstr->map_notascii = dfa->map_notascii;
182 pstr->stop = pstr->len;
183 pstr->raw_stop = pstr->stop;
186 #ifdef RE_ENABLE_I18N
188 /* Build wide character buffer PSTR->WCS.
189 If the byte sequence of the string are:
190 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
191 Then wide character buffer will be:
192 <wc1> , WEOF , <wc2> , WEOF , <wc3>
193 We use WEOF for padding, they indicate that the position isn't
194 a first byte of a multibyte character.
196 Note that this function assumes PSTR->VALID_LEN elements are already
197 built and starts from PSTR->VALID_LEN. */
199 static void
200 build_wcs_buffer (re_string_t *pstr)
202 #ifdef _LIBC
203 unsigned char buf[MB_LEN_MAX];
204 assert (MB_LEN_MAX >= pstr->mb_cur_max);
205 #else
206 unsigned char buf[64];
207 #endif
208 mbstate_t prev_st;
209 int byte_idx, end_idx, remain_len;
210 size_t mbclen;
212 /* Build the buffers from pstr->valid_len to either pstr->len or
213 pstr->bufs_len. */
214 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
215 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
217 wchar_t wc;
218 const char *p;
220 remain_len = end_idx - byte_idx;
221 prev_st = pstr->cur_state;
222 /* Apply the translation if we need. */
223 if (BE (pstr->trans != NULL, 0))
225 int i, ch;
227 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
229 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
230 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
232 p = (const char *) buf;
234 else
235 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
236 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
237 if (BE (mbclen == (size_t) -1 || mbclen == 0
238 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
240 /* We treat these cases as a singlebyte character. */
241 mbclen = 1;
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
247 else if (BE (mbclen == (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr->cur_state = prev_st;
251 break;
254 /* Write wide character and padding. */
255 pstr->wcs[byte_idx++] = wc;
256 /* Write paddings. */
257 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
258 pstr->wcs[byte_idx++] = WEOF;
260 pstr->valid_len = byte_idx;
261 pstr->valid_raw_len = byte_idx;
264 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
265 but for REG_ICASE. */
267 static reg_errcode_t
268 __attribute_warn_unused_result__
269 build_wcs_upper_buffer (re_string_t *pstr)
271 mbstate_t prev_st;
272 int src_idx, byte_idx, end_idx, remain_len;
273 size_t mbclen;
274 #ifdef _LIBC
275 char buf[MB_LEN_MAX];
276 assert (MB_LEN_MAX >= pstr->mb_cur_max);
277 #else
278 char buf[64];
279 #endif
281 byte_idx = pstr->valid_len;
282 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
284 /* The following optimization assumes that ASCII characters can be
285 mapped to wide characters with a simple cast. */
286 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
288 while (byte_idx < end_idx)
290 wchar_t wc;
292 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
293 && mbsinit (&pstr->cur_state))
295 /* In case of a singlebyte character. */
296 pstr->mbs[byte_idx]
297 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
298 /* The next step uses the assumption that wchar_t is encoded
299 ASCII-safe: all ASCII values can be converted like this. */
300 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
301 ++byte_idx;
302 continue;
305 remain_len = end_idx - byte_idx;
306 prev_st = pstr->cur_state;
307 mbclen = __mbrtowc (&wc,
308 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
309 + byte_idx), remain_len, &pstr->cur_state);
310 if (BE (mbclen + 2 > 2, 1))
312 wchar_t wcu = wc;
313 if (__iswlower (wc))
315 size_t mbcdlen;
317 wcu = __towupper (wc);
318 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
319 if (BE (mbclen == mbcdlen, 1))
320 memcpy (pstr->mbs + byte_idx, buf, mbclen);
321 else
323 src_idx = byte_idx;
324 goto offsets_needed;
327 else
328 memcpy (pstr->mbs + byte_idx,
329 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
330 pstr->wcs[byte_idx++] = wcu;
331 /* Write paddings. */
332 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
333 pstr->wcs[byte_idx++] = WEOF;
335 else if (mbclen == (size_t) -1 || mbclen == 0
336 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338 /* It is an invalid character, an incomplete character
339 at the end of the string, or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
347 else
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
351 break;
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
356 return REG_NOERROR;
358 else
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361 wchar_t wc;
362 const char *p;
363 offsets_needed:
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
368 int i, ch;
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
375 p = (const char *) buf;
377 else
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen + 2 > 2, 1))
382 wchar_t wcu = wc;
383 if (__iswlower (wc))
385 size_t mbcdlen;
387 wcu = __towupper (wc);
388 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
393 size_t i;
395 if (byte_idx + mbcdlen > pstr->bufs_len)
397 pstr->cur_state = prev_st;
398 break;
401 if (pstr->offsets == NULL)
403 pstr->offsets = re_malloc (int, pstr->bufs_len);
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
408 if (!pstr->offsets_needed)
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 if (BE (pstr->offsets_needed != 0, 0))
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
445 src_idx += mbclen;
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
452 else if (mbclen == (size_t) -1 || mbclen == 0
453 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
455 /* It is an invalid character or '\0'. Just use the byte. */
456 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
458 if (BE (pstr->trans != NULL, 0))
459 ch = pstr->trans [ch];
460 pstr->mbs[byte_idx] = ch;
462 if (BE (pstr->offsets_needed != 0, 0))
463 pstr->offsets[byte_idx] = src_idx;
464 ++src_idx;
466 /* And also cast it to wide char. */
467 pstr->wcs[byte_idx++] = (wchar_t) ch;
468 if (BE (mbclen == (size_t) -1, 0))
469 pstr->cur_state = prev_st;
471 else
473 /* The buffer doesn't have enough space, finish to build. */
474 pstr->cur_state = prev_st;
475 break;
478 pstr->valid_len = byte_idx;
479 pstr->valid_raw_len = src_idx;
480 return REG_NOERROR;
483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
484 Return the index. */
486 static int
487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
489 mbstate_t prev_st;
490 int rawbuf_idx;
491 size_t mbclen;
492 wint_t wc = WEOF;
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
498 wchar_t wc2;
499 int remain_len = pstr->raw_len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE ((ssize_t) mbclen <= 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen == 0 || remain_len == 0)
507 wc = L'\0';
508 else
509 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
510 mbclen = 1;
511 pstr->cur_state = prev_st;
513 else
514 wc = (wint_t) wc2;
515 /* Then proceed the next character. */
516 rawbuf_idx += mbclen;
518 *last_wc = wc;
519 return rawbuf_idx;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
526 static void
527 build_upper_buffer (re_string_t *pstr)
529 int char_idx, end_idx;
530 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
532 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
534 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
535 if (BE (pstr->trans != NULL, 0))
536 ch = pstr->trans[ch];
537 if (islower (ch))
538 pstr->mbs[char_idx] = toupper (ch);
539 else
540 pstr->mbs[char_idx] = ch;
542 pstr->valid_len = char_idx;
543 pstr->valid_raw_len = char_idx;
546 /* Apply TRANS to the buffer in PSTR. */
548 static void
549 re_string_translate_buffer (re_string_t *pstr)
551 int buf_idx, end_idx;
552 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
554 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
556 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557 pstr->mbs[buf_idx] = pstr->trans[ch];
560 pstr->valid_len = buf_idx;
561 pstr->valid_raw_len = buf_idx;
564 /* This function re-construct the buffers.
565 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566 convert to upper case in case of REG_ICASE, apply translation. */
568 static reg_errcode_t
569 __attribute_warn_unused_result__
570 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
572 int offset = idx - pstr->raw_mbs_idx;
573 if (BE (offset < 0, 0))
575 /* Reset buffer. */
576 #ifdef RE_ENABLE_I18N
577 if (pstr->mb_cur_max > 1)
578 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
579 #endif /* RE_ENABLE_I18N */
580 pstr->len = pstr->raw_len;
581 pstr->stop = pstr->raw_stop;
582 pstr->valid_len = 0;
583 pstr->raw_mbs_idx = 0;
584 pstr->valid_raw_len = 0;
585 pstr->offsets_needed = 0;
586 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
587 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
588 if (!pstr->mbs_allocated)
589 pstr->mbs = (unsigned char *) pstr->raw_mbs;
590 offset = idx;
593 if (BE (offset != 0, 1))
595 /* Should the already checked characters be kept? */
596 if (BE (offset < pstr->valid_raw_len, 1))
598 /* Yes, move them to the front of the buffer. */
599 #ifdef RE_ENABLE_I18N
600 if (BE (pstr->offsets_needed, 0))
602 int low = 0, high = pstr->valid_len, mid;
605 mid = (high + low) / 2;
606 if (pstr->offsets[mid] > offset)
607 high = mid;
608 else if (pstr->offsets[mid] < offset)
609 low = mid + 1;
610 else
611 break;
613 while (low < high);
614 if (pstr->offsets[mid] < offset)
615 ++mid;
616 pstr->tip_context = re_string_context_at (pstr, mid - 1,
617 eflags);
618 /* This can be quite complicated, so handle specially
619 only the common and easy case where the character with
620 different length representation of lower and upper
621 case is present at or after offset. */
622 if (pstr->valid_len > offset
623 && mid == offset && pstr->offsets[mid] == offset)
625 memmove (pstr->wcs, pstr->wcs + offset,
626 (pstr->valid_len - offset) * sizeof (wint_t));
627 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
628 pstr->valid_len -= offset;
629 pstr->valid_raw_len -= offset;
630 for (low = 0; low < pstr->valid_len; low++)
631 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
633 else
635 /* Otherwise, just find out how long the partial multibyte
636 character at offset is and fill it with WEOF/255. */
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
640 while (mid > 0 && pstr->offsets[mid - 1] == offset)
641 --mid;
642 while (mid < pstr->valid_len)
643 if (pstr->wcs[mid] != WEOF)
644 break;
645 else
646 ++mid;
647 if (mid == pstr->valid_len)
648 pstr->valid_len = 0;
649 else
651 pstr->valid_len = pstr->offsets[mid] - offset;
652 if (pstr->valid_len)
654 for (low = 0; low < pstr->valid_len; ++low)
655 pstr->wcs[low] = WEOF;
656 memset (pstr->mbs, 255, pstr->valid_len);
659 pstr->valid_raw_len = pstr->valid_len;
662 else
663 #endif
665 pstr->tip_context = re_string_context_at (pstr, offset - 1,
666 eflags);
667 #ifdef RE_ENABLE_I18N
668 if (pstr->mb_cur_max > 1)
669 memmove (pstr->wcs, pstr->wcs + offset,
670 (pstr->valid_len - offset) * sizeof (wint_t));
671 #endif /* RE_ENABLE_I18N */
672 if (BE (pstr->mbs_allocated, 0))
673 memmove (pstr->mbs, pstr->mbs + offset,
674 pstr->valid_len - offset);
675 pstr->valid_len -= offset;
676 pstr->valid_raw_len -= offset;
677 #if defined DEBUG && DEBUG
678 assert (pstr->valid_len > 0);
679 #endif
682 else
684 /* No, skip all characters until IDX. */
685 int prev_valid_len = pstr->valid_len;
687 #ifdef RE_ENABLE_I18N
688 if (BE (pstr->offsets_needed, 0))
690 pstr->len = pstr->raw_len - idx + offset;
691 pstr->stop = pstr->raw_stop - idx + offset;
692 pstr->offsets_needed = 0;
694 #endif
695 pstr->valid_len = 0;
696 #ifdef RE_ENABLE_I18N
697 if (pstr->mb_cur_max > 1)
699 int wcs_idx;
700 wint_t wc = WEOF;
702 if (pstr->is_utf8)
704 const unsigned char *raw, *p, *end;
706 /* Special case UTF-8. Multi-byte chars start with any
707 byte other than 0x80 - 0xbf. */
708 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
709 end = raw + (offset - pstr->mb_cur_max);
710 if (end < pstr->raw_mbs)
711 end = pstr->raw_mbs;
712 p = raw + offset - 1;
713 #ifdef _LIBC
714 /* We know the wchar_t encoding is UCS4, so for the simple
715 case, ASCII characters, skip the conversion step. */
716 if (isascii (*p) && BE (pstr->trans == NULL, 1))
718 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
719 /* pstr->valid_len = 0; */
720 wc = (wchar_t) *p;
722 else
723 #endif
724 for (; p >= end; --p)
725 if ((*p & 0xc0) != 0x80)
727 mbstate_t cur_state;
728 wchar_t wc2;
729 int mlen = raw + pstr->len - p;
730 unsigned char buf[6];
731 size_t mbclen;
733 const unsigned char *pp = p;
734 if (BE (pstr->trans != NULL, 0))
736 int i = mlen < 6 ? mlen : 6;
737 while (--i >= 0)
738 buf[i] = pstr->trans[p[i]];
739 pp = buf;
741 /* XXX Don't use mbrtowc, we know which conversion
742 to use (UTF-8 -> UCS4). */
743 memset (&cur_state, 0, sizeof (cur_state));
744 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
745 &cur_state);
746 if (raw + offset - p <= mbclen
747 && mbclen < (size_t) -2)
749 memset (&pstr->cur_state, '\0',
750 sizeof (mbstate_t));
751 pstr->valid_len = mbclen - (raw + offset - p);
752 wc = wc2;
754 break;
758 if (wc == WEOF)
759 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
760 if (wc == WEOF)
761 pstr->tip_context
762 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
763 else
764 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
765 && IS_WIDE_WORD_CHAR (wc))
766 ? CONTEXT_WORD
767 : ((IS_WIDE_NEWLINE (wc)
768 && pstr->newline_anchor)
769 ? CONTEXT_NEWLINE : 0));
770 if (BE (pstr->valid_len, 0))
772 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
773 pstr->wcs[wcs_idx] = WEOF;
774 if (pstr->mbs_allocated)
775 memset (pstr->mbs, 255, pstr->valid_len);
777 pstr->valid_raw_len = pstr->valid_len;
779 else
780 #endif /* RE_ENABLE_I18N */
782 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
783 pstr->valid_raw_len = 0;
784 if (pstr->trans)
785 c = pstr->trans[c];
786 pstr->tip_context = (bitset_contain (pstr->word_char, c)
787 ? CONTEXT_WORD
788 : ((IS_NEWLINE (c) && pstr->newline_anchor)
789 ? CONTEXT_NEWLINE : 0));
792 if (!BE (pstr->mbs_allocated, 0))
793 pstr->mbs += offset;
795 pstr->raw_mbs_idx = idx;
796 pstr->len -= offset;
797 pstr->stop -= offset;
799 /* Then build the buffers. */
800 #ifdef RE_ENABLE_I18N
801 if (pstr->mb_cur_max > 1)
803 if (pstr->icase)
805 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
806 if (BE (ret != REG_NOERROR, 0))
807 return ret;
809 else
810 build_wcs_buffer (pstr);
812 else
813 #endif /* RE_ENABLE_I18N */
814 if (BE (pstr->mbs_allocated, 0))
816 if (pstr->icase)
817 build_upper_buffer (pstr);
818 else if (pstr->trans != NULL)
819 re_string_translate_buffer (pstr);
821 else
822 pstr->valid_len = pstr->len;
824 pstr->cur_idx = 0;
825 return REG_NOERROR;
828 static unsigned char
829 __attribute ((pure))
830 re_string_peek_byte_case (const re_string_t *pstr, int idx)
832 int ch, off;
834 /* Handle the common (easiest) cases first. */
835 if (BE (!pstr->mbs_allocated, 1))
836 return re_string_peek_byte (pstr, idx);
838 #ifdef RE_ENABLE_I18N
839 if (pstr->mb_cur_max > 1
840 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
841 return re_string_peek_byte (pstr, idx);
842 #endif
844 off = pstr->cur_idx + idx;
845 #ifdef RE_ENABLE_I18N
846 if (pstr->offsets_needed)
847 off = pstr->offsets[off];
848 #endif
850 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
852 #ifdef RE_ENABLE_I18N
853 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
854 this function returns CAPITAL LETTER I instead of first byte of
855 DOTLESS SMALL LETTER I. The latter would confuse the parser,
856 since peek_byte_case doesn't advance cur_idx in any way. */
857 if (pstr->offsets_needed && !isascii (ch))
858 return re_string_peek_byte (pstr, idx);
859 #endif
861 return ch;
864 static unsigned char
865 re_string_fetch_byte_case (re_string_t *pstr)
867 if (BE (!pstr->mbs_allocated, 1))
868 return re_string_fetch_byte (pstr);
870 #ifdef RE_ENABLE_I18N
871 if (pstr->offsets_needed)
873 int off, ch;
875 /* For tr_TR.UTF-8 [[:islower:]] there is
876 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
877 in that case the whole multi-byte character and return
878 the original letter. On the other side, with
879 [[: DOTLESS SMALL LETTER I return [[:I, as doing
880 anything else would complicate things too much. */
882 if (!re_string_first_byte (pstr, pstr->cur_idx))
883 return re_string_fetch_byte (pstr);
885 off = pstr->offsets[pstr->cur_idx];
886 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
888 if (! isascii (ch))
889 return re_string_fetch_byte (pstr);
891 re_string_skip_bytes (pstr,
892 re_string_char_size_at (pstr, pstr->cur_idx));
893 return ch;
895 #endif
897 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
900 static void
901 re_string_destruct (re_string_t *pstr)
903 #ifdef RE_ENABLE_I18N
904 re_free (pstr->wcs);
905 re_free (pstr->offsets);
906 #endif /* RE_ENABLE_I18N */
907 if (pstr->mbs_allocated)
908 re_free (pstr->mbs);
911 /* Return the context at IDX in INPUT. */
913 static unsigned int
914 re_string_context_at (const re_string_t *input, int idx, int eflags)
916 int c;
917 if (BE (idx < 0, 0))
918 /* In this case, we use the value stored in input->tip_context,
919 since we can't know the character in input->mbs[-1] here. */
920 return input->tip_context;
921 if (BE (idx == input->len, 0))
922 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
923 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
924 #ifdef RE_ENABLE_I18N
925 if (input->mb_cur_max > 1)
927 wint_t wc;
928 int wc_idx = idx;
929 while(input->wcs[wc_idx] == WEOF)
931 #if defined DEBUG && DEBUG
932 /* It must not happen. */
933 assert (wc_idx >= 0);
934 #endif
935 --wc_idx;
936 if (wc_idx < 0)
937 return input->tip_context;
939 wc = input->wcs[wc_idx];
940 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
941 return CONTEXT_WORD;
942 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
943 ? CONTEXT_NEWLINE : 0);
945 else
946 #endif
948 c = re_string_byte_at (input, idx);
949 if (bitset_contain (input->word_char, c))
950 return CONTEXT_WORD;
951 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
955 /* Functions for set operation. */
957 static reg_errcode_t
958 __attribute_warn_unused_result__
959 re_node_set_alloc (re_node_set *set, int size)
961 set->alloc = size;
962 set->nelem = 0;
963 set->elems = re_malloc (int, size);
964 if (BE (set->elems == NULL, 0))
965 return REG_ESPACE;
966 return REG_NOERROR;
969 static reg_errcode_t
970 __attribute_warn_unused_result__
971 re_node_set_init_1 (re_node_set *set, int elem)
973 set->alloc = 1;
974 set->nelem = 1;
975 set->elems = re_malloc (int, 1);
976 if (BE (set->elems == NULL, 0))
978 set->alloc = set->nelem = 0;
979 return REG_ESPACE;
981 set->elems[0] = elem;
982 return REG_NOERROR;
985 static reg_errcode_t
986 __attribute_warn_unused_result__
987 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
989 set->alloc = 2;
990 set->elems = re_malloc (int, 2);
991 if (BE (set->elems == NULL, 0))
992 return REG_ESPACE;
993 if (elem1 == elem2)
995 set->nelem = 1;
996 set->elems[0] = elem1;
998 else
1000 set->nelem = 2;
1001 if (elem1 < elem2)
1003 set->elems[0] = elem1;
1004 set->elems[1] = elem2;
1006 else
1008 set->elems[0] = elem2;
1009 set->elems[1] = elem1;
1012 return REG_NOERROR;
1015 static reg_errcode_t
1016 __attribute_warn_unused_result__
1017 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1019 dest->nelem = src->nelem;
1020 if (src->nelem > 0)
1022 dest->alloc = dest->nelem;
1023 dest->elems = re_malloc (int, dest->alloc);
1024 if (BE (dest->elems == NULL, 0))
1026 dest->alloc = dest->nelem = 0;
1027 return REG_ESPACE;
1029 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1031 else
1032 re_node_set_init_empty (dest);
1033 return REG_NOERROR;
1036 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1037 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1038 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1040 static reg_errcode_t
1041 __attribute_warn_unused_result__
1042 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1043 const re_node_set *src2)
1045 int i1, i2, is, id, delta, sbase;
1046 if (src1->nelem == 0 || src2->nelem == 0)
1047 return REG_NOERROR;
1049 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1050 conservative estimate. */
1051 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1053 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1054 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1055 if (BE (new_elems == NULL, 0))
1056 return REG_ESPACE;
1057 dest->elems = new_elems;
1058 dest->alloc = new_alloc;
1061 /* Find the items in the intersection of SRC1 and SRC2, and copy
1062 into the top of DEST those that are not already in DEST itself. */
1063 sbase = dest->nelem + src1->nelem + src2->nelem;
1064 i1 = src1->nelem - 1;
1065 i2 = src2->nelem - 1;
1066 id = dest->nelem - 1;
1067 for (;;)
1069 if (src1->elems[i1] == src2->elems[i2])
1071 /* Try to find the item in DEST. Maybe we could binary search? */
1072 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1073 --id;
1075 if (id < 0 || dest->elems[id] != src1->elems[i1])
1076 dest->elems[--sbase] = src1->elems[i1];
1078 if (--i1 < 0 || --i2 < 0)
1079 break;
1082 /* Lower the highest of the two items. */
1083 else if (src1->elems[i1] < src2->elems[i2])
1085 if (--i2 < 0)
1086 break;
1088 else
1090 if (--i1 < 0)
1091 break;
1095 id = dest->nelem - 1;
1096 is = dest->nelem + src1->nelem + src2->nelem - 1;
1097 delta = is - sbase + 1;
1099 /* Now copy. When DELTA becomes zero, the remaining
1100 DEST elements are already in place; this is more or
1101 less the same loop that is in re_node_set_merge. */
1102 dest->nelem += delta;
1103 if (delta > 0 && id >= 0)
1104 for (;;)
1106 if (dest->elems[is] > dest->elems[id])
1108 /* Copy from the top. */
1109 dest->elems[id + delta--] = dest->elems[is--];
1110 if (delta == 0)
1111 break;
1113 else
1115 /* Slide from the bottom. */
1116 dest->elems[id + delta] = dest->elems[id];
1117 if (--id < 0)
1118 break;
1122 /* Copy remaining SRC elements. */
1123 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1125 return REG_NOERROR;
1128 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1129 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1131 static reg_errcode_t
1132 __attribute_warn_unused_result__
1133 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1134 const re_node_set *src2)
1136 int i1, i2, id;
1137 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1139 dest->alloc = src1->nelem + src2->nelem;
1140 dest->elems = re_malloc (int, dest->alloc);
1141 if (BE (dest->elems == NULL, 0))
1142 return REG_ESPACE;
1144 else
1146 if (src1 != NULL && src1->nelem > 0)
1147 return re_node_set_init_copy (dest, src1);
1148 else if (src2 != NULL && src2->nelem > 0)
1149 return re_node_set_init_copy (dest, src2);
1150 else
1151 re_node_set_init_empty (dest);
1152 return REG_NOERROR;
1154 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1156 if (src1->elems[i1] > src2->elems[i2])
1158 dest->elems[id++] = src2->elems[i2++];
1159 continue;
1161 if (src1->elems[i1] == src2->elems[i2])
1162 ++i2;
1163 dest->elems[id++] = src1->elems[i1++];
1165 if (i1 < src1->nelem)
1167 memcpy (dest->elems + id, src1->elems + i1,
1168 (src1->nelem - i1) * sizeof (int));
1169 id += src1->nelem - i1;
1171 else if (i2 < src2->nelem)
1173 memcpy (dest->elems + id, src2->elems + i2,
1174 (src2->nelem - i2) * sizeof (int));
1175 id += src2->nelem - i2;
1177 dest->nelem = id;
1178 return REG_NOERROR;
1181 /* Calculate the union set of the sets DEST and SRC. And store it to
1182 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1184 static reg_errcode_t
1185 __attribute_warn_unused_result__
1186 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1188 int is, id, sbase, delta;
1189 if (src == NULL || src->nelem == 0)
1190 return REG_NOERROR;
1191 if (dest->alloc < 2 * src->nelem + dest->nelem)
1193 int new_alloc = 2 * (src->nelem + dest->alloc);
1194 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1195 if (BE (new_buffer == NULL, 0))
1196 return REG_ESPACE;
1197 dest->elems = new_buffer;
1198 dest->alloc = new_alloc;
1201 if (BE (dest->nelem == 0, 0))
1203 dest->nelem = src->nelem;
1204 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1205 return REG_NOERROR;
1208 /* Copy into the top of DEST the items of SRC that are not
1209 found in DEST. Maybe we could binary search in DEST? */
1210 for (sbase = dest->nelem + 2 * src->nelem,
1211 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1213 if (dest->elems[id] == src->elems[is])
1214 is--, id--;
1215 else if (dest->elems[id] < src->elems[is])
1216 dest->elems[--sbase] = src->elems[is--];
1217 else /* if (dest->elems[id] > src->elems[is]) */
1218 --id;
1221 if (is >= 0)
1223 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1224 sbase -= is + 1;
1225 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1228 id = dest->nelem - 1;
1229 is = dest->nelem + 2 * src->nelem - 1;
1230 delta = is - sbase + 1;
1231 if (delta == 0)
1232 return REG_NOERROR;
1234 /* Now copy. When DELTA becomes zero, the remaining
1235 DEST elements are already in place. */
1236 dest->nelem += delta;
1237 for (;;)
1239 if (dest->elems[is] > dest->elems[id])
1241 /* Copy from the top. */
1242 dest->elems[id + delta--] = dest->elems[is--];
1243 if (delta == 0)
1244 break;
1246 else
1248 /* Slide from the bottom. */
1249 dest->elems[id + delta] = dest->elems[id];
1250 if (--id < 0)
1252 /* Copy remaining SRC elements. */
1253 memcpy (dest->elems, dest->elems + sbase,
1254 delta * sizeof (int));
1255 break;
1260 return REG_NOERROR;
1263 /* Insert the new element ELEM to the re_node_set* SET.
1264 SET should not already have ELEM.
1265 return -1 if an error is occured, return 1 otherwise. */
1267 static int
1268 __attribute_warn_unused_result__
1269 re_node_set_insert (re_node_set *set, int elem)
1271 int idx;
1272 /* In case the set is empty. */
1273 if (set->alloc == 0)
1275 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1276 return 1;
1277 else
1278 return -1;
1281 if (BE (set->nelem, 0) == 0)
1283 /* We already guaranteed above that set->alloc != 0. */
1284 set->elems[0] = elem;
1285 ++set->nelem;
1286 return 1;
1289 /* Realloc if we need. */
1290 if (set->alloc == set->nelem)
1292 int *new_elems;
1293 set->alloc = set->alloc * 2;
1294 new_elems = re_realloc (set->elems, int, set->alloc);
1295 if (BE (new_elems == NULL, 0))
1296 return -1;
1297 set->elems = new_elems;
1300 /* Move the elements which follows the new element. Test the
1301 first element separately to skip a check in the inner loop. */
1302 if (elem < set->elems[0])
1304 idx = 0;
1305 for (idx = set->nelem; idx > 0; idx--)
1306 set->elems[idx] = set->elems[idx - 1];
1308 else
1310 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1311 set->elems[idx] = set->elems[idx - 1];
1314 /* Insert the new element. */
1315 set->elems[idx] = elem;
1316 ++set->nelem;
1317 return 1;
1320 /* Insert the new element ELEM to the re_node_set* SET.
1321 SET should not already have any element greater than or equal to ELEM.
1322 Return -1 if an error is occured, return 1 otherwise. */
1324 static int
1325 __attribute_warn_unused_result__
1326 re_node_set_insert_last (re_node_set *set, int elem)
1328 /* Realloc if we need. */
1329 if (set->alloc == set->nelem)
1331 int *new_elems;
1332 set->alloc = (set->alloc + 1) * 2;
1333 new_elems = re_realloc (set->elems, int, set->alloc);
1334 if (BE (new_elems == NULL, 0))
1335 return -1;
1336 set->elems = new_elems;
1339 /* Insert the new element. */
1340 set->elems[set->nelem++] = elem;
1341 return 1;
1344 /* Compare two node sets SET1 and SET2.
1345 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1347 static int
1348 __attribute ((pure))
1349 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1351 int i;
1352 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1353 return 0;
1354 for (i = set1->nelem ; --i >= 0 ; )
1355 if (set1->elems[i] != set2->elems[i])
1356 return 0;
1357 return 1;
1360 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1362 static int
1363 __attribute ((pure))
1364 re_node_set_contains (const re_node_set *set, int elem)
1366 unsigned int idx, right, mid;
1367 if (set->nelem <= 0)
1368 return 0;
1370 /* Binary search the element. */
1371 idx = 0;
1372 right = set->nelem - 1;
1373 while (idx < right)
1375 mid = (idx + right) / 2;
1376 if (set->elems[mid] < elem)
1377 idx = mid + 1;
1378 else
1379 right = mid;
1381 return set->elems[idx] == elem ? idx + 1 : 0;
1384 static void
1385 re_node_set_remove_at (re_node_set *set, int idx)
1387 if (idx < 0 || idx >= set->nelem)
1388 return;
1389 --set->nelem;
1390 for (; idx < set->nelem; idx++)
1391 set->elems[idx] = set->elems[idx + 1];
1395 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1396 Or return -1, if an error will be occured. */
1398 static int
1399 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1401 int type = token.type;
1402 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1404 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1405 int *new_nexts, *new_indices;
1406 re_node_set *new_edests, *new_eclosures;
1407 re_token_t *new_nodes;
1409 /* Avoid overflows in realloc. */
1410 const size_t max_object_size = MAX (sizeof (re_token_t),
1411 MAX (sizeof (re_node_set),
1412 sizeof (int)));
1413 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1414 return -1;
1416 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1417 if (BE (new_nodes == NULL, 0))
1418 return -1;
1419 dfa->nodes = new_nodes;
1420 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1421 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1422 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1423 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1424 if (BE (new_nexts == NULL || new_indices == NULL
1425 || new_edests == NULL || new_eclosures == NULL, 0))
1426 return -1;
1427 dfa->nexts = new_nexts;
1428 dfa->org_indices = new_indices;
1429 dfa->edests = new_edests;
1430 dfa->eclosures = new_eclosures;
1431 dfa->nodes_alloc = new_nodes_alloc;
1433 dfa->nodes[dfa->nodes_len] = token;
1434 dfa->nodes[dfa->nodes_len].constraint = 0;
1435 #ifdef RE_ENABLE_I18N
1436 dfa->nodes[dfa->nodes_len].accept_mb =
1437 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1438 #endif
1439 dfa->nexts[dfa->nodes_len] = -1;
1440 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1441 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1442 return dfa->nodes_len++;
1445 static inline unsigned int
1446 calc_state_hash (const re_node_set *nodes, unsigned int context)
1448 unsigned int hash = nodes->nelem + context;
1449 int i;
1450 for (i = 0 ; i < nodes->nelem ; i++)
1451 hash += nodes->elems[i];
1452 return hash;
1455 /* Search for the state whose node_set is equivalent to NODES.
1456 Return the pointer to the state, if we found it in the DFA.
1457 Otherwise create the new one and return it. In case of an error
1458 return NULL and set the error code in ERR.
1459 Note: - We assume NULL as the invalid state, then it is possible that
1460 return value is NULL and ERR is REG_NOERROR.
1461 - We never return non-NULL value in case of any errors, it is for
1462 optimization. */
1464 static re_dfastate_t *
1465 __attribute_warn_unused_result__
1466 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1467 const re_node_set *nodes)
1469 unsigned int hash;
1470 re_dfastate_t *new_state;
1471 struct re_state_table_entry *spot;
1472 int i;
1473 if (BE (nodes->nelem == 0, 0))
1475 *err = REG_NOERROR;
1476 return NULL;
1478 hash = calc_state_hash (nodes, 0);
1479 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1481 for (i = 0 ; i < spot->num ; i++)
1483 re_dfastate_t *state = spot->array[i];
1484 if (hash != state->hash)
1485 continue;
1486 if (re_node_set_compare (&state->nodes, nodes))
1487 return state;
1490 /* There are no appropriate state in the dfa, create the new one. */
1491 new_state = create_ci_newstate (dfa, nodes, hash);
1492 if (BE (new_state == NULL, 0))
1493 *err = REG_ESPACE;
1495 return new_state;
1498 /* Search for the state whose node_set is equivalent to NODES and
1499 whose context is equivalent to CONTEXT.
1500 Return the pointer to the state, if we found it in the DFA.
1501 Otherwise create the new one and return it. In case of an error
1502 return NULL and set the error code in ERR.
1503 Note: - We assume NULL as the invalid state, then it is possible that
1504 return value is NULL and ERR is REG_NOERROR.
1505 - We never return non-NULL value in case of any errors, it is for
1506 optimization. */
1508 static re_dfastate_t *
1509 __attribute_warn_unused_result__
1510 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1511 const re_node_set *nodes, unsigned int context)
1513 unsigned int hash;
1514 re_dfastate_t *new_state;
1515 struct re_state_table_entry *spot;
1516 int i;
1517 if (nodes->nelem == 0)
1519 *err = REG_NOERROR;
1520 return NULL;
1522 hash = calc_state_hash (nodes, context);
1523 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1525 for (i = 0 ; i < spot->num ; i++)
1527 re_dfastate_t *state = spot->array[i];
1528 if (state->hash == hash
1529 && state->context == context
1530 && re_node_set_compare (state->entrance_nodes, nodes))
1531 return state;
1533 /* There are no appropriate state in `dfa', create the new one. */
1534 new_state = create_cd_newstate (dfa, nodes, context, hash);
1535 if (BE (new_state == NULL, 0))
1536 *err = REG_ESPACE;
1538 return new_state;
1541 /* Finish initialization of the new state NEWSTATE, and using its hash value
1542 HASH put in the appropriate bucket of DFA's state table. Return value
1543 indicates the error code if failed. */
1545 static reg_errcode_t
1546 __attribute_warn_unused_result__
1547 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1548 unsigned int hash)
1550 struct re_state_table_entry *spot;
1551 reg_errcode_t err;
1552 int i;
1554 newstate->hash = hash;
1555 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1556 if (BE (err != REG_NOERROR, 0))
1557 return REG_ESPACE;
1558 for (i = 0; i < newstate->nodes.nelem; i++)
1560 int elem = newstate->nodes.elems[i];
1561 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1562 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1563 return REG_ESPACE;
1566 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1567 if (BE (spot->alloc <= spot->num, 0))
1569 int new_alloc = 2 * spot->num + 2;
1570 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1571 new_alloc);
1572 if (BE (new_array == NULL, 0))
1573 return REG_ESPACE;
1574 spot->array = new_array;
1575 spot->alloc = new_alloc;
1577 spot->array[spot->num++] = newstate;
1578 return REG_NOERROR;
1581 static void
1582 free_state (re_dfastate_t *state)
1584 re_node_set_free (&state->non_eps_nodes);
1585 re_node_set_free (&state->inveclosure);
1586 if (state->entrance_nodes != &state->nodes)
1588 re_node_set_free (state->entrance_nodes);
1589 re_free (state->entrance_nodes);
1591 re_node_set_free (&state->nodes);
1592 re_free (state->word_trtable);
1593 re_free (state->trtable);
1594 re_free (state);
1597 /* Create the new state which is independ of contexts.
1598 Return the new state if succeeded, otherwise return NULL. */
1600 static re_dfastate_t *
1601 __attribute_warn_unused_result__
1602 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1603 unsigned int hash)
1605 int i;
1606 reg_errcode_t err;
1607 re_dfastate_t *newstate;
1609 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1610 if (BE (newstate == NULL, 0))
1611 return NULL;
1612 err = re_node_set_init_copy (&newstate->nodes, nodes);
1613 if (BE (err != REG_NOERROR, 0))
1615 re_free (newstate);
1616 return NULL;
1619 newstate->entrance_nodes = &newstate->nodes;
1620 for (i = 0 ; i < nodes->nelem ; i++)
1622 re_token_t *node = dfa->nodes + nodes->elems[i];
1623 re_token_type_t type = node->type;
1624 if (type == CHARACTER && !node->constraint)
1625 continue;
1626 #ifdef RE_ENABLE_I18N
1627 newstate->accept_mb |= node->accept_mb;
1628 #endif /* RE_ENABLE_I18N */
1630 /* If the state has the halt node, the state is a halt state. */
1631 if (type == END_OF_RE)
1632 newstate->halt = 1;
1633 else if (type == OP_BACK_REF)
1634 newstate->has_backref = 1;
1635 else if (type == ANCHOR || node->constraint)
1636 newstate->has_constraint = 1;
1638 err = register_state (dfa, newstate, hash);
1639 if (BE (err != REG_NOERROR, 0))
1641 free_state (newstate);
1642 newstate = NULL;
1644 return newstate;
1647 /* Create the new state which is depend on the context CONTEXT.
1648 Return the new state if succeeded, otherwise return NULL. */
1650 static re_dfastate_t *
1651 __attribute_warn_unused_result__
1652 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1653 unsigned int context, unsigned int hash)
1655 int i, nctx_nodes = 0;
1656 reg_errcode_t err;
1657 re_dfastate_t *newstate;
1659 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1660 if (BE (newstate == NULL, 0))
1661 return NULL;
1662 err = re_node_set_init_copy (&newstate->nodes, nodes);
1663 if (BE (err != REG_NOERROR, 0))
1665 re_free (newstate);
1666 return NULL;
1669 newstate->context = context;
1670 newstate->entrance_nodes = &newstate->nodes;
1672 for (i = 0 ; i < nodes->nelem ; i++)
1674 re_token_t *node = dfa->nodes + nodes->elems[i];
1675 re_token_type_t type = node->type;
1676 unsigned int constraint = node->constraint;
1678 if (type == CHARACTER && !constraint)
1679 continue;
1680 #ifdef RE_ENABLE_I18N
1681 newstate->accept_mb |= node->accept_mb;
1682 #endif /* RE_ENABLE_I18N */
1684 /* If the state has the halt node, the state is a halt state. */
1685 if (type == END_OF_RE)
1686 newstate->halt = 1;
1687 else if (type == OP_BACK_REF)
1688 newstate->has_backref = 1;
1690 if (constraint)
1692 if (newstate->entrance_nodes == &newstate->nodes)
1694 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1695 if (BE (newstate->entrance_nodes == NULL, 0))
1697 free_state (newstate);
1698 return NULL;
1700 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1701 != REG_NOERROR)
1702 return NULL;
1703 nctx_nodes = 0;
1704 newstate->has_constraint = 1;
1707 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1709 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1710 ++nctx_nodes;
1714 err = register_state (dfa, newstate, hash);
1715 if (BE (err != REG_NOERROR, 0))
1717 free_state (newstate);
1718 newstate = NULL;
1720 return newstate;