32bit memcmp/strcmp/strncmp optimized for SSSE3/SSS4.2
[glibc.git] / posix / regex_internal.c
blob8183a29bf694bacc1c63865a285d178ee2c1e4e9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43 reg_errcode_t ret;
44 int init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
88 while (1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
106 else
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
124 return REG_NOERROR;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs;
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
149 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
169 static void
170 internal_function
171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, int icase,
173 const re_dfa_t *dfa)
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase ? 1 : 0;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
205 #ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209 unsigned char buf[64];
210 #endif
211 mbstate_t prev_st;
212 int byte_idx, end_idx, remain_len;
213 size_t mbclen;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220 wchar_t wc;
221 const char *p;
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
228 int i, ch;
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 p = (const char *) buf;
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -2, 0))
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr->cur_state = prev_st;
244 break;
246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
248 /* We treat these cases as a singlebyte character. */
249 mbclen = 1;
250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251 if (BE (pstr->trans != NULL, 0))
252 wc = pstr->trans[wc];
253 pstr->cur_state = prev_st;
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
271 build_wcs_upper_buffer (re_string_t *pstr)
273 mbstate_t prev_st;
274 int src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276 #ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280 char buf[64];
281 #endif
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290 while (byte_idx < end_idx)
292 wchar_t wc;
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = __mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen + 2 > 2, 1))
314 wchar_t wcu = wc;
315 if (iswlower (wc))
317 size_t mbcdlen;
319 wcu = towupper (wc);
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 else
325 src_idx = byte_idx;
326 goto offsets_needed;
329 else
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
337 else if (mbclen == (size_t) -1 || mbclen == 0)
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
347 else
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
351 break;
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
356 return REG_NOERROR;
358 else
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361 wchar_t wc;
362 const char *p;
363 offsets_needed:
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
368 int i, ch;
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
375 p = (const char *) buf;
377 else
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen + 2 > 2, 1))
382 wchar_t wcu = wc;
383 if (iswlower (wc))
385 size_t mbcdlen;
387 wcu = towupper (wc);
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
393 size_t i;
395 if (byte_idx + mbcdlen > pstr->bufs_len)
397 pstr->cur_state = prev_st;
398 break;
401 if (pstr->offsets == NULL)
403 pstr->offsets = re_malloc (int, pstr->bufs_len);
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
408 if (!pstr->offsets_needed)
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 if (BE (pstr->offsets_needed != 0, 0))
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
445 src_idx += mbclen;
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
452 else if (mbclen == (size_t) -1 || mbclen == 0)
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 if (BE (pstr->trans != NULL, 0))
458 ch = pstr->trans [ch];
459 pstr->mbs[byte_idx] = ch;
461 if (BE (pstr->offsets_needed != 0, 0))
462 pstr->offsets[byte_idx] = src_idx;
463 ++src_idx;
465 /* And also cast it to wide char. */
466 pstr->wcs[byte_idx++] = (wchar_t) ch;
467 if (BE (mbclen == (size_t) -1, 0))
468 pstr->cur_state = prev_st;
470 else
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr->cur_state = prev_st;
474 break;
477 pstr->valid_len = byte_idx;
478 pstr->valid_raw_len = src_idx;
479 return REG_NOERROR;
482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
483 Return the index. */
485 static int
486 internal_function
487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
489 mbstate_t prev_st;
490 int rawbuf_idx;
491 size_t mbclen;
492 wint_t wc = WEOF;
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
498 wchar_t wc2;
499 int remain_len = pstr->len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE ((ssize_t) mbclen <= 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen == 0 || remain_len == 0)
507 wc = L'\0';
508 else
509 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
510 mbclen = 1;
511 pstr->cur_state = prev_st;
513 else
514 wc = (wint_t) wc2;
515 /* Then proceed the next character. */
516 rawbuf_idx += mbclen;
518 *last_wc = wc;
519 return rawbuf_idx;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
526 static void
527 internal_function
528 build_upper_buffer (re_string_t *pstr)
530 int char_idx, end_idx;
531 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
536 if (BE (pstr->trans != NULL, 0))
537 ch = pstr->trans[ch];
538 if (islower (ch))
539 pstr->mbs[char_idx] = toupper (ch);
540 else
541 pstr->mbs[char_idx] = ch;
543 pstr->valid_len = char_idx;
544 pstr->valid_raw_len = char_idx;
547 /* Apply TRANS to the buffer in PSTR. */
549 static void
550 internal_function
551 re_string_translate_buffer (re_string_t *pstr)
553 int buf_idx, end_idx;
554 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
559 pstr->mbs[buf_idx] = pstr->trans[ch];
562 pstr->valid_len = buf_idx;
563 pstr->valid_raw_len = buf_idx;
566 /* This function re-construct the buffers.
567 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
568 convert to upper case in case of REG_ICASE, apply translation. */
570 static reg_errcode_t
571 internal_function __attribute_warn_unused_result__
572 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
574 int offset = idx - pstr->raw_mbs_idx;
575 if (BE (offset < 0, 0))
577 /* Reset buffer. */
578 #ifdef RE_ENABLE_I18N
579 if (pstr->mb_cur_max > 1)
580 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
581 #endif /* RE_ENABLE_I18N */
582 pstr->len = pstr->raw_len;
583 pstr->stop = pstr->raw_stop;
584 pstr->valid_len = 0;
585 pstr->raw_mbs_idx = 0;
586 pstr->valid_raw_len = 0;
587 pstr->offsets_needed = 0;
588 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
589 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
590 if (!pstr->mbs_allocated)
591 pstr->mbs = (unsigned char *) pstr->raw_mbs;
592 offset = idx;
595 if (BE (offset != 0, 1))
597 /* Should the already checked characters be kept? */
598 if (BE (offset < pstr->valid_raw_len, 1))
600 /* Yes, move them to the front of the buffer. */
601 #ifdef RE_ENABLE_I18N
602 if (BE (pstr->offsets_needed, 0))
604 int low = 0, high = pstr->valid_len, mid;
607 mid = (high + low) / 2;
608 if (pstr->offsets[mid] > offset)
609 high = mid;
610 else if (pstr->offsets[mid] < offset)
611 low = mid + 1;
612 else
613 break;
615 while (low < high);
616 if (pstr->offsets[mid] < offset)
617 ++mid;
618 pstr->tip_context = re_string_context_at (pstr, mid - 1,
619 eflags);
620 /* This can be quite complicated, so handle specially
621 only the common and easy case where the character with
622 different length representation of lower and upper
623 case is present at or after offset. */
624 if (pstr->valid_len > offset
625 && mid == offset && pstr->offsets[mid] == offset)
627 memmove (pstr->wcs, pstr->wcs + offset,
628 (pstr->valid_len - offset) * sizeof (wint_t));
629 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
630 pstr->valid_len -= offset;
631 pstr->valid_raw_len -= offset;
632 for (low = 0; low < pstr->valid_len; low++)
633 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
635 else
637 /* Otherwise, just find out how long the partial multibyte
638 character at offset is and fill it with WEOF/255. */
639 pstr->len = pstr->raw_len - idx + offset;
640 pstr->stop = pstr->raw_stop - idx + offset;
641 pstr->offsets_needed = 0;
642 while (mid > 0 && pstr->offsets[mid - 1] == offset)
643 --mid;
644 while (mid < pstr->valid_len)
645 if (pstr->wcs[mid] != WEOF)
646 break;
647 else
648 ++mid;
649 if (mid == pstr->valid_len)
650 pstr->valid_len = 0;
651 else
653 pstr->valid_len = pstr->offsets[mid] - offset;
654 if (pstr->valid_len)
656 for (low = 0; low < pstr->valid_len; ++low)
657 pstr->wcs[low] = WEOF;
658 memset (pstr->mbs, 255, pstr->valid_len);
661 pstr->valid_raw_len = pstr->valid_len;
664 else
665 #endif
667 pstr->tip_context = re_string_context_at (pstr, offset - 1,
668 eflags);
669 #ifdef RE_ENABLE_I18N
670 if (pstr->mb_cur_max > 1)
671 memmove (pstr->wcs, pstr->wcs + offset,
672 (pstr->valid_len - offset) * sizeof (wint_t));
673 #endif /* RE_ENABLE_I18N */
674 if (BE (pstr->mbs_allocated, 0))
675 memmove (pstr->mbs, pstr->mbs + offset,
676 pstr->valid_len - offset);
677 pstr->valid_len -= offset;
678 pstr->valid_raw_len -= offset;
679 #if DEBUG
680 assert (pstr->valid_len > 0);
681 #endif
684 else
686 /* No, skip all characters until IDX. */
687 int prev_valid_len = pstr->valid_len;
689 #ifdef RE_ENABLE_I18N
690 if (BE (pstr->offsets_needed, 0))
692 pstr->len = pstr->raw_len - idx + offset;
693 pstr->stop = pstr->raw_stop - idx + offset;
694 pstr->offsets_needed = 0;
696 #endif
697 pstr->valid_len = 0;
698 #ifdef RE_ENABLE_I18N
699 if (pstr->mb_cur_max > 1)
701 int wcs_idx;
702 wint_t wc = WEOF;
704 if (pstr->is_utf8)
706 const unsigned char *raw, *p, *end;
708 /* Special case UTF-8. Multi-byte chars start with any
709 byte other than 0x80 - 0xbf. */
710 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
711 end = raw + (offset - pstr->mb_cur_max);
712 if (end < pstr->raw_mbs)
713 end = pstr->raw_mbs;
714 p = raw + offset - 1;
715 #ifdef _LIBC
716 /* We know the wchar_t encoding is UCS4, so for the simple
717 case, ASCII characters, skip the conversion step. */
718 if (isascii (*p) && BE (pstr->trans == NULL, 1))
720 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
721 /* pstr->valid_len = 0; */
722 wc = (wchar_t) *p;
724 else
725 #endif
726 for (; p >= end; --p)
727 if ((*p & 0xc0) != 0x80)
729 mbstate_t cur_state;
730 wchar_t wc2;
731 int mlen = raw + pstr->len - p;
732 unsigned char buf[6];
733 size_t mbclen;
735 if (BE (pstr->trans != NULL, 0))
737 int i = mlen < 6 ? mlen : 6;
738 while (--i >= 0)
739 buf[i] = pstr->trans[p[i]];
741 /* XXX Don't use mbrtowc, we know which conversion
742 to use (UTF-8 -> UCS4). */
743 memset (&cur_state, 0, sizeof (cur_state));
744 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
745 &cur_state);
746 if (raw + offset - p <= mbclen
747 && mbclen < (size_t) -2)
749 memset (&pstr->cur_state, '\0',
750 sizeof (mbstate_t));
751 pstr->valid_len = mbclen - (raw + offset - p);
752 wc = wc2;
754 break;
758 if (wc == WEOF)
759 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
760 if (wc == WEOF)
761 pstr->tip_context
762 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
763 else
764 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
765 && IS_WIDE_WORD_CHAR (wc))
766 ? CONTEXT_WORD
767 : ((IS_WIDE_NEWLINE (wc)
768 && pstr->newline_anchor)
769 ? CONTEXT_NEWLINE : 0));
770 if (BE (pstr->valid_len, 0))
772 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
773 pstr->wcs[wcs_idx] = WEOF;
774 if (pstr->mbs_allocated)
775 memset (pstr->mbs, 255, pstr->valid_len);
777 pstr->valid_raw_len = pstr->valid_len;
779 else
780 #endif /* RE_ENABLE_I18N */
782 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
783 pstr->valid_raw_len = 0;
784 if (pstr->trans)
785 c = pstr->trans[c];
786 pstr->tip_context = (bitset_contain (pstr->word_char, c)
787 ? CONTEXT_WORD
788 : ((IS_NEWLINE (c) && pstr->newline_anchor)
789 ? CONTEXT_NEWLINE : 0));
792 if (!BE (pstr->mbs_allocated, 0))
793 pstr->mbs += offset;
795 pstr->raw_mbs_idx = idx;
796 pstr->len -= offset;
797 pstr->stop -= offset;
799 /* Then build the buffers. */
800 #ifdef RE_ENABLE_I18N
801 if (pstr->mb_cur_max > 1)
803 if (pstr->icase)
805 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
806 if (BE (ret != REG_NOERROR, 0))
807 return ret;
809 else
810 build_wcs_buffer (pstr);
812 else
813 #endif /* RE_ENABLE_I18N */
814 if (BE (pstr->mbs_allocated, 0))
816 if (pstr->icase)
817 build_upper_buffer (pstr);
818 else if (pstr->trans != NULL)
819 re_string_translate_buffer (pstr);
821 else
822 pstr->valid_len = pstr->len;
824 pstr->cur_idx = 0;
825 return REG_NOERROR;
828 static unsigned char
829 internal_function __attribute ((pure))
830 re_string_peek_byte_case (const re_string_t *pstr, int idx)
832 int ch, off;
834 /* Handle the common (easiest) cases first. */
835 if (BE (!pstr->mbs_allocated, 1))
836 return re_string_peek_byte (pstr, idx);
838 #ifdef RE_ENABLE_I18N
839 if (pstr->mb_cur_max > 1
840 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
841 return re_string_peek_byte (pstr, idx);
842 #endif
844 off = pstr->cur_idx + idx;
845 #ifdef RE_ENABLE_I18N
846 if (pstr->offsets_needed)
847 off = pstr->offsets[off];
848 #endif
850 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
852 #ifdef RE_ENABLE_I18N
853 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
854 this function returns CAPITAL LETTER I instead of first byte of
855 DOTLESS SMALL LETTER I. The latter would confuse the parser,
856 since peek_byte_case doesn't advance cur_idx in any way. */
857 if (pstr->offsets_needed && !isascii (ch))
858 return re_string_peek_byte (pstr, idx);
859 #endif
861 return ch;
864 static unsigned char
865 internal_function __attribute ((pure))
866 re_string_fetch_byte_case (re_string_t *pstr)
868 if (BE (!pstr->mbs_allocated, 1))
869 return re_string_fetch_byte (pstr);
871 #ifdef RE_ENABLE_I18N
872 if (pstr->offsets_needed)
874 int off, ch;
876 /* For tr_TR.UTF-8 [[:islower:]] there is
877 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
878 in that case the whole multi-byte character and return
879 the original letter. On the other side, with
880 [[: DOTLESS SMALL LETTER I return [[:I, as doing
881 anything else would complicate things too much. */
883 if (!re_string_first_byte (pstr, pstr->cur_idx))
884 return re_string_fetch_byte (pstr);
886 off = pstr->offsets[pstr->cur_idx];
887 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
889 if (! isascii (ch))
890 return re_string_fetch_byte (pstr);
892 re_string_skip_bytes (pstr,
893 re_string_char_size_at (pstr, pstr->cur_idx));
894 return ch;
896 #endif
898 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
901 static void
902 internal_function
903 re_string_destruct (re_string_t *pstr)
905 #ifdef RE_ENABLE_I18N
906 re_free (pstr->wcs);
907 re_free (pstr->offsets);
908 #endif /* RE_ENABLE_I18N */
909 if (pstr->mbs_allocated)
910 re_free (pstr->mbs);
913 /* Return the context at IDX in INPUT. */
915 static unsigned int
916 internal_function
917 re_string_context_at (const re_string_t *input, int idx, int eflags)
919 int c;
920 if (BE (idx < 0, 0))
921 /* In this case, we use the value stored in input->tip_context,
922 since we can't know the character in input->mbs[-1] here. */
923 return input->tip_context;
924 if (BE (idx == input->len, 0))
925 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
926 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
927 #ifdef RE_ENABLE_I18N
928 if (input->mb_cur_max > 1)
930 wint_t wc;
931 int wc_idx = idx;
932 while(input->wcs[wc_idx] == WEOF)
934 #ifdef DEBUG
935 /* It must not happen. */
936 assert (wc_idx >= 0);
937 #endif
938 --wc_idx;
939 if (wc_idx < 0)
940 return input->tip_context;
942 wc = input->wcs[wc_idx];
943 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
944 return CONTEXT_WORD;
945 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
946 ? CONTEXT_NEWLINE : 0);
948 else
949 #endif
951 c = re_string_byte_at (input, idx);
952 if (bitset_contain (input->word_char, c))
953 return CONTEXT_WORD;
954 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
958 /* Functions for set operation. */
960 static reg_errcode_t
961 internal_function __attribute_warn_unused_result__
962 re_node_set_alloc (re_node_set *set, int size)
964 set->alloc = size;
965 set->nelem = 0;
966 set->elems = re_malloc (int, size);
967 if (BE (set->elems == NULL, 0))
968 return REG_ESPACE;
969 return REG_NOERROR;
972 static reg_errcode_t
973 internal_function __attribute_warn_unused_result__
974 re_node_set_init_1 (re_node_set *set, int elem)
976 set->alloc = 1;
977 set->nelem = 1;
978 set->elems = re_malloc (int, 1);
979 if (BE (set->elems == NULL, 0))
981 set->alloc = set->nelem = 0;
982 return REG_ESPACE;
984 set->elems[0] = elem;
985 return REG_NOERROR;
988 static reg_errcode_t
989 internal_function __attribute_warn_unused_result__
990 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
992 set->alloc = 2;
993 set->elems = re_malloc (int, 2);
994 if (BE (set->elems == NULL, 0))
995 return REG_ESPACE;
996 if (elem1 == elem2)
998 set->nelem = 1;
999 set->elems[0] = elem1;
1001 else
1003 set->nelem = 2;
1004 if (elem1 < elem2)
1006 set->elems[0] = elem1;
1007 set->elems[1] = elem2;
1009 else
1011 set->elems[0] = elem2;
1012 set->elems[1] = elem1;
1015 return REG_NOERROR;
1018 static reg_errcode_t
1019 internal_function __attribute_warn_unused_result__
1020 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1022 dest->nelem = src->nelem;
1023 if (src->nelem > 0)
1025 dest->alloc = dest->nelem;
1026 dest->elems = re_malloc (int, dest->alloc);
1027 if (BE (dest->elems == NULL, 0))
1029 dest->alloc = dest->nelem = 0;
1030 return REG_ESPACE;
1032 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1034 else
1035 re_node_set_init_empty (dest);
1036 return REG_NOERROR;
1039 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1040 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1041 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1043 static reg_errcode_t
1044 internal_function __attribute_warn_unused_result__
1045 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1046 const re_node_set *src2)
1048 int i1, i2, is, id, delta, sbase;
1049 if (src1->nelem == 0 || src2->nelem == 0)
1050 return REG_NOERROR;
1052 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1053 conservative estimate. */
1054 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1056 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1057 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1058 if (BE (new_elems == NULL, 0))
1059 return REG_ESPACE;
1060 dest->elems = new_elems;
1061 dest->alloc = new_alloc;
1064 /* Find the items in the intersection of SRC1 and SRC2, and copy
1065 into the top of DEST those that are not already in DEST itself. */
1066 sbase = dest->nelem + src1->nelem + src2->nelem;
1067 i1 = src1->nelem - 1;
1068 i2 = src2->nelem - 1;
1069 id = dest->nelem - 1;
1070 for (;;)
1072 if (src1->elems[i1] == src2->elems[i2])
1074 /* Try to find the item in DEST. Maybe we could binary search? */
1075 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1076 --id;
1078 if (id < 0 || dest->elems[id] != src1->elems[i1])
1079 dest->elems[--sbase] = src1->elems[i1];
1081 if (--i1 < 0 || --i2 < 0)
1082 break;
1085 /* Lower the highest of the two items. */
1086 else if (src1->elems[i1] < src2->elems[i2])
1088 if (--i2 < 0)
1089 break;
1091 else
1093 if (--i1 < 0)
1094 break;
1098 id = dest->nelem - 1;
1099 is = dest->nelem + src1->nelem + src2->nelem - 1;
1100 delta = is - sbase + 1;
1102 /* Now copy. When DELTA becomes zero, the remaining
1103 DEST elements are already in place; this is more or
1104 less the same loop that is in re_node_set_merge. */
1105 dest->nelem += delta;
1106 if (delta > 0 && id >= 0)
1107 for (;;)
1109 if (dest->elems[is] > dest->elems[id])
1111 /* Copy from the top. */
1112 dest->elems[id + delta--] = dest->elems[is--];
1113 if (delta == 0)
1114 break;
1116 else
1118 /* Slide from the bottom. */
1119 dest->elems[id + delta] = dest->elems[id];
1120 if (--id < 0)
1121 break;
1125 /* Copy remaining SRC elements. */
1126 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1128 return REG_NOERROR;
1131 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1132 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1134 static reg_errcode_t
1135 internal_function __attribute_warn_unused_result__
1136 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1137 const re_node_set *src2)
1139 int i1, i2, id;
1140 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1142 dest->alloc = src1->nelem + src2->nelem;
1143 dest->elems = re_malloc (int, dest->alloc);
1144 if (BE (dest->elems == NULL, 0))
1145 return REG_ESPACE;
1147 else
1149 if (src1 != NULL && src1->nelem > 0)
1150 return re_node_set_init_copy (dest, src1);
1151 else if (src2 != NULL && src2->nelem > 0)
1152 return re_node_set_init_copy (dest, src2);
1153 else
1154 re_node_set_init_empty (dest);
1155 return REG_NOERROR;
1157 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1159 if (src1->elems[i1] > src2->elems[i2])
1161 dest->elems[id++] = src2->elems[i2++];
1162 continue;
1164 if (src1->elems[i1] == src2->elems[i2])
1165 ++i2;
1166 dest->elems[id++] = src1->elems[i1++];
1168 if (i1 < src1->nelem)
1170 memcpy (dest->elems + id, src1->elems + i1,
1171 (src1->nelem - i1) * sizeof (int));
1172 id += src1->nelem - i1;
1174 else if (i2 < src2->nelem)
1176 memcpy (dest->elems + id, src2->elems + i2,
1177 (src2->nelem - i2) * sizeof (int));
1178 id += src2->nelem - i2;
1180 dest->nelem = id;
1181 return REG_NOERROR;
1184 /* Calculate the union set of the sets DEST and SRC. And store it to
1185 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1187 static reg_errcode_t
1188 internal_function __attribute_warn_unused_result__
1189 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1191 int is, id, sbase, delta;
1192 if (src == NULL || src->nelem == 0)
1193 return REG_NOERROR;
1194 if (dest->alloc < 2 * src->nelem + dest->nelem)
1196 int new_alloc = 2 * (src->nelem + dest->alloc);
1197 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1198 if (BE (new_buffer == NULL, 0))
1199 return REG_ESPACE;
1200 dest->elems = new_buffer;
1201 dest->alloc = new_alloc;
1204 if (BE (dest->nelem == 0, 0))
1206 dest->nelem = src->nelem;
1207 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1208 return REG_NOERROR;
1211 /* Copy into the top of DEST the items of SRC that are not
1212 found in DEST. Maybe we could binary search in DEST? */
1213 for (sbase = dest->nelem + 2 * src->nelem,
1214 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1216 if (dest->elems[id] == src->elems[is])
1217 is--, id--;
1218 else if (dest->elems[id] < src->elems[is])
1219 dest->elems[--sbase] = src->elems[is--];
1220 else /* if (dest->elems[id] > src->elems[is]) */
1221 --id;
1224 if (is >= 0)
1226 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1227 sbase -= is + 1;
1228 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1231 id = dest->nelem - 1;
1232 is = dest->nelem + 2 * src->nelem - 1;
1233 delta = is - sbase + 1;
1234 if (delta == 0)
1235 return REG_NOERROR;
1237 /* Now copy. When DELTA becomes zero, the remaining
1238 DEST elements are already in place. */
1239 dest->nelem += delta;
1240 for (;;)
1242 if (dest->elems[is] > dest->elems[id])
1244 /* Copy from the top. */
1245 dest->elems[id + delta--] = dest->elems[is--];
1246 if (delta == 0)
1247 break;
1249 else
1251 /* Slide from the bottom. */
1252 dest->elems[id + delta] = dest->elems[id];
1253 if (--id < 0)
1255 /* Copy remaining SRC elements. */
1256 memcpy (dest->elems, dest->elems + sbase,
1257 delta * sizeof (int));
1258 break;
1263 return REG_NOERROR;
1266 /* Insert the new element ELEM to the re_node_set* SET.
1267 SET should not already have ELEM.
1268 return -1 if an error is occured, return 1 otherwise. */
1270 static int
1271 internal_function __attribute_warn_unused_result__
1272 re_node_set_insert (re_node_set *set, int elem)
1274 int idx;
1275 /* In case the set is empty. */
1276 if (set->alloc == 0)
1278 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1279 return 1;
1280 else
1281 return -1;
1284 if (BE (set->nelem, 0) == 0)
1286 /* We already guaranteed above that set->alloc != 0. */
1287 set->elems[0] = elem;
1288 ++set->nelem;
1289 return 1;
1292 /* Realloc if we need. */
1293 if (set->alloc == set->nelem)
1295 int *new_elems;
1296 set->alloc = set->alloc * 2;
1297 new_elems = re_realloc (set->elems, int, set->alloc);
1298 if (BE (new_elems == NULL, 0))
1299 return -1;
1300 set->elems = new_elems;
1303 /* Move the elements which follows the new element. Test the
1304 first element separately to skip a check in the inner loop. */
1305 if (elem < set->elems[0])
1307 idx = 0;
1308 for (idx = set->nelem; idx > 0; idx--)
1309 set->elems[idx] = set->elems[idx - 1];
1311 else
1313 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1314 set->elems[idx] = set->elems[idx - 1];
1317 /* Insert the new element. */
1318 set->elems[idx] = elem;
1319 ++set->nelem;
1320 return 1;
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324 SET should not already have any element greater than or equal to ELEM.
1325 Return -1 if an error is occured, return 1 otherwise. */
1327 static int
1328 internal_function __attribute_warn_unused_result__
1329 re_node_set_insert_last (re_node_set *set, int elem)
1331 /* Realloc if we need. */
1332 if (set->alloc == set->nelem)
1334 int *new_elems;
1335 set->alloc = (set->alloc + 1) * 2;
1336 new_elems = re_realloc (set->elems, int, set->alloc);
1337 if (BE (new_elems == NULL, 0))
1338 return -1;
1339 set->elems = new_elems;
1342 /* Insert the new element. */
1343 set->elems[set->nelem++] = elem;
1344 return 1;
1347 /* Compare two node sets SET1 and SET2.
1348 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1350 static int
1351 internal_function __attribute ((pure))
1352 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1354 int i;
1355 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1356 return 0;
1357 for (i = set1->nelem ; --i >= 0 ; )
1358 if (set1->elems[i] != set2->elems[i])
1359 return 0;
1360 return 1;
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1365 static int
1366 internal_function __attribute ((pure))
1367 re_node_set_contains (const re_node_set *set, int elem)
1369 unsigned int idx, right, mid;
1370 if (set->nelem <= 0)
1371 return 0;
1373 /* Binary search the element. */
1374 idx = 0;
1375 right = set->nelem - 1;
1376 while (idx < right)
1378 mid = (idx + right) / 2;
1379 if (set->elems[mid] < elem)
1380 idx = mid + 1;
1381 else
1382 right = mid;
1384 return set->elems[idx] == elem ? idx + 1 : 0;
1387 static void
1388 internal_function
1389 re_node_set_remove_at (re_node_set *set, int idx)
1391 if (idx < 0 || idx >= set->nelem)
1392 return;
1393 --set->nelem;
1394 for (; idx < set->nelem; idx++)
1395 set->elems[idx] = set->elems[idx + 1];
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400 Or return -1, if an error will be occured. */
1402 static int
1403 internal_function
1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1406 int type = token.type;
1407 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1409 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1410 int *new_nexts, *new_indices;
1411 re_node_set *new_edests, *new_eclosures;
1412 re_token_t *new_nodes;
1414 /* Avoid overflows in realloc. */
1415 const size_t max_object_size = MAX (sizeof (re_token_t),
1416 MAX (sizeof (re_node_set),
1417 sizeof (int)));
1418 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1419 return -1;
1421 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422 if (BE (new_nodes == NULL, 0))
1423 return -1;
1424 dfa->nodes = new_nodes;
1425 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1426 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1427 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429 if (BE (new_nexts == NULL || new_indices == NULL
1430 || new_edests == NULL || new_eclosures == NULL, 0))
1431 return -1;
1432 dfa->nexts = new_nexts;
1433 dfa->org_indices = new_indices;
1434 dfa->edests = new_edests;
1435 dfa->eclosures = new_eclosures;
1436 dfa->nodes_alloc = new_nodes_alloc;
1438 dfa->nodes[dfa->nodes_len] = token;
1439 dfa->nodes[dfa->nodes_len].constraint = 0;
1440 #ifdef RE_ENABLE_I18N
1441 dfa->nodes[dfa->nodes_len].accept_mb =
1442 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1443 #endif
1444 dfa->nexts[dfa->nodes_len] = -1;
1445 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1446 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1447 return dfa->nodes_len++;
1450 static inline unsigned int
1451 internal_function
1452 calc_state_hash (const re_node_set *nodes, unsigned int context)
1454 unsigned int hash = nodes->nelem + context;
1455 int i;
1456 for (i = 0 ; i < nodes->nelem ; i++)
1457 hash += nodes->elems[i];
1458 return hash;
1461 /* Search for the state whose node_set is equivalent to NODES.
1462 Return the pointer to the state, if we found it in the DFA.
1463 Otherwise create the new one and return it. In case of an error
1464 return NULL and set the error code in ERR.
1465 Note: - We assume NULL as the invalid state, then it is possible that
1466 return value is NULL and ERR is REG_NOERROR.
1467 - We never return non-NULL value in case of any errors, it is for
1468 optimization. */
1470 static re_dfastate_t *
1471 internal_function __attribute_warn_unused_result__
1472 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1473 const re_node_set *nodes)
1475 unsigned int hash;
1476 re_dfastate_t *new_state;
1477 struct re_state_table_entry *spot;
1478 int i;
1479 if (BE (nodes->nelem == 0, 0))
1481 *err = REG_NOERROR;
1482 return NULL;
1484 hash = calc_state_hash (nodes, 0);
1485 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1487 for (i = 0 ; i < spot->num ; i++)
1489 re_dfastate_t *state = spot->array[i];
1490 if (hash != state->hash)
1491 continue;
1492 if (re_node_set_compare (&state->nodes, nodes))
1493 return state;
1496 /* There are no appropriate state in the dfa, create the new one. */
1497 new_state = create_ci_newstate (dfa, nodes, hash);
1498 if (BE (new_state == NULL, 0))
1499 *err = REG_ESPACE;
1501 return new_state;
1504 /* Search for the state whose node_set is equivalent to NODES and
1505 whose context is equivalent to CONTEXT.
1506 Return the pointer to the state, if we found it in the DFA.
1507 Otherwise create the new one and return it. In case of an error
1508 return NULL and set the error code in ERR.
1509 Note: - We assume NULL as the invalid state, then it is possible that
1510 return value is NULL and ERR is REG_NOERROR.
1511 - We never return non-NULL value in case of any errors, it is for
1512 optimization. */
1514 static re_dfastate_t *
1515 internal_function __attribute_warn_unused_result__
1516 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1517 const re_node_set *nodes, unsigned int context)
1519 unsigned int hash;
1520 re_dfastate_t *new_state;
1521 struct re_state_table_entry *spot;
1522 int i;
1523 if (nodes->nelem == 0)
1525 *err = REG_NOERROR;
1526 return NULL;
1528 hash = calc_state_hash (nodes, context);
1529 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1531 for (i = 0 ; i < spot->num ; i++)
1533 re_dfastate_t *state = spot->array[i];
1534 if (state->hash == hash
1535 && state->context == context
1536 && re_node_set_compare (state->entrance_nodes, nodes))
1537 return state;
1539 /* There are no appropriate state in `dfa', create the new one. */
1540 new_state = create_cd_newstate (dfa, nodes, context, hash);
1541 if (BE (new_state == NULL, 0))
1542 *err = REG_ESPACE;
1544 return new_state;
1547 /* Finish initialization of the new state NEWSTATE, and using its hash value
1548 HASH put in the appropriate bucket of DFA's state table. Return value
1549 indicates the error code if failed. */
1551 static reg_errcode_t
1552 __attribute_warn_unused_result__
1553 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1554 unsigned int hash)
1556 struct re_state_table_entry *spot;
1557 reg_errcode_t err;
1558 int i;
1560 newstate->hash = hash;
1561 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1562 if (BE (err != REG_NOERROR, 0))
1563 return REG_ESPACE;
1564 for (i = 0; i < newstate->nodes.nelem; i++)
1566 int elem = newstate->nodes.elems[i];
1567 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1568 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1569 return REG_ESPACE;
1572 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1573 if (BE (spot->alloc <= spot->num, 0))
1575 int new_alloc = 2 * spot->num + 2;
1576 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1577 new_alloc);
1578 if (BE (new_array == NULL, 0))
1579 return REG_ESPACE;
1580 spot->array = new_array;
1581 spot->alloc = new_alloc;
1583 spot->array[spot->num++] = newstate;
1584 return REG_NOERROR;
1587 static void
1588 free_state (re_dfastate_t *state)
1590 re_node_set_free (&state->non_eps_nodes);
1591 re_node_set_free (&state->inveclosure);
1592 if (state->entrance_nodes != &state->nodes)
1594 re_node_set_free (state->entrance_nodes);
1595 re_free (state->entrance_nodes);
1597 re_node_set_free (&state->nodes);
1598 re_free (state->word_trtable);
1599 re_free (state->trtable);
1600 re_free (state);
1603 /* Create the new state which is independ of contexts.
1604 Return the new state if succeeded, otherwise return NULL. */
1606 static re_dfastate_t *
1607 internal_function __attribute_warn_unused_result__
1608 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1609 unsigned int hash)
1611 int i;
1612 reg_errcode_t err;
1613 re_dfastate_t *newstate;
1615 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1616 if (BE (newstate == NULL, 0))
1617 return NULL;
1618 err = re_node_set_init_copy (&newstate->nodes, nodes);
1619 if (BE (err != REG_NOERROR, 0))
1621 re_free (newstate);
1622 return NULL;
1625 newstate->entrance_nodes = &newstate->nodes;
1626 for (i = 0 ; i < nodes->nelem ; i++)
1628 re_token_t *node = dfa->nodes + nodes->elems[i];
1629 re_token_type_t type = node->type;
1630 if (type == CHARACTER && !node->constraint)
1631 continue;
1632 #ifdef RE_ENABLE_I18N
1633 newstate->accept_mb |= node->accept_mb;
1634 #endif /* RE_ENABLE_I18N */
1636 /* If the state has the halt node, the state is a halt state. */
1637 if (type == END_OF_RE)
1638 newstate->halt = 1;
1639 else if (type == OP_BACK_REF)
1640 newstate->has_backref = 1;
1641 else if (type == ANCHOR || node->constraint)
1642 newstate->has_constraint = 1;
1644 err = register_state (dfa, newstate, hash);
1645 if (BE (err != REG_NOERROR, 0))
1647 free_state (newstate);
1648 newstate = NULL;
1650 return newstate;
1653 /* Create the new state which is depend on the context CONTEXT.
1654 Return the new state if succeeded, otherwise return NULL. */
1656 static re_dfastate_t *
1657 internal_function __attribute_warn_unused_result__
1658 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1659 unsigned int context, unsigned int hash)
1661 int i, nctx_nodes = 0;
1662 reg_errcode_t err;
1663 re_dfastate_t *newstate;
1665 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1666 if (BE (newstate == NULL, 0))
1667 return NULL;
1668 err = re_node_set_init_copy (&newstate->nodes, nodes);
1669 if (BE (err != REG_NOERROR, 0))
1671 re_free (newstate);
1672 return NULL;
1675 newstate->context = context;
1676 newstate->entrance_nodes = &newstate->nodes;
1678 for (i = 0 ; i < nodes->nelem ; i++)
1680 re_token_t *node = dfa->nodes + nodes->elems[i];
1681 re_token_type_t type = node->type;
1682 unsigned int constraint = node->constraint;
1684 if (type == CHARACTER && !constraint)
1685 continue;
1686 #ifdef RE_ENABLE_I18N
1687 newstate->accept_mb |= node->accept_mb;
1688 #endif /* RE_ENABLE_I18N */
1690 /* If the state has the halt node, the state is a halt state. */
1691 if (type == END_OF_RE)
1692 newstate->halt = 1;
1693 else if (type == OP_BACK_REF)
1694 newstate->has_backref = 1;
1696 if (constraint)
1698 if (newstate->entrance_nodes == &newstate->nodes)
1700 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1701 if (BE (newstate->entrance_nodes == NULL, 0))
1703 free_state (newstate);
1704 return NULL;
1706 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1707 != REG_NOERROR)
1708 return NULL;
1709 nctx_nodes = 0;
1710 newstate->has_constraint = 1;
1713 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1715 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1716 ++nctx_nodes;
1720 err = register_state (dfa, newstate, hash);
1721 if (BE (err != REG_NOERROR, 0))
1723 free_state (newstate);
1724 newstate = NULL;
1726 return newstate;