sbin/hammer: Have consistent naming for buffer variables
[dragonfly.git] / contrib / diffutils / lib / regex_internal.c
blob649b2b3a8b2918ab7ada20ac3aa641208d176c7e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include "verify.h"
21 #include "intprops.h"
22 static void re_string_construct_common (const char *str, Idx len,
23 re_string_t *pstr,
24 RE_TRANSLATE_TYPE trans, bool icase,
25 const re_dfa_t *dfa) internal_function;
26 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
27 const re_node_set *nodes,
28 re_hashval_t hash) internal_function;
29 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int context,
32 re_hashval_t hash) internal_function;
34 /* Functions for string operation. */
36 /* This function allocate the buffers. It is necessary to call
37 re_string_reconstruct before using the object. */
39 static reg_errcode_t
40 internal_function __attribute_warn_unused_result__
41 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
42 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 reg_errcode_t ret;
45 Idx init_buf_len;
47 /* Ensure at least one character fits into the buffers. */
48 if (init_len < dfa->mb_cur_max)
49 init_len = dfa->mb_cur_max;
50 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51 re_string_construct_common (str, len, pstr, trans, icase, dfa);
53 ret = re_string_realloc_buffers (pstr, init_buf_len);
54 if (BE (ret != REG_NOERROR, 0))
55 return ret;
57 pstr->word_char = dfa->word_char;
58 pstr->word_ops_used = dfa->word_ops_used;
59 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
61 pstr->valid_raw_len = pstr->valid_len;
62 return REG_NOERROR;
65 /* This function allocate the buffers, and initialize them. */
67 static reg_errcode_t
68 internal_function __attribute_warn_unused_result__
69 re_string_construct (re_string_t *pstr, const char *str, Idx len,
70 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 reg_errcode_t ret;
73 memset (pstr, '\0', sizeof (re_string_t));
74 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 if (len > 0)
78 ret = re_string_realloc_buffers (pstr, len + 1);
79 if (BE (ret != REG_NOERROR, 0))
80 return ret;
82 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84 if (icase)
86 #ifdef RE_ENABLE_I18N
87 if (dfa->mb_cur_max > 1)
89 while (1)
91 ret = build_wcs_upper_buffer (pstr);
92 if (BE (ret != REG_NOERROR, 0))
93 return ret;
94 if (pstr->valid_raw_len >= len)
95 break;
96 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97 break;
98 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
103 else
104 #endif /* RE_ENABLE_I18N */
105 build_upper_buffer (pstr);
107 else
109 #ifdef RE_ENABLE_I18N
110 if (dfa->mb_cur_max > 1)
111 build_wcs_buffer (pstr);
112 else
113 #endif /* RE_ENABLE_I18N */
115 if (trans != NULL)
116 re_string_translate_buffer (pstr);
117 else
119 pstr->valid_len = pstr->bufs_len;
120 pstr->valid_raw_len = pstr->bufs_len;
125 return REG_NOERROR;
128 /* Helper functions for re_string_allocate, and re_string_construct. */
130 static reg_errcode_t
131 internal_function __attribute_warn_unused_result__
132 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 #ifdef RE_ENABLE_I18N
135 if (pstr->mb_cur_max > 1)
137 wint_t *new_wcs;
139 /* Avoid overflow in realloc. */
140 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
141 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
142 return REG_ESPACE;
144 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
145 if (BE (new_wcs == NULL, 0))
146 return REG_ESPACE;
147 pstr->wcs = new_wcs;
148 if (pstr->offsets != NULL)
150 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
151 if (BE (new_offsets == NULL, 0))
152 return REG_ESPACE;
153 pstr->offsets = new_offsets;
156 #endif /* RE_ENABLE_I18N */
157 if (pstr->mbs_allocated)
159 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
160 new_buf_len);
161 if (BE (new_mbs == NULL, 0))
162 return REG_ESPACE;
163 pstr->mbs = new_mbs;
165 pstr->bufs_len = new_buf_len;
166 return REG_NOERROR;
170 static void
171 internal_function
172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
173 RE_TRANSLATE_TYPE trans, bool icase,
174 const re_dfa_t *dfa)
176 pstr->raw_mbs = (const unsigned char *) str;
177 pstr->len = len;
178 pstr->raw_len = len;
179 pstr->trans = trans;
180 pstr->icase = icase;
181 pstr->mbs_allocated = (trans != NULL || icase);
182 pstr->mb_cur_max = dfa->mb_cur_max;
183 pstr->is_utf8 = dfa->is_utf8;
184 pstr->map_notascii = dfa->map_notascii;
185 pstr->stop = pstr->len;
186 pstr->raw_stop = pstr->stop;
189 #ifdef RE_ENABLE_I18N
191 /* Build wide character buffer PSTR->WCS.
192 If the byte sequence of the string are:
193 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194 Then wide character buffer will be:
195 <wc1> , WEOF , <wc2> , WEOF , <wc3>
196 We use WEOF for padding, they indicate that the position isn't
197 a first byte of a multibyte character.
199 Note that this function assumes PSTR->VALID_LEN elements are already
200 built and starts from PSTR->VALID_LEN. */
202 static void
203 internal_function
204 build_wcs_buffer (re_string_t *pstr)
206 #ifdef _LIBC
207 unsigned char buf[MB_LEN_MAX];
208 assert (MB_LEN_MAX >= pstr->mb_cur_max);
209 #else
210 unsigned char buf[64];
211 #endif
212 mbstate_t prev_st;
213 Idx byte_idx, end_idx, remain_len;
214 size_t mbclen;
216 /* Build the buffers from pstr->valid_len to either pstr->len or
217 pstr->bufs_len. */
218 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
219 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221 wchar_t wc;
222 const char *p;
224 remain_len = end_idx - byte_idx;
225 prev_st = pstr->cur_state;
226 /* Apply the translation if we need. */
227 if (BE (pstr->trans != NULL, 0))
229 int i, ch;
231 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
234 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236 p = (const char *) buf;
238 else
239 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
240 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
241 if (BE (mbclen == (size_t) -1 || mbclen == 0
242 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
244 /* We treat these cases as a singlebyte character. */
245 mbclen = 1;
246 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247 if (BE (pstr->trans != NULL, 0))
248 wc = pstr->trans[wc];
249 pstr->cur_state = prev_st;
251 else if (BE (mbclen == (size_t) -2, 0))
253 /* The buffer doesn't have enough space, finish to build. */
254 pstr->cur_state = prev_st;
255 break;
258 /* Write wide character and padding. */
259 pstr->wcs[byte_idx++] = wc;
260 /* Write paddings. */
261 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 pstr->wcs[byte_idx++] = WEOF;
264 pstr->valid_len = byte_idx;
265 pstr->valid_raw_len = byte_idx;
268 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269 but for REG_ICASE. */
271 static reg_errcode_t
272 internal_function __attribute_warn_unused_result__
273 build_wcs_upper_buffer (re_string_t *pstr)
275 mbstate_t prev_st;
276 Idx src_idx, byte_idx, end_idx, remain_len;
277 size_t mbclen;
278 #ifdef _LIBC
279 char buf[MB_LEN_MAX];
280 assert (MB_LEN_MAX >= pstr->mb_cur_max);
281 #else
282 char buf[64];
283 #endif
285 byte_idx = pstr->valid_len;
286 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
288 /* The following optimization assumes that ASCII characters can be
289 mapped to wide characters with a simple cast. */
290 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
292 while (byte_idx < end_idx)
294 wchar_t wc;
296 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297 && mbsinit (&pstr->cur_state))
299 /* In case of a singlebyte character. */
300 pstr->mbs[byte_idx]
301 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302 /* The next step uses the assumption that wchar_t is encoded
303 ASCII-safe: all ASCII values can be converted like this. */
304 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305 ++byte_idx;
306 continue;
309 remain_len = end_idx - byte_idx;
310 prev_st = pstr->cur_state;
311 mbclen = __mbrtowc (&wc,
312 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313 + byte_idx), remain_len, &pstr->cur_state);
314 if (BE (mbclen < (size_t) -2, 1))
316 wchar_t wcu = wc;
317 if (iswlower (wc))
319 size_t mbcdlen;
321 wcu = towupper (wc);
322 mbcdlen = wcrtomb (buf, wcu, &prev_st);
323 if (BE (mbclen == mbcdlen, 1))
324 memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 else
327 src_idx = byte_idx;
328 goto offsets_needed;
331 else
332 memcpy (pstr->mbs + byte_idx,
333 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 pstr->wcs[byte_idx++] = wcu;
335 /* Write paddings. */
336 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 pstr->wcs[byte_idx++] = WEOF;
339 else if (mbclen == (size_t) -1 || mbclen == 0
340 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
342 /* It is an invalid character, an incomplete character
343 at the end of the string, or '\0'. Just use the byte. */
344 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 pstr->mbs[byte_idx] = ch;
346 /* And also cast it to wide char. */
347 pstr->wcs[byte_idx++] = (wchar_t) ch;
348 if (BE (mbclen == (size_t) -1, 0))
349 pstr->cur_state = prev_st;
351 else
353 /* The buffer doesn't have enough space, finish to build. */
354 pstr->cur_state = prev_st;
355 break;
358 pstr->valid_len = byte_idx;
359 pstr->valid_raw_len = byte_idx;
360 return REG_NOERROR;
362 else
363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
365 wchar_t wc;
366 const char *p;
367 offsets_needed:
368 remain_len = end_idx - byte_idx;
369 prev_st = pstr->cur_state;
370 if (BE (pstr->trans != NULL, 0))
372 int i, ch;
374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 buf[i] = pstr->trans[ch];
379 p = (const char *) buf;
381 else
382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 if (BE (mbclen < (size_t) -2, 1))
386 wchar_t wcu = wc;
387 if (iswlower (wc))
389 size_t mbcdlen;
391 wcu = towupper (wc);
392 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393 if (BE (mbclen == mbcdlen, 1))
394 memcpy (pstr->mbs + byte_idx, buf, mbclen);
395 else if (mbcdlen != (size_t) -1)
397 size_t i;
399 if (byte_idx + mbcdlen > pstr->bufs_len)
401 pstr->cur_state = prev_st;
402 break;
405 if (pstr->offsets == NULL)
407 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
409 if (pstr->offsets == NULL)
410 return REG_ESPACE;
412 if (!pstr->offsets_needed)
414 for (i = 0; i < (size_t) byte_idx; ++i)
415 pstr->offsets[i] = i;
416 pstr->offsets_needed = 1;
419 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420 pstr->wcs[byte_idx] = wcu;
421 pstr->offsets[byte_idx] = src_idx;
422 for (i = 1; i < mbcdlen; ++i)
424 pstr->offsets[byte_idx + i]
425 = src_idx + (i < mbclen ? i : mbclen - 1);
426 pstr->wcs[byte_idx + i] = WEOF;
428 pstr->len += mbcdlen - mbclen;
429 if (pstr->raw_stop > src_idx)
430 pstr->stop += mbcdlen - mbclen;
431 end_idx = (pstr->bufs_len > pstr->len)
432 ? pstr->len : pstr->bufs_len;
433 byte_idx += mbcdlen;
434 src_idx += mbclen;
435 continue;
437 else
438 memcpy (pstr->mbs + byte_idx, p, mbclen);
440 else
441 memcpy (pstr->mbs + byte_idx, p, mbclen);
443 if (BE (pstr->offsets_needed != 0, 0))
445 size_t i;
446 for (i = 0; i < mbclen; ++i)
447 pstr->offsets[byte_idx + i] = src_idx + i;
449 src_idx += mbclen;
451 pstr->wcs[byte_idx++] = wcu;
452 /* Write paddings. */
453 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454 pstr->wcs[byte_idx++] = WEOF;
456 else if (mbclen == (size_t) -1 || mbclen == 0
457 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
459 /* It is an invalid character or '\0'. Just use the byte. */
460 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
462 if (BE (pstr->trans != NULL, 0))
463 ch = pstr->trans [ch];
464 pstr->mbs[byte_idx] = ch;
466 if (BE (pstr->offsets_needed != 0, 0))
467 pstr->offsets[byte_idx] = src_idx;
468 ++src_idx;
470 /* And also cast it to wide char. */
471 pstr->wcs[byte_idx++] = (wchar_t) ch;
472 if (BE (mbclen == (size_t) -1, 0))
473 pstr->cur_state = prev_st;
475 else
477 /* The buffer doesn't have enough space, finish to build. */
478 pstr->cur_state = prev_st;
479 break;
482 pstr->valid_len = byte_idx;
483 pstr->valid_raw_len = src_idx;
484 return REG_NOERROR;
487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
488 Return the index. */
490 static Idx
491 internal_function
492 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
494 mbstate_t prev_st;
495 Idx rawbuf_idx;
496 size_t mbclen;
497 wint_t wc = WEOF;
499 /* Skip the characters which are not necessary to check. */
500 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
501 rawbuf_idx < new_raw_idx;)
503 wchar_t wc2;
504 Idx remain_len = pstr->raw_len - rawbuf_idx;
505 prev_st = pstr->cur_state;
506 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507 remain_len, &pstr->cur_state);
508 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
510 /* We treat these cases as a single byte character. */
511 if (mbclen == 0 || remain_len == 0)
512 wc = L'\0';
513 else
514 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515 mbclen = 1;
516 pstr->cur_state = prev_st;
518 else
519 wc = wc2;
520 /* Then proceed the next character. */
521 rawbuf_idx += mbclen;
523 *last_wc = wc;
524 return rawbuf_idx;
526 #endif /* RE_ENABLE_I18N */
528 /* Build the buffer PSTR->MBS, and apply the translation if we need.
529 This function is used in case of REG_ICASE. */
531 static void
532 internal_function
533 build_upper_buffer (re_string_t *pstr)
535 Idx char_idx, end_idx;
536 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
538 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
540 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541 if (BE (pstr->trans != NULL, 0))
542 ch = pstr->trans[ch];
543 if (islower (ch))
544 pstr->mbs[char_idx] = toupper (ch);
545 else
546 pstr->mbs[char_idx] = ch;
548 pstr->valid_len = char_idx;
549 pstr->valid_raw_len = char_idx;
552 /* Apply TRANS to the buffer in PSTR. */
554 static void
555 internal_function
556 re_string_translate_buffer (re_string_t *pstr)
558 Idx buf_idx, end_idx;
559 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
561 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
563 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564 pstr->mbs[buf_idx] = pstr->trans[ch];
567 pstr->valid_len = buf_idx;
568 pstr->valid_raw_len = buf_idx;
571 /* This function re-construct the buffers.
572 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573 convert to upper case in case of REG_ICASE, apply translation. */
575 static reg_errcode_t
576 internal_function __attribute_warn_unused_result__
577 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
579 Idx offset;
581 if (BE (pstr->raw_mbs_idx <= idx, 0))
582 offset = idx - pstr->raw_mbs_idx;
583 else
585 /* Reset buffer. */
586 #ifdef RE_ENABLE_I18N
587 if (pstr->mb_cur_max > 1)
588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
592 pstr->valid_len = 0;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598 if (!pstr->mbs_allocated)
599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
600 offset = idx;
603 if (BE (offset != 0, 1))
605 /* Should the already checked characters be kept? */
606 if (BE (offset < pstr->valid_raw_len, 1))
608 /* Yes, move them to the front of the buffer. */
609 #ifdef RE_ENABLE_I18N
610 if (BE (pstr->offsets_needed, 0))
612 Idx low = 0, high = pstr->valid_len, mid;
615 mid = (high + low) / 2;
616 if (pstr->offsets[mid] > offset)
617 high = mid;
618 else if (pstr->offsets[mid] < offset)
619 low = mid + 1;
620 else
621 break;
623 while (low < high);
624 if (pstr->offsets[mid] < offset)
625 ++mid;
626 pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 eflags);
628 /* This can be quite complicated, so handle specially
629 only the common and easy case where the character with
630 different length representation of lower and upper
631 case is present at or after offset. */
632 if (pstr->valid_len > offset
633 && mid == offset && pstr->offsets[mid] == offset)
635 memmove (pstr->wcs, pstr->wcs + offset,
636 (pstr->valid_len - offset) * sizeof (wint_t));
637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 pstr->valid_len -= offset;
639 pstr->valid_raw_len -= offset;
640 for (low = 0; low < pstr->valid_len; low++)
641 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 else
645 /* Otherwise, just find out how long the partial multibyte
646 character at offset is and fill it with WEOF/255. */
647 pstr->len = pstr->raw_len - idx + offset;
648 pstr->stop = pstr->raw_stop - idx + offset;
649 pstr->offsets_needed = 0;
650 while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 --mid;
652 while (mid < pstr->valid_len)
653 if (pstr->wcs[mid] != WEOF)
654 break;
655 else
656 ++mid;
657 if (mid == pstr->valid_len)
658 pstr->valid_len = 0;
659 else
661 pstr->valid_len = pstr->offsets[mid] - offset;
662 if (pstr->valid_len)
664 for (low = 0; low < pstr->valid_len; ++low)
665 pstr->wcs[low] = WEOF;
666 memset (pstr->mbs, 255, pstr->valid_len);
669 pstr->valid_raw_len = pstr->valid_len;
672 else
673 #endif
675 pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 eflags);
677 #ifdef RE_ENABLE_I18N
678 if (pstr->mb_cur_max > 1)
679 memmove (pstr->wcs, pstr->wcs + offset,
680 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 if (BE (pstr->mbs_allocated, 0))
683 memmove (pstr->mbs, pstr->mbs + offset,
684 pstr->valid_len - offset);
685 pstr->valid_len -= offset;
686 pstr->valid_raw_len -= offset;
687 #if DEBUG
688 assert (pstr->valid_len > 0);
689 #endif
692 else
694 #ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 Idx prev_valid_len = pstr->valid_len;
698 if (BE (pstr->offsets_needed, 0))
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
704 #endif
705 pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 if (pstr->mb_cur_max > 1)
709 Idx wcs_idx;
710 wint_t wc = WEOF;
712 if (pstr->is_utf8)
714 const unsigned char *raw, *p, *end;
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 end = raw + (offset - pstr->mb_cur_max);
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
722 p = raw + offset - 1;
723 #ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p) && BE (pstr->trans == NULL, 1))
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
730 wc = (wchar_t) *p;
732 else
733 #endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
737 mbstate_t cur_state;
738 wchar_t wc2;
739 Idx mlen = raw + pstr->len - p;
740 unsigned char buf[6];
741 size_t mbclen;
743 const unsigned char *pp = p;
744 if (BE (pstr->trans != NULL, 0))
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
749 pp = buf;
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state, 0, sizeof (cur_state));
754 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755 &cur_state);
756 if (raw + offset - p <= mbclen
757 && mbclen < (size_t) -2)
759 memset (&pstr->cur_state, '\0',
760 sizeof (mbstate_t));
761 pstr->valid_len = mbclen - (raw + offset - p);
762 wc = wc2;
764 break;
768 if (wc == WEOF)
769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 if (wc == WEOF)
771 pstr->tip_context
772 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 else
774 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775 && IS_WIDE_WORD_CHAR (wc))
776 ? CONTEXT_WORD
777 : ((IS_WIDE_NEWLINE (wc)
778 && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
780 if (BE (pstr->valid_len, 0))
782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 pstr->wcs[wcs_idx] = WEOF;
784 if (pstr->mbs_allocated)
785 memset (pstr->mbs, 255, pstr->valid_len);
787 pstr->valid_raw_len = pstr->valid_len;
789 else
790 #endif /* RE_ENABLE_I18N */
792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 pstr->valid_raw_len = 0;
794 if (pstr->trans)
795 c = pstr->trans[c];
796 pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 ? CONTEXT_WORD
798 : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 ? CONTEXT_NEWLINE : 0));
802 if (!BE (pstr->mbs_allocated, 0))
803 pstr->mbs += offset;
805 pstr->raw_mbs_idx = idx;
806 pstr->len -= offset;
807 pstr->stop -= offset;
809 /* Then build the buffers. */
810 #ifdef RE_ENABLE_I18N
811 if (pstr->mb_cur_max > 1)
813 if (pstr->icase)
815 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 if (BE (ret != REG_NOERROR, 0))
817 return ret;
819 else
820 build_wcs_buffer (pstr);
822 else
823 #endif /* RE_ENABLE_I18N */
824 if (BE (pstr->mbs_allocated, 0))
826 if (pstr->icase)
827 build_upper_buffer (pstr);
828 else if (pstr->trans != NULL)
829 re_string_translate_buffer (pstr);
831 else
832 pstr->valid_len = pstr->len;
834 pstr->cur_idx = 0;
835 return REG_NOERROR;
838 static unsigned char
839 internal_function __attribute__ ((pure))
840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
842 int ch;
843 Idx off;
845 /* Handle the common (easiest) cases first. */
846 if (BE (!pstr->mbs_allocated, 1))
847 return re_string_peek_byte (pstr, idx);
849 #ifdef RE_ENABLE_I18N
850 if (pstr->mb_cur_max > 1
851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852 return re_string_peek_byte (pstr, idx);
853 #endif
855 off = pstr->cur_idx + idx;
856 #ifdef RE_ENABLE_I18N
857 if (pstr->offsets_needed)
858 off = pstr->offsets[off];
859 #endif
861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
863 #ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr->offsets_needed && !isascii (ch))
869 return re_string_peek_byte (pstr, idx);
870 #endif
872 return ch;
875 static unsigned char
876 internal_function
877 re_string_fetch_byte_case (re_string_t *pstr)
879 if (BE (!pstr->mbs_allocated, 1))
880 return re_string_fetch_byte (pstr);
882 #ifdef RE_ENABLE_I18N
883 if (pstr->offsets_needed)
885 Idx off;
886 int ch;
888 /* For tr_TR.UTF-8 [[:islower:]] there is
889 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890 in that case the whole multi-byte character and return
891 the original letter. On the other side, with
892 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 anything else would complicate things too much. */
895 if (!re_string_first_byte (pstr, pstr->cur_idx))
896 return re_string_fetch_byte (pstr);
898 off = pstr->offsets[pstr->cur_idx];
899 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
901 if (! isascii (ch))
902 return re_string_fetch_byte (pstr);
904 re_string_skip_bytes (pstr,
905 re_string_char_size_at (pstr, pstr->cur_idx));
906 return ch;
908 #endif
910 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
913 static void
914 internal_function
915 re_string_destruct (re_string_t *pstr)
917 #ifdef RE_ENABLE_I18N
918 re_free (pstr->wcs);
919 re_free (pstr->offsets);
920 #endif /* RE_ENABLE_I18N */
921 if (pstr->mbs_allocated)
922 re_free (pstr->mbs);
925 /* Return the context at IDX in INPUT. */
927 static unsigned int
928 internal_function
929 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
931 int c;
932 if (BE (! REG_VALID_INDEX (idx), 0))
933 /* In this case, we use the value stored in input->tip_context,
934 since we can't know the character in input->mbs[-1] here. */
935 return input->tip_context;
936 if (BE (idx == input->len, 0))
937 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939 #ifdef RE_ENABLE_I18N
940 if (input->mb_cur_max > 1)
942 wint_t wc;
943 Idx wc_idx = idx;
944 while(input->wcs[wc_idx] == WEOF)
946 #ifdef DEBUG
947 /* It must not happen. */
948 assert (REG_VALID_INDEX (wc_idx));
949 #endif
950 --wc_idx;
951 if (! REG_VALID_INDEX (wc_idx))
952 return input->tip_context;
954 wc = input->wcs[wc_idx];
955 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956 return CONTEXT_WORD;
957 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958 ? CONTEXT_NEWLINE : 0);
960 else
961 #endif
963 c = re_string_byte_at (input, idx);
964 if (bitset_contain (input->word_char, c))
965 return CONTEXT_WORD;
966 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
970 /* Functions for set operation. */
972 static reg_errcode_t
973 internal_function __attribute_warn_unused_result__
974 re_node_set_alloc (re_node_set *set, Idx size)
976 set->alloc = size;
977 set->nelem = 0;
978 set->elems = re_malloc (Idx, size);
979 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
980 return REG_ESPACE;
981 return REG_NOERROR;
984 static reg_errcode_t
985 internal_function __attribute_warn_unused_result__
986 re_node_set_init_1 (re_node_set *set, Idx elem)
988 set->alloc = 1;
989 set->nelem = 1;
990 set->elems = re_malloc (Idx, 1);
991 if (BE (set->elems == NULL, 0))
993 set->alloc = set->nelem = 0;
994 return REG_ESPACE;
996 set->elems[0] = elem;
997 return REG_NOERROR;
1000 static reg_errcode_t
1001 internal_function __attribute_warn_unused_result__
1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1004 set->alloc = 2;
1005 set->elems = re_malloc (Idx, 2);
1006 if (BE (set->elems == NULL, 0))
1007 return REG_ESPACE;
1008 if (elem1 == elem2)
1010 set->nelem = 1;
1011 set->elems[0] = elem1;
1013 else
1015 set->nelem = 2;
1016 if (elem1 < elem2)
1018 set->elems[0] = elem1;
1019 set->elems[1] = elem2;
1021 else
1023 set->elems[0] = elem2;
1024 set->elems[1] = elem1;
1027 return REG_NOERROR;
1030 static reg_errcode_t
1031 internal_function __attribute_warn_unused_result__
1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1034 dest->nelem = src->nelem;
1035 if (src->nelem > 0)
1037 dest->alloc = dest->nelem;
1038 dest->elems = re_malloc (Idx, dest->alloc);
1039 if (BE (dest->elems == NULL, 0))
1041 dest->alloc = dest->nelem = 0;
1042 return REG_ESPACE;
1044 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1046 else
1047 re_node_set_init_empty (dest);
1048 return REG_NOERROR;
1051 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1055 static reg_errcode_t
1056 internal_function __attribute_warn_unused_result__
1057 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058 const re_node_set *src2)
1060 Idx i1, i2, is, id, delta, sbase;
1061 if (src1->nelem == 0 || src2->nelem == 0)
1062 return REG_NOERROR;
1064 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065 conservative estimate. */
1066 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1068 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070 if (BE (new_elems == NULL, 0))
1071 return REG_ESPACE;
1072 dest->elems = new_elems;
1073 dest->alloc = new_alloc;
1076 /* Find the items in the intersection of SRC1 and SRC2, and copy
1077 into the top of DEST those that are not already in DEST itself. */
1078 sbase = dest->nelem + src1->nelem + src2->nelem;
1079 i1 = src1->nelem - 1;
1080 i2 = src2->nelem - 1;
1081 id = dest->nelem - 1;
1082 for (;;)
1084 if (src1->elems[i1] == src2->elems[i2])
1086 /* Try to find the item in DEST. Maybe we could binary search? */
1087 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1088 --id;
1090 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1091 dest->elems[--sbase] = src1->elems[i1];
1093 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1094 break;
1097 /* Lower the highest of the two items. */
1098 else if (src1->elems[i1] < src2->elems[i2])
1100 if (! REG_VALID_INDEX (--i2))
1101 break;
1103 else
1105 if (! REG_VALID_INDEX (--i1))
1106 break;
1110 id = dest->nelem - 1;
1111 is = dest->nelem + src1->nelem + src2->nelem - 1;
1112 delta = is - sbase + 1;
1114 /* Now copy. When DELTA becomes zero, the remaining
1115 DEST elements are already in place; this is more or
1116 less the same loop that is in re_node_set_merge. */
1117 dest->nelem += delta;
1118 if (delta > 0 && REG_VALID_INDEX (id))
1119 for (;;)
1121 if (dest->elems[is] > dest->elems[id])
1123 /* Copy from the top. */
1124 dest->elems[id + delta--] = dest->elems[is--];
1125 if (delta == 0)
1126 break;
1128 else
1130 /* Slide from the bottom. */
1131 dest->elems[id + delta] = dest->elems[id];
1132 if (! REG_VALID_INDEX (--id))
1133 break;
1137 /* Copy remaining SRC elements. */
1138 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1140 return REG_NOERROR;
1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1146 static reg_errcode_t
1147 internal_function __attribute_warn_unused_result__
1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149 const re_node_set *src2)
1151 Idx i1, i2, id;
1152 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1154 dest->alloc = src1->nelem + src2->nelem;
1155 dest->elems = re_malloc (Idx, dest->alloc);
1156 if (BE (dest->elems == NULL, 0))
1157 return REG_ESPACE;
1159 else
1161 if (src1 != NULL && src1->nelem > 0)
1162 return re_node_set_init_copy (dest, src1);
1163 else if (src2 != NULL && src2->nelem > 0)
1164 return re_node_set_init_copy (dest, src2);
1165 else
1166 re_node_set_init_empty (dest);
1167 return REG_NOERROR;
1169 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1171 if (src1->elems[i1] > src2->elems[i2])
1173 dest->elems[id++] = src2->elems[i2++];
1174 continue;
1176 if (src1->elems[i1] == src2->elems[i2])
1177 ++i2;
1178 dest->elems[id++] = src1->elems[i1++];
1180 if (i1 < src1->nelem)
1182 memcpy (dest->elems + id, src1->elems + i1,
1183 (src1->nelem - i1) * sizeof (Idx));
1184 id += src1->nelem - i1;
1186 else if (i2 < src2->nelem)
1188 memcpy (dest->elems + id, src2->elems + i2,
1189 (src2->nelem - i2) * sizeof (Idx));
1190 id += src2->nelem - i2;
1192 dest->nelem = id;
1193 return REG_NOERROR;
1196 /* Calculate the union set of the sets DEST and SRC. And store it to
1197 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1199 static reg_errcode_t
1200 internal_function __attribute_warn_unused_result__
1201 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1203 Idx is, id, sbase, delta;
1204 if (src == NULL || src->nelem == 0)
1205 return REG_NOERROR;
1206 if (dest->alloc < 2 * src->nelem + dest->nelem)
1208 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210 if (BE (new_buffer == NULL, 0))
1211 return REG_ESPACE;
1212 dest->elems = new_buffer;
1213 dest->alloc = new_alloc;
1216 if (BE (dest->nelem == 0, 0))
1218 dest->nelem = src->nelem;
1219 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220 return REG_NOERROR;
1223 /* Copy into the top of DEST the items of SRC that are not
1224 found in DEST. Maybe we could binary search in DEST? */
1225 for (sbase = dest->nelem + 2 * src->nelem,
1226 is = src->nelem - 1, id = dest->nelem - 1;
1227 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1229 if (dest->elems[id] == src->elems[is])
1230 is--, id--;
1231 else if (dest->elems[id] < src->elems[is])
1232 dest->elems[--sbase] = src->elems[is--];
1233 else /* if (dest->elems[id] > src->elems[is]) */
1234 --id;
1237 if (REG_VALID_INDEX (is))
1239 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1240 sbase -= is + 1;
1241 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1244 id = dest->nelem - 1;
1245 is = dest->nelem + 2 * src->nelem - 1;
1246 delta = is - sbase + 1;
1247 if (delta == 0)
1248 return REG_NOERROR;
1250 /* Now copy. When DELTA becomes zero, the remaining
1251 DEST elements are already in place. */
1252 dest->nelem += delta;
1253 for (;;)
1255 if (dest->elems[is] > dest->elems[id])
1257 /* Copy from the top. */
1258 dest->elems[id + delta--] = dest->elems[is--];
1259 if (delta == 0)
1260 break;
1262 else
1264 /* Slide from the bottom. */
1265 dest->elems[id + delta] = dest->elems[id];
1266 if (! REG_VALID_INDEX (--id))
1268 /* Copy remaining SRC elements. */
1269 memcpy (dest->elems, dest->elems + sbase,
1270 delta * sizeof (Idx));
1271 break;
1276 return REG_NOERROR;
1279 /* Insert the new element ELEM to the re_node_set* SET.
1280 SET should not already have ELEM.
1281 Return true if successful. */
1283 static bool
1284 internal_function __attribute_warn_unused_result__
1285 re_node_set_insert (re_node_set *set, Idx elem)
1287 Idx idx;
1288 /* In case the set is empty. */
1289 if (set->alloc == 0)
1290 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1292 if (BE (set->nelem, 0) == 0)
1294 /* We already guaranteed above that set->alloc != 0. */
1295 set->elems[0] = elem;
1296 ++set->nelem;
1297 return true;
1300 /* Realloc if we need. */
1301 if (set->alloc == set->nelem)
1303 Idx *new_elems;
1304 set->alloc = set->alloc * 2;
1305 new_elems = re_realloc (set->elems, Idx, set->alloc);
1306 if (BE (new_elems == NULL, 0))
1307 return false;
1308 set->elems = new_elems;
1311 /* Move the elements which follows the new element. Test the
1312 first element separately to skip a check in the inner loop. */
1313 if (elem < set->elems[0])
1315 idx = 0;
1316 for (idx = set->nelem; idx > 0; idx--)
1317 set->elems[idx] = set->elems[idx - 1];
1319 else
1321 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1322 set->elems[idx] = set->elems[idx - 1];
1325 /* Insert the new element. */
1326 set->elems[idx] = elem;
1327 ++set->nelem;
1328 return true;
1331 /* Insert the new element ELEM to the re_node_set* SET.
1332 SET should not already have any element greater than or equal to ELEM.
1333 Return true if successful. */
1335 static bool
1336 internal_function __attribute_warn_unused_result__
1337 re_node_set_insert_last (re_node_set *set, Idx elem)
1339 /* Realloc if we need. */
1340 if (set->alloc == set->nelem)
1342 Idx *new_elems;
1343 set->alloc = (set->alloc + 1) * 2;
1344 new_elems = re_realloc (set->elems, Idx, set->alloc);
1345 if (BE (new_elems == NULL, 0))
1346 return false;
1347 set->elems = new_elems;
1350 /* Insert the new element. */
1351 set->elems[set->nelem++] = elem;
1352 return true;
1355 /* Compare two node sets SET1 and SET2.
1356 Return true if SET1 and SET2 are equivalent. */
1358 static bool
1359 internal_function __attribute__ ((pure))
1360 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1362 Idx i;
1363 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1364 return false;
1365 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1366 if (set1->elems[i] != set2->elems[i])
1367 return false;
1368 return true;
1371 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1373 static Idx
1374 internal_function __attribute__ ((pure))
1375 re_node_set_contains (const re_node_set *set, Idx elem)
1377 __re_size_t idx, right, mid;
1378 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1379 return 0;
1381 /* Binary search the element. */
1382 idx = 0;
1383 right = set->nelem - 1;
1384 while (idx < right)
1386 mid = (idx + right) / 2;
1387 if (set->elems[mid] < elem)
1388 idx = mid + 1;
1389 else
1390 right = mid;
1392 return set->elems[idx] == elem ? idx + 1 : 0;
1395 static void
1396 internal_function
1397 re_node_set_remove_at (re_node_set *set, Idx idx)
1399 verify (! TYPE_SIGNED (Idx));
1400 /* if (idx < 0)
1401 return; */
1402 if (idx >= set->nelem)
1403 return;
1404 --set->nelem;
1405 for (; idx < set->nelem; idx++)
1406 set->elems[idx] = set->elems[idx + 1];
1410 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1411 Or return REG_MISSING if an error occurred. */
1413 static Idx
1414 internal_function
1415 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1417 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1419 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420 Idx *new_nexts, *new_indices;
1421 re_node_set *new_edests, *new_eclosures;
1422 re_token_t *new_nodes;
1424 /* Avoid overflows in realloc. */
1425 const size_t max_object_size = MAX (sizeof (re_token_t),
1426 MAX (sizeof (re_node_set),
1427 sizeof (Idx)));
1428 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1429 return REG_MISSING;
1431 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1432 if (BE (new_nodes == NULL, 0))
1433 return REG_MISSING;
1434 dfa->nodes = new_nodes;
1435 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1436 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1437 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1438 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1439 if (BE (new_nexts == NULL || new_indices == NULL
1440 || new_edests == NULL || new_eclosures == NULL, 0))
1441 return REG_MISSING;
1442 dfa->nexts = new_nexts;
1443 dfa->org_indices = new_indices;
1444 dfa->edests = new_edests;
1445 dfa->eclosures = new_eclosures;
1446 dfa->nodes_alloc = new_nodes_alloc;
1448 dfa->nodes[dfa->nodes_len] = token;
1449 dfa->nodes[dfa->nodes_len].constraint = 0;
1450 #ifdef RE_ENABLE_I18N
1451 dfa->nodes[dfa->nodes_len].accept_mb =
1452 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1453 || token.type == COMPLEX_BRACKET);
1454 #endif
1455 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1456 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1457 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1458 return dfa->nodes_len++;
1461 static re_hashval_t
1462 internal_function
1463 calc_state_hash (const re_node_set *nodes, unsigned int context)
1465 re_hashval_t hash = nodes->nelem + context;
1466 Idx i;
1467 for (i = 0 ; i < nodes->nelem ; i++)
1468 hash += nodes->elems[i];
1469 return hash;
1472 /* Search for the state whose node_set is equivalent to NODES.
1473 Return the pointer to the state, if we found it in the DFA.
1474 Otherwise create the new one and return it. In case of an error
1475 return NULL and set the error code in ERR.
1476 Note: - We assume NULL as the invalid state, then it is possible that
1477 return value is NULL and ERR is REG_NOERROR.
1478 - We never return non-NULL value in case of any errors, it is for
1479 optimization. */
1481 static re_dfastate_t *
1482 internal_function __attribute_warn_unused_result__
1483 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1484 const re_node_set *nodes)
1486 re_hashval_t hash;
1487 re_dfastate_t *new_state;
1488 struct re_state_table_entry *spot;
1489 Idx i;
1490 #ifdef lint
1491 /* Suppress bogus uninitialized-variable warnings. */
1492 *err = REG_NOERROR;
1493 #endif
1494 if (BE (nodes->nelem == 0, 0))
1496 *err = REG_NOERROR;
1497 return NULL;
1499 hash = calc_state_hash (nodes, 0);
1500 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502 for (i = 0 ; i < spot->num ; i++)
1504 re_dfastate_t *state = spot->array[i];
1505 if (hash != state->hash)
1506 continue;
1507 if (re_node_set_compare (&state->nodes, nodes))
1508 return state;
1511 /* There are no appropriate state in the dfa, create the new one. */
1512 new_state = create_ci_newstate (dfa, nodes, hash);
1513 if (BE (new_state == NULL, 0))
1514 *err = REG_ESPACE;
1516 return new_state;
1519 /* Search for the state whose node_set is equivalent to NODES and
1520 whose context is equivalent to CONTEXT.
1521 Return the pointer to the state, if we found it in the DFA.
1522 Otherwise create the new one and return it. In case of an error
1523 return NULL and set the error code in ERR.
1524 Note: - We assume NULL as the invalid state, then it is possible that
1525 return value is NULL and ERR is REG_NOERROR.
1526 - We never return non-NULL value in case of any errors, it is for
1527 optimization. */
1529 static re_dfastate_t *
1530 internal_function __attribute_warn_unused_result__
1531 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1532 const re_node_set *nodes, unsigned int context)
1534 re_hashval_t hash;
1535 re_dfastate_t *new_state;
1536 struct re_state_table_entry *spot;
1537 Idx i;
1538 #ifdef lint
1539 /* Suppress bogus uninitialized-variable warnings. */
1540 *err = REG_NOERROR;
1541 #endif
1542 if (nodes->nelem == 0)
1544 *err = REG_NOERROR;
1545 return NULL;
1547 hash = calc_state_hash (nodes, context);
1548 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1550 for (i = 0 ; i < spot->num ; i++)
1552 re_dfastate_t *state = spot->array[i];
1553 if (state->hash == hash
1554 && state->context == context
1555 && re_node_set_compare (state->entrance_nodes, nodes))
1556 return state;
1558 /* There are no appropriate state in 'dfa', create the new one. */
1559 new_state = create_cd_newstate (dfa, nodes, context, hash);
1560 if (BE (new_state == NULL, 0))
1561 *err = REG_ESPACE;
1563 return new_state;
1566 /* Finish initialization of the new state NEWSTATE, and using its hash value
1567 HASH put in the appropriate bucket of DFA's state table. Return value
1568 indicates the error code if failed. */
1570 static reg_errcode_t
1571 __attribute_warn_unused_result__
1572 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1573 re_hashval_t hash)
1575 struct re_state_table_entry *spot;
1576 reg_errcode_t err;
1577 Idx i;
1579 newstate->hash = hash;
1580 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581 if (BE (err != REG_NOERROR, 0))
1582 return REG_ESPACE;
1583 for (i = 0; i < newstate->nodes.nelem; i++)
1585 Idx elem = newstate->nodes.elems[i];
1586 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1587 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1588 return REG_ESPACE;
1591 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592 if (BE (spot->alloc <= spot->num, 0))
1594 Idx new_alloc = 2 * spot->num + 2;
1595 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1596 new_alloc);
1597 if (BE (new_array == NULL, 0))
1598 return REG_ESPACE;
1599 spot->array = new_array;
1600 spot->alloc = new_alloc;
1602 spot->array[spot->num++] = newstate;
1603 return REG_NOERROR;
1606 static void
1607 free_state (re_dfastate_t *state)
1609 re_node_set_free (&state->non_eps_nodes);
1610 re_node_set_free (&state->inveclosure);
1611 if (state->entrance_nodes != &state->nodes)
1613 re_node_set_free (state->entrance_nodes);
1614 re_free (state->entrance_nodes);
1616 re_node_set_free (&state->nodes);
1617 re_free (state->word_trtable);
1618 re_free (state->trtable);
1619 re_free (state);
1622 /* Create the new state which is independent of contexts.
1623 Return the new state if succeeded, otherwise return NULL. */
1625 static re_dfastate_t *
1626 internal_function __attribute_warn_unused_result__
1627 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1628 re_hashval_t hash)
1630 Idx i;
1631 reg_errcode_t err;
1632 re_dfastate_t *newstate;
1634 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635 if (BE (newstate == NULL, 0))
1636 return NULL;
1637 err = re_node_set_init_copy (&newstate->nodes, nodes);
1638 if (BE (err != REG_NOERROR, 0))
1640 re_free (newstate);
1641 return NULL;
1644 newstate->entrance_nodes = &newstate->nodes;
1645 for (i = 0 ; i < nodes->nelem ; i++)
1647 re_token_t *node = dfa->nodes + nodes->elems[i];
1648 re_token_type_t type = node->type;
1649 if (type == CHARACTER && !node->constraint)
1650 continue;
1651 #ifdef RE_ENABLE_I18N
1652 newstate->accept_mb |= node->accept_mb;
1653 #endif /* RE_ENABLE_I18N */
1655 /* If the state has the halt node, the state is a halt state. */
1656 if (type == END_OF_RE)
1657 newstate->halt = 1;
1658 else if (type == OP_BACK_REF)
1659 newstate->has_backref = 1;
1660 else if (type == ANCHOR || node->constraint)
1661 newstate->has_constraint = 1;
1663 err = register_state (dfa, newstate, hash);
1664 if (BE (err != REG_NOERROR, 0))
1666 free_state (newstate);
1667 newstate = NULL;
1669 return newstate;
1672 /* Create the new state which is depend on the context CONTEXT.
1673 Return the new state if succeeded, otherwise return NULL. */
1675 static re_dfastate_t *
1676 internal_function __attribute_warn_unused_result__
1677 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1678 unsigned int context, re_hashval_t hash)
1680 Idx i, nctx_nodes = 0;
1681 reg_errcode_t err;
1682 re_dfastate_t *newstate;
1684 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685 if (BE (newstate == NULL, 0))
1686 return NULL;
1687 err = re_node_set_init_copy (&newstate->nodes, nodes);
1688 if (BE (err != REG_NOERROR, 0))
1690 re_free (newstate);
1691 return NULL;
1694 newstate->context = context;
1695 newstate->entrance_nodes = &newstate->nodes;
1697 for (i = 0 ; i < nodes->nelem ; i++)
1699 re_token_t *node = dfa->nodes + nodes->elems[i];
1700 re_token_type_t type = node->type;
1701 unsigned int constraint = node->constraint;
1703 if (type == CHARACTER && !constraint)
1704 continue;
1705 #ifdef RE_ENABLE_I18N
1706 newstate->accept_mb |= node->accept_mb;
1707 #endif /* RE_ENABLE_I18N */
1709 /* If the state has the halt node, the state is a halt state. */
1710 if (type == END_OF_RE)
1711 newstate->halt = 1;
1712 else if (type == OP_BACK_REF)
1713 newstate->has_backref = 1;
1715 if (constraint)
1717 if (newstate->entrance_nodes == &newstate->nodes)
1719 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720 if (BE (newstate->entrance_nodes == NULL, 0))
1722 free_state (newstate);
1723 return NULL;
1725 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1726 != REG_NOERROR)
1727 return NULL;
1728 nctx_nodes = 0;
1729 newstate->has_constraint = 1;
1732 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1734 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735 ++nctx_nodes;
1739 err = register_state (dfa, newstate, hash);
1740 if (BE (err != REG_NOERROR, 0))
1742 free_state (newstate);
1743 newstate = NULL;
1745 return newstate;