regex_internal.c: remove useless variable and the code to set it.
[glibc.git] / posix / regex_internal.c
blob95f2a0e40547e3279737cc9f3e24be4ca73b5f4b
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43 reg_errcode_t ret;
44 int init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
88 while (1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
106 else
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
124 return REG_NOERROR;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs;
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
149 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
169 static void
170 internal_function
171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, int icase,
173 const re_dfa_t *dfa)
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase ? 1 : 0;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
205 #ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209 unsigned char buf[64];
210 #endif
211 mbstate_t prev_st;
212 int byte_idx, end_idx, remain_len;
213 size_t mbclen;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220 wchar_t wc;
221 const char *p;
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
228 int i, ch;
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 p = (const char *) buf;
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -2, 0))
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr->cur_state = prev_st;
244 break;
246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
248 /* We treat these cases as a singlebyte character. */
249 mbclen = 1;
250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251 if (BE (pstr->trans != NULL, 0))
252 wc = pstr->trans[wc];
253 pstr->cur_state = prev_st;
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
271 build_wcs_upper_buffer (re_string_t *pstr)
273 mbstate_t prev_st;
274 int src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276 #ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280 char buf[64];
281 #endif
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
290 while (byte_idx < end_idx)
292 wchar_t wc;
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = __mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen + 2 > 2, 1))
314 wchar_t wcu = wc;
315 if (iswlower (wc))
317 size_t mbcdlen;
319 wcu = towupper (wc);
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 else
325 src_idx = byte_idx;
326 goto offsets_needed;
329 else
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
337 else if (mbclen == (size_t) -1 || mbclen == 0)
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
347 else
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
351 break;
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
356 return REG_NOERROR;
358 else
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361 wchar_t wc;
362 const char *p;
363 offsets_needed:
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
368 int i, ch;
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
375 p = (const char *) buf;
377 else
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen + 2 > 2, 1))
382 wchar_t wcu = wc;
383 if (iswlower (wc))
385 size_t mbcdlen;
387 wcu = towupper (wc);
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
393 size_t i;
395 if (byte_idx + mbcdlen > pstr->bufs_len)
397 pstr->cur_state = prev_st;
398 break;
401 if (pstr->offsets == NULL)
403 pstr->offsets = re_malloc (int, pstr->bufs_len);
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
408 if (!pstr->offsets_needed)
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 if (BE (pstr->offsets_needed != 0, 0))
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
445 src_idx += mbclen;
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
452 else if (mbclen == (size_t) -1 || mbclen == 0)
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 if (BE (pstr->trans != NULL, 0))
458 ch = pstr->trans [ch];
459 pstr->mbs[byte_idx] = ch;
461 if (BE (pstr->offsets_needed != 0, 0))
462 pstr->offsets[byte_idx] = src_idx;
463 ++src_idx;
465 /* And also cast it to wide char. */
466 pstr->wcs[byte_idx++] = (wchar_t) ch;
467 if (BE (mbclen == (size_t) -1, 0))
468 pstr->cur_state = prev_st;
470 else
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr->cur_state = prev_st;
474 break;
477 pstr->valid_len = byte_idx;
478 pstr->valid_raw_len = src_idx;
479 return REG_NOERROR;
482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
483 Return the index. */
485 static int
486 internal_function
487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
489 mbstate_t prev_st;
490 int rawbuf_idx;
491 size_t mbclen;
492 wchar_t wc = WEOF;
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
498 int remain_len;
499 remain_len = pstr->len - rawbuf_idx;
500 prev_st = pstr->cur_state;
501 mbclen = __mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
502 remain_len, &pstr->cur_state);
503 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen == 0 || remain_len == 0)
507 wc = L'\0';
508 else
509 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
510 mbclen = 1;
511 pstr->cur_state = prev_st;
513 /* Then proceed the next character. */
514 rawbuf_idx += mbclen;
516 *last_wc = (wint_t) wc;
517 return rawbuf_idx;
519 #endif /* RE_ENABLE_I18N */
521 /* Build the buffer PSTR->MBS, and apply the translation if we need.
522 This function is used in case of REG_ICASE. */
524 static void
525 internal_function
526 build_upper_buffer (re_string_t *pstr)
528 int char_idx, end_idx;
529 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
531 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
533 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
534 if (BE (pstr->trans != NULL, 0))
535 ch = pstr->trans[ch];
536 if (islower (ch))
537 pstr->mbs[char_idx] = toupper (ch);
538 else
539 pstr->mbs[char_idx] = ch;
541 pstr->valid_len = char_idx;
542 pstr->valid_raw_len = char_idx;
545 /* Apply TRANS to the buffer in PSTR. */
547 static void
548 internal_function
549 re_string_translate_buffer (re_string_t *pstr)
551 int buf_idx, end_idx;
552 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
554 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
556 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557 pstr->mbs[buf_idx] = pstr->trans[ch];
560 pstr->valid_len = buf_idx;
561 pstr->valid_raw_len = buf_idx;
564 /* This function re-construct the buffers.
565 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566 convert to upper case in case of REG_ICASE, apply translation. */
568 static reg_errcode_t
569 internal_function __attribute_warn_unused_result__
570 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
572 int offset = idx - pstr->raw_mbs_idx;
573 if (BE (offset < 0, 0))
575 /* Reset buffer. */
576 #ifdef RE_ENABLE_I18N
577 if (pstr->mb_cur_max > 1)
578 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
579 #endif /* RE_ENABLE_I18N */
580 pstr->len = pstr->raw_len;
581 pstr->stop = pstr->raw_stop;
582 pstr->valid_len = 0;
583 pstr->raw_mbs_idx = 0;
584 pstr->valid_raw_len = 0;
585 pstr->offsets_needed = 0;
586 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
587 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
588 if (!pstr->mbs_allocated)
589 pstr->mbs = (unsigned char *) pstr->raw_mbs;
590 offset = idx;
593 if (BE (offset != 0, 1))
595 /* Should the already checked characters be kept? */
596 if (BE (offset < pstr->valid_raw_len, 1))
598 /* Yes, move them to the front of the buffer. */
599 #ifdef RE_ENABLE_I18N
600 if (BE (pstr->offsets_needed, 0))
602 int low = 0, high = pstr->valid_len, mid;
605 mid = (high + low) / 2;
606 if (pstr->offsets[mid] > offset)
607 high = mid;
608 else if (pstr->offsets[mid] < offset)
609 low = mid + 1;
610 else
611 break;
613 while (low < high);
614 if (pstr->offsets[mid] < offset)
615 ++mid;
616 pstr->tip_context = re_string_context_at (pstr, mid - 1,
617 eflags);
618 /* This can be quite complicated, so handle specially
619 only the common and easy case where the character with
620 different length representation of lower and upper
621 case is present at or after offset. */
622 if (pstr->valid_len > offset
623 && mid == offset && pstr->offsets[mid] == offset)
625 memmove (pstr->wcs, pstr->wcs + offset,
626 (pstr->valid_len - offset) * sizeof (wint_t));
627 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
628 pstr->valid_len -= offset;
629 pstr->valid_raw_len -= offset;
630 for (low = 0; low < pstr->valid_len; low++)
631 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
633 else
635 /* Otherwise, just find out how long the partial multibyte
636 character at offset is and fill it with WEOF/255. */
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
640 while (mid > 0 && pstr->offsets[mid - 1] == offset)
641 --mid;
642 while (mid < pstr->valid_len)
643 if (pstr->wcs[mid] != WEOF)
644 break;
645 else
646 ++mid;
647 if (mid == pstr->valid_len)
648 pstr->valid_len = 0;
649 else
651 pstr->valid_len = pstr->offsets[mid] - offset;
652 if (pstr->valid_len)
654 for (low = 0; low < pstr->valid_len; ++low)
655 pstr->wcs[low] = WEOF;
656 memset (pstr->mbs, 255, pstr->valid_len);
659 pstr->valid_raw_len = pstr->valid_len;
662 else
663 #endif
665 pstr->tip_context = re_string_context_at (pstr, offset - 1,
666 eflags);
667 #ifdef RE_ENABLE_I18N
668 if (pstr->mb_cur_max > 1)
669 memmove (pstr->wcs, pstr->wcs + offset,
670 (pstr->valid_len - offset) * sizeof (wint_t));
671 #endif /* RE_ENABLE_I18N */
672 if (BE (pstr->mbs_allocated, 0))
673 memmove (pstr->mbs, pstr->mbs + offset,
674 pstr->valid_len - offset);
675 pstr->valid_len -= offset;
676 pstr->valid_raw_len -= offset;
677 #if DEBUG
678 assert (pstr->valid_len > 0);
679 #endif
682 else
684 /* No, skip all characters until IDX. */
685 int prev_valid_len = pstr->valid_len;
687 #ifdef RE_ENABLE_I18N
688 if (BE (pstr->offsets_needed, 0))
690 pstr->len = pstr->raw_len - idx + offset;
691 pstr->stop = pstr->raw_stop - idx + offset;
692 pstr->offsets_needed = 0;
694 #endif
695 pstr->valid_len = 0;
696 #ifdef RE_ENABLE_I18N
697 if (pstr->mb_cur_max > 1)
699 int wcs_idx;
700 wint_t wc = WEOF;
702 if (pstr->is_utf8)
704 const unsigned char *raw, *p, *end;
706 /* Special case UTF-8. Multi-byte chars start with any
707 byte other than 0x80 - 0xbf. */
708 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
709 end = raw + (offset - pstr->mb_cur_max);
710 if (end < pstr->raw_mbs)
711 end = pstr->raw_mbs;
712 p = raw + offset - 1;
713 #ifdef _LIBC
714 /* We know the wchar_t encoding is UCS4, so for the simple
715 case, ASCII characters, skip the conversion step. */
716 if (isascii (*p) && BE (pstr->trans == NULL, 1))
718 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
719 /* pstr->valid_len = 0; */
720 wc = (wchar_t) *p;
722 else
723 #endif
724 for (; p >= end; --p)
725 if ((*p & 0xc0) != 0x80)
727 mbstate_t cur_state;
728 wchar_t wc2;
729 int mlen = raw + pstr->len - p;
730 unsigned char buf[6];
731 size_t mbclen;
733 if (BE (pstr->trans != NULL, 0))
735 int i = mlen < 6 ? mlen : 6;
736 while (--i >= 0)
737 buf[i] = pstr->trans[p[i]];
739 /* XXX Don't use mbrtowc, we know which conversion
740 to use (UTF-8 -> UCS4). */
741 memset (&cur_state, 0, sizeof (cur_state));
742 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
743 &cur_state);
744 if (raw + offset - p <= mbclen
745 && mbclen < (size_t) -2)
747 memset (&pstr->cur_state, '\0',
748 sizeof (mbstate_t));
749 pstr->valid_len = mbclen - (raw + offset - p);
750 wc = wc2;
752 break;
756 if (wc == WEOF)
757 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
758 if (wc == WEOF)
759 pstr->tip_context
760 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
761 else
762 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
763 && IS_WIDE_WORD_CHAR (wc))
764 ? CONTEXT_WORD
765 : ((IS_WIDE_NEWLINE (wc)
766 && pstr->newline_anchor)
767 ? CONTEXT_NEWLINE : 0));
768 if (BE (pstr->valid_len, 0))
770 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
771 pstr->wcs[wcs_idx] = WEOF;
772 if (pstr->mbs_allocated)
773 memset (pstr->mbs, 255, pstr->valid_len);
775 pstr->valid_raw_len = pstr->valid_len;
777 else
778 #endif /* RE_ENABLE_I18N */
780 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
781 pstr->valid_raw_len = 0;
782 if (pstr->trans)
783 c = pstr->trans[c];
784 pstr->tip_context = (bitset_contain (pstr->word_char, c)
785 ? CONTEXT_WORD
786 : ((IS_NEWLINE (c) && pstr->newline_anchor)
787 ? CONTEXT_NEWLINE : 0));
790 if (!BE (pstr->mbs_allocated, 0))
791 pstr->mbs += offset;
793 pstr->raw_mbs_idx = idx;
794 pstr->len -= offset;
795 pstr->stop -= offset;
797 /* Then build the buffers. */
798 #ifdef RE_ENABLE_I18N
799 if (pstr->mb_cur_max > 1)
801 if (pstr->icase)
803 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
804 if (BE (ret != REG_NOERROR, 0))
805 return ret;
807 else
808 build_wcs_buffer (pstr);
810 else
811 #endif /* RE_ENABLE_I18N */
812 if (BE (pstr->mbs_allocated, 0))
814 if (pstr->icase)
815 build_upper_buffer (pstr);
816 else if (pstr->trans != NULL)
817 re_string_translate_buffer (pstr);
819 else
820 pstr->valid_len = pstr->len;
822 pstr->cur_idx = 0;
823 return REG_NOERROR;
826 static unsigned char
827 internal_function __attribute ((pure))
828 re_string_peek_byte_case (const re_string_t *pstr, int idx)
830 int ch, off;
832 /* Handle the common (easiest) cases first. */
833 if (BE (!pstr->mbs_allocated, 1))
834 return re_string_peek_byte (pstr, idx);
836 #ifdef RE_ENABLE_I18N
837 if (pstr->mb_cur_max > 1
838 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
839 return re_string_peek_byte (pstr, idx);
840 #endif
842 off = pstr->cur_idx + idx;
843 #ifdef RE_ENABLE_I18N
844 if (pstr->offsets_needed)
845 off = pstr->offsets[off];
846 #endif
848 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
850 #ifdef RE_ENABLE_I18N
851 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
852 this function returns CAPITAL LETTER I instead of first byte of
853 DOTLESS SMALL LETTER I. The latter would confuse the parser,
854 since peek_byte_case doesn't advance cur_idx in any way. */
855 if (pstr->offsets_needed && !isascii (ch))
856 return re_string_peek_byte (pstr, idx);
857 #endif
859 return ch;
862 static unsigned char
863 internal_function __attribute ((pure))
864 re_string_fetch_byte_case (re_string_t *pstr)
866 if (BE (!pstr->mbs_allocated, 1))
867 return re_string_fetch_byte (pstr);
869 #ifdef RE_ENABLE_I18N
870 if (pstr->offsets_needed)
872 int off, ch;
874 /* For tr_TR.UTF-8 [[:islower:]] there is
875 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
876 in that case the whole multi-byte character and return
877 the original letter. On the other side, with
878 [[: DOTLESS SMALL LETTER I return [[:I, as doing
879 anything else would complicate things too much. */
881 if (!re_string_first_byte (pstr, pstr->cur_idx))
882 return re_string_fetch_byte (pstr);
884 off = pstr->offsets[pstr->cur_idx];
885 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
887 if (! isascii (ch))
888 return re_string_fetch_byte (pstr);
890 re_string_skip_bytes (pstr,
891 re_string_char_size_at (pstr, pstr->cur_idx));
892 return ch;
894 #endif
896 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
899 static void
900 internal_function
901 re_string_destruct (re_string_t *pstr)
903 #ifdef RE_ENABLE_I18N
904 re_free (pstr->wcs);
905 re_free (pstr->offsets);
906 #endif /* RE_ENABLE_I18N */
907 if (pstr->mbs_allocated)
908 re_free (pstr->mbs);
911 /* Return the context at IDX in INPUT. */
913 static unsigned int
914 internal_function
915 re_string_context_at (const re_string_t *input, int idx, int eflags)
917 int c;
918 if (BE (idx < 0, 0))
919 /* In this case, we use the value stored in input->tip_context,
920 since we can't know the character in input->mbs[-1] here. */
921 return input->tip_context;
922 if (BE (idx == input->len, 0))
923 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
924 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
925 #ifdef RE_ENABLE_I18N
926 if (input->mb_cur_max > 1)
928 wint_t wc;
929 int wc_idx = idx;
930 while(input->wcs[wc_idx] == WEOF)
932 #ifdef DEBUG
933 /* It must not happen. */
934 assert (wc_idx >= 0);
935 #endif
936 --wc_idx;
937 if (wc_idx < 0)
938 return input->tip_context;
940 wc = input->wcs[wc_idx];
941 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
942 return CONTEXT_WORD;
943 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
944 ? CONTEXT_NEWLINE : 0);
946 else
947 #endif
949 c = re_string_byte_at (input, idx);
950 if (bitset_contain (input->word_char, c))
951 return CONTEXT_WORD;
952 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
956 /* Functions for set operation. */
958 static reg_errcode_t
959 internal_function __attribute_warn_unused_result__
960 re_node_set_alloc (re_node_set *set, int size)
962 set->alloc = size;
963 set->nelem = 0;
964 set->elems = re_malloc (int, size);
965 if (BE (set->elems == NULL, 0))
966 return REG_ESPACE;
967 return REG_NOERROR;
970 static reg_errcode_t
971 internal_function __attribute_warn_unused_result__
972 re_node_set_init_1 (re_node_set *set, int elem)
974 set->alloc = 1;
975 set->nelem = 1;
976 set->elems = re_malloc (int, 1);
977 if (BE (set->elems == NULL, 0))
979 set->alloc = set->nelem = 0;
980 return REG_ESPACE;
982 set->elems[0] = elem;
983 return REG_NOERROR;
986 static reg_errcode_t
987 internal_function __attribute_warn_unused_result__
988 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
990 set->alloc = 2;
991 set->elems = re_malloc (int, 2);
992 if (BE (set->elems == NULL, 0))
993 return REG_ESPACE;
994 if (elem1 == elem2)
996 set->nelem = 1;
997 set->elems[0] = elem1;
999 else
1001 set->nelem = 2;
1002 if (elem1 < elem2)
1004 set->elems[0] = elem1;
1005 set->elems[1] = elem2;
1007 else
1009 set->elems[0] = elem2;
1010 set->elems[1] = elem1;
1013 return REG_NOERROR;
1016 static reg_errcode_t
1017 internal_function __attribute_warn_unused_result__
1018 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1020 dest->nelem = src->nelem;
1021 if (src->nelem > 0)
1023 dest->alloc = dest->nelem;
1024 dest->elems = re_malloc (int, dest->alloc);
1025 if (BE (dest->elems == NULL, 0))
1027 dest->alloc = dest->nelem = 0;
1028 return REG_ESPACE;
1030 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1032 else
1033 re_node_set_init_empty (dest);
1034 return REG_NOERROR;
1037 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1038 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1039 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1041 static reg_errcode_t
1042 internal_function __attribute_warn_unused_result__
1043 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1044 const re_node_set *src2)
1046 int i1, i2, is, id, delta, sbase;
1047 if (src1->nelem == 0 || src2->nelem == 0)
1048 return REG_NOERROR;
1050 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1051 conservative estimate. */
1052 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1054 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1055 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1056 if (BE (new_elems == NULL, 0))
1057 return REG_ESPACE;
1058 dest->elems = new_elems;
1059 dest->alloc = new_alloc;
1062 /* Find the items in the intersection of SRC1 and SRC2, and copy
1063 into the top of DEST those that are not already in DEST itself. */
1064 sbase = dest->nelem + src1->nelem + src2->nelem;
1065 i1 = src1->nelem - 1;
1066 i2 = src2->nelem - 1;
1067 id = dest->nelem - 1;
1068 for (;;)
1070 if (src1->elems[i1] == src2->elems[i2])
1072 /* Try to find the item in DEST. Maybe we could binary search? */
1073 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1074 --id;
1076 if (id < 0 || dest->elems[id] != src1->elems[i1])
1077 dest->elems[--sbase] = src1->elems[i1];
1079 if (--i1 < 0 || --i2 < 0)
1080 break;
1083 /* Lower the highest of the two items. */
1084 else if (src1->elems[i1] < src2->elems[i2])
1086 if (--i2 < 0)
1087 break;
1089 else
1091 if (--i1 < 0)
1092 break;
1096 id = dest->nelem - 1;
1097 is = dest->nelem + src1->nelem + src2->nelem - 1;
1098 delta = is - sbase + 1;
1100 /* Now copy. When DELTA becomes zero, the remaining
1101 DEST elements are already in place; this is more or
1102 less the same loop that is in re_node_set_merge. */
1103 dest->nelem += delta;
1104 if (delta > 0 && id >= 0)
1105 for (;;)
1107 if (dest->elems[is] > dest->elems[id])
1109 /* Copy from the top. */
1110 dest->elems[id + delta--] = dest->elems[is--];
1111 if (delta == 0)
1112 break;
1114 else
1116 /* Slide from the bottom. */
1117 dest->elems[id + delta] = dest->elems[id];
1118 if (--id < 0)
1119 break;
1123 /* Copy remaining SRC elements. */
1124 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1126 return REG_NOERROR;
1129 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1130 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1132 static reg_errcode_t
1133 internal_function __attribute_warn_unused_result__
1134 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1135 const re_node_set *src2)
1137 int i1, i2, id;
1138 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1140 dest->alloc = src1->nelem + src2->nelem;
1141 dest->elems = re_malloc (int, dest->alloc);
1142 if (BE (dest->elems == NULL, 0))
1143 return REG_ESPACE;
1145 else
1147 if (src1 != NULL && src1->nelem > 0)
1148 return re_node_set_init_copy (dest, src1);
1149 else if (src2 != NULL && src2->nelem > 0)
1150 return re_node_set_init_copy (dest, src2);
1151 else
1152 re_node_set_init_empty (dest);
1153 return REG_NOERROR;
1155 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1157 if (src1->elems[i1] > src2->elems[i2])
1159 dest->elems[id++] = src2->elems[i2++];
1160 continue;
1162 if (src1->elems[i1] == src2->elems[i2])
1163 ++i2;
1164 dest->elems[id++] = src1->elems[i1++];
1166 if (i1 < src1->nelem)
1168 memcpy (dest->elems + id, src1->elems + i1,
1169 (src1->nelem - i1) * sizeof (int));
1170 id += src1->nelem - i1;
1172 else if (i2 < src2->nelem)
1174 memcpy (dest->elems + id, src2->elems + i2,
1175 (src2->nelem - i2) * sizeof (int));
1176 id += src2->nelem - i2;
1178 dest->nelem = id;
1179 return REG_NOERROR;
1182 /* Calculate the union set of the sets DEST and SRC. And store it to
1183 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1185 static reg_errcode_t
1186 internal_function __attribute_warn_unused_result__
1187 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1189 int is, id, sbase, delta;
1190 if (src == NULL || src->nelem == 0)
1191 return REG_NOERROR;
1192 if (dest->alloc < 2 * src->nelem + dest->nelem)
1194 int new_alloc = 2 * (src->nelem + dest->alloc);
1195 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1196 if (BE (new_buffer == NULL, 0))
1197 return REG_ESPACE;
1198 dest->elems = new_buffer;
1199 dest->alloc = new_alloc;
1202 if (BE (dest->nelem == 0, 0))
1204 dest->nelem = src->nelem;
1205 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1206 return REG_NOERROR;
1209 /* Copy into the top of DEST the items of SRC that are not
1210 found in DEST. Maybe we could binary search in DEST? */
1211 for (sbase = dest->nelem + 2 * src->nelem,
1212 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1214 if (dest->elems[id] == src->elems[is])
1215 is--, id--;
1216 else if (dest->elems[id] < src->elems[is])
1217 dest->elems[--sbase] = src->elems[is--];
1218 else /* if (dest->elems[id] > src->elems[is]) */
1219 --id;
1222 if (is >= 0)
1224 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1225 sbase -= is + 1;
1226 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1229 id = dest->nelem - 1;
1230 is = dest->nelem + 2 * src->nelem - 1;
1231 delta = is - sbase + 1;
1232 if (delta == 0)
1233 return REG_NOERROR;
1235 /* Now copy. When DELTA becomes zero, the remaining
1236 DEST elements are already in place. */
1237 dest->nelem += delta;
1238 for (;;)
1240 if (dest->elems[is] > dest->elems[id])
1242 /* Copy from the top. */
1243 dest->elems[id + delta--] = dest->elems[is--];
1244 if (delta == 0)
1245 break;
1247 else
1249 /* Slide from the bottom. */
1250 dest->elems[id + delta] = dest->elems[id];
1251 if (--id < 0)
1253 /* Copy remaining SRC elements. */
1254 memcpy (dest->elems, dest->elems + sbase,
1255 delta * sizeof (int));
1256 break;
1261 return REG_NOERROR;
1264 /* Insert the new element ELEM to the re_node_set* SET.
1265 SET should not already have ELEM.
1266 return -1 if an error is occured, return 1 otherwise. */
1268 static int
1269 internal_function __attribute_warn_unused_result__
1270 re_node_set_insert (re_node_set *set, int elem)
1272 int idx;
1273 /* In case the set is empty. */
1274 if (set->alloc == 0)
1276 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1277 return 1;
1278 else
1279 return -1;
1282 if (BE (set->nelem, 0) == 0)
1284 /* We already guaranteed above that set->alloc != 0. */
1285 set->elems[0] = elem;
1286 ++set->nelem;
1287 return 1;
1290 /* Realloc if we need. */
1291 if (set->alloc == set->nelem)
1293 int *new_elems;
1294 set->alloc = set->alloc * 2;
1295 new_elems = re_realloc (set->elems, int, set->alloc);
1296 if (BE (new_elems == NULL, 0))
1297 return -1;
1298 set->elems = new_elems;
1301 /* Move the elements which follows the new element. Test the
1302 first element separately to skip a check in the inner loop. */
1303 if (elem < set->elems[0])
1305 idx = 0;
1306 for (idx = set->nelem; idx > 0; idx--)
1307 set->elems[idx] = set->elems[idx - 1];
1309 else
1311 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1312 set->elems[idx] = set->elems[idx - 1];
1315 /* Insert the new element. */
1316 set->elems[idx] = elem;
1317 ++set->nelem;
1318 return 1;
1321 /* Insert the new element ELEM to the re_node_set* SET.
1322 SET should not already have any element greater than or equal to ELEM.
1323 Return -1 if an error is occured, return 1 otherwise. */
1325 static int
1326 internal_function __attribute_warn_unused_result__
1327 re_node_set_insert_last (re_node_set *set, int elem)
1329 /* Realloc if we need. */
1330 if (set->alloc == set->nelem)
1332 int *new_elems;
1333 set->alloc = (set->alloc + 1) * 2;
1334 new_elems = re_realloc (set->elems, int, set->alloc);
1335 if (BE (new_elems == NULL, 0))
1336 return -1;
1337 set->elems = new_elems;
1340 /* Insert the new element. */
1341 set->elems[set->nelem++] = elem;
1342 return 1;
1345 /* Compare two node sets SET1 and SET2.
1346 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1348 static int
1349 internal_function __attribute ((pure))
1350 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1352 int i;
1353 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1354 return 0;
1355 for (i = set1->nelem ; --i >= 0 ; )
1356 if (set1->elems[i] != set2->elems[i])
1357 return 0;
1358 return 1;
1361 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1363 static int
1364 internal_function __attribute ((pure))
1365 re_node_set_contains (const re_node_set *set, int elem)
1367 unsigned int idx, right, mid;
1368 if (set->nelem <= 0)
1369 return 0;
1371 /* Binary search the element. */
1372 idx = 0;
1373 right = set->nelem - 1;
1374 while (idx < right)
1376 mid = (idx + right) / 2;
1377 if (set->elems[mid] < elem)
1378 idx = mid + 1;
1379 else
1380 right = mid;
1382 return set->elems[idx] == elem ? idx + 1 : 0;
1385 static void
1386 internal_function
1387 re_node_set_remove_at (re_node_set *set, int idx)
1389 if (idx < 0 || idx >= set->nelem)
1390 return;
1391 --set->nelem;
1392 for (; idx < set->nelem; idx++)
1393 set->elems[idx] = set->elems[idx + 1];
1397 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1398 Or return -1, if an error will be occured. */
1400 static int
1401 internal_function
1402 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1404 int type = token.type;
1405 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1408 int *new_nexts, *new_indices;
1409 re_node_set *new_edests, *new_eclosures;
1410 re_token_t *new_nodes;
1412 /* Avoid overflows in realloc. */
1413 const size_t max_object_size = MAX (sizeof (re_token_t),
1414 MAX (sizeof (re_node_set),
1415 sizeof (int)));
1416 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1417 return -1;
1419 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1420 if (BE (new_nodes == NULL, 0))
1421 return -1;
1422 dfa->nodes = new_nodes;
1423 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1424 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1425 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1426 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1427 if (BE (new_nexts == NULL || new_indices == NULL
1428 || new_edests == NULL || new_eclosures == NULL, 0))
1429 return -1;
1430 dfa->nexts = new_nexts;
1431 dfa->org_indices = new_indices;
1432 dfa->edests = new_edests;
1433 dfa->eclosures = new_eclosures;
1434 dfa->nodes_alloc = new_nodes_alloc;
1436 dfa->nodes[dfa->nodes_len] = token;
1437 dfa->nodes[dfa->nodes_len].constraint = 0;
1438 #ifdef RE_ENABLE_I18N
1439 dfa->nodes[dfa->nodes_len].accept_mb =
1440 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1441 #endif
1442 dfa->nexts[dfa->nodes_len] = -1;
1443 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1444 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1445 return dfa->nodes_len++;
1448 static inline unsigned int
1449 internal_function
1450 calc_state_hash (const re_node_set *nodes, unsigned int context)
1452 unsigned int hash = nodes->nelem + context;
1453 int i;
1454 for (i = 0 ; i < nodes->nelem ; i++)
1455 hash += nodes->elems[i];
1456 return hash;
1459 /* Search for the state whose node_set is equivalent to NODES.
1460 Return the pointer to the state, if we found it in the DFA.
1461 Otherwise create the new one and return it. In case of an error
1462 return NULL and set the error code in ERR.
1463 Note: - We assume NULL as the invalid state, then it is possible that
1464 return value is NULL and ERR is REG_NOERROR.
1465 - We never return non-NULL value in case of any errors, it is for
1466 optimization. */
1468 static re_dfastate_t *
1469 internal_function __attribute_warn_unused_result__
1470 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1471 const re_node_set *nodes)
1473 unsigned int hash;
1474 re_dfastate_t *new_state;
1475 struct re_state_table_entry *spot;
1476 int i;
1477 if (BE (nodes->nelem == 0, 0))
1479 *err = REG_NOERROR;
1480 return NULL;
1482 hash = calc_state_hash (nodes, 0);
1483 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1485 for (i = 0 ; i < spot->num ; i++)
1487 re_dfastate_t *state = spot->array[i];
1488 if (hash != state->hash)
1489 continue;
1490 if (re_node_set_compare (&state->nodes, nodes))
1491 return state;
1494 /* There are no appropriate state in the dfa, create the new one. */
1495 new_state = create_ci_newstate (dfa, nodes, hash);
1496 if (BE (new_state == NULL, 0))
1497 *err = REG_ESPACE;
1499 return new_state;
1502 /* Search for the state whose node_set is equivalent to NODES and
1503 whose context is equivalent to CONTEXT.
1504 Return the pointer to the state, if we found it in the DFA.
1505 Otherwise create the new one and return it. In case of an error
1506 return NULL and set the error code in ERR.
1507 Note: - We assume NULL as the invalid state, then it is possible that
1508 return value is NULL and ERR is REG_NOERROR.
1509 - We never return non-NULL value in case of any errors, it is for
1510 optimization. */
1512 static re_dfastate_t *
1513 internal_function __attribute_warn_unused_result__
1514 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1515 const re_node_set *nodes, unsigned int context)
1517 unsigned int hash;
1518 re_dfastate_t *new_state;
1519 struct re_state_table_entry *spot;
1520 int i;
1521 if (nodes->nelem == 0)
1523 *err = REG_NOERROR;
1524 return NULL;
1526 hash = calc_state_hash (nodes, context);
1527 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1529 for (i = 0 ; i < spot->num ; i++)
1531 re_dfastate_t *state = spot->array[i];
1532 if (state->hash == hash
1533 && state->context == context
1534 && re_node_set_compare (state->entrance_nodes, nodes))
1535 return state;
1537 /* There are no appropriate state in `dfa', create the new one. */
1538 new_state = create_cd_newstate (dfa, nodes, context, hash);
1539 if (BE (new_state == NULL, 0))
1540 *err = REG_ESPACE;
1542 return new_state;
1545 /* Finish initialization of the new state NEWSTATE, and using its hash value
1546 HASH put in the appropriate bucket of DFA's state table. Return value
1547 indicates the error code if failed. */
1549 static reg_errcode_t
1550 __attribute_warn_unused_result__
1551 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1552 unsigned int hash)
1554 struct re_state_table_entry *spot;
1555 reg_errcode_t err;
1556 int i;
1558 newstate->hash = hash;
1559 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1560 if (BE (err != REG_NOERROR, 0))
1561 return REG_ESPACE;
1562 for (i = 0; i < newstate->nodes.nelem; i++)
1564 int elem = newstate->nodes.elems[i];
1565 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1566 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1567 return REG_ESPACE;
1570 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1571 if (BE (spot->alloc <= spot->num, 0))
1573 int new_alloc = 2 * spot->num + 2;
1574 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1575 new_alloc);
1576 if (BE (new_array == NULL, 0))
1577 return REG_ESPACE;
1578 spot->array = new_array;
1579 spot->alloc = new_alloc;
1581 spot->array[spot->num++] = newstate;
1582 return REG_NOERROR;
1585 static void
1586 free_state (re_dfastate_t *state)
1588 re_node_set_free (&state->non_eps_nodes);
1589 re_node_set_free (&state->inveclosure);
1590 if (state->entrance_nodes != &state->nodes)
1592 re_node_set_free (state->entrance_nodes);
1593 re_free (state->entrance_nodes);
1595 re_node_set_free (&state->nodes);
1596 re_free (state->word_trtable);
1597 re_free (state->trtable);
1598 re_free (state);
1601 /* Create the new state which is independ of contexts.
1602 Return the new state if succeeded, otherwise return NULL. */
1604 static re_dfastate_t *
1605 internal_function __attribute_warn_unused_result__
1606 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1607 unsigned int hash)
1609 int i;
1610 reg_errcode_t err;
1611 re_dfastate_t *newstate;
1613 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1614 if (BE (newstate == NULL, 0))
1615 return NULL;
1616 err = re_node_set_init_copy (&newstate->nodes, nodes);
1617 if (BE (err != REG_NOERROR, 0))
1619 re_free (newstate);
1620 return NULL;
1623 newstate->entrance_nodes = &newstate->nodes;
1624 for (i = 0 ; i < nodes->nelem ; i++)
1626 re_token_t *node = dfa->nodes + nodes->elems[i];
1627 re_token_type_t type = node->type;
1628 if (type == CHARACTER && !node->constraint)
1629 continue;
1630 #ifdef RE_ENABLE_I18N
1631 newstate->accept_mb |= node->accept_mb;
1632 #endif /* RE_ENABLE_I18N */
1634 /* If the state has the halt node, the state is a halt state. */
1635 if (type == END_OF_RE)
1636 newstate->halt = 1;
1637 else if (type == OP_BACK_REF)
1638 newstate->has_backref = 1;
1639 else if (type == ANCHOR || node->constraint)
1640 newstate->has_constraint = 1;
1642 err = register_state (dfa, newstate, hash);
1643 if (BE (err != REG_NOERROR, 0))
1645 free_state (newstate);
1646 newstate = NULL;
1648 return newstate;
1651 /* Create the new state which is depend on the context CONTEXT.
1652 Return the new state if succeeded, otherwise return NULL. */
1654 static re_dfastate_t *
1655 internal_function __attribute_warn_unused_result__
1656 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1657 unsigned int context, unsigned int hash)
1659 int i, nctx_nodes = 0;
1660 reg_errcode_t err;
1661 re_dfastate_t *newstate;
1663 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1664 if (BE (newstate == NULL, 0))
1665 return NULL;
1666 err = re_node_set_init_copy (&newstate->nodes, nodes);
1667 if (BE (err != REG_NOERROR, 0))
1669 re_free (newstate);
1670 return NULL;
1673 newstate->context = context;
1674 newstate->entrance_nodes = &newstate->nodes;
1676 for (i = 0 ; i < nodes->nelem ; i++)
1678 re_token_t *node = dfa->nodes + nodes->elems[i];
1679 re_token_type_t type = node->type;
1680 unsigned int constraint = node->constraint;
1682 if (type == CHARACTER && !constraint)
1683 continue;
1684 #ifdef RE_ENABLE_I18N
1685 newstate->accept_mb |= node->accept_mb;
1686 #endif /* RE_ENABLE_I18N */
1688 /* If the state has the halt node, the state is a halt state. */
1689 if (type == END_OF_RE)
1690 newstate->halt = 1;
1691 else if (type == OP_BACK_REF)
1692 newstate->has_backref = 1;
1694 if (constraint)
1696 if (newstate->entrance_nodes == &newstate->nodes)
1698 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1699 if (BE (newstate->entrance_nodes == NULL, 0))
1701 free_state (newstate);
1702 return NULL;
1704 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1705 != REG_NOERROR)
1706 return NULL;
1707 nctx_nodes = 0;
1708 newstate->has_constraint = 1;
1711 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1713 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1714 ++nctx_nodes;
1718 err = register_state (dfa, newstate, hash);
1719 if (BE (err != REG_NOERROR, 0))
1721 free_state (newstate);
1722 newstate = NULL;
1724 return newstate;