Fix up regcomp/regexec
[glibc.git] / posix / regex_internal.c
blob124f8ccf2e85ea619df9b323afcfb9cdbf057388
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43 reg_errcode_t ret;
44 int init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
88 while (1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
106 else
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
124 return REG_NOERROR;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs;
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
149 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
169 static void
170 internal_function
171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, int icase,
173 const re_dfa_t *dfa)
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase ? 1 : 0;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
205 #ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209 unsigned char buf[64];
210 #endif
211 mbstate_t prev_st;
212 int byte_idx, end_idx, remain_len;
213 size_t mbclen;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220 wchar_t wc;
221 const char *p;
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
228 int i, ch;
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 p = (const char *) buf;
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -1 || mbclen == 0
241 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
243 /* We treat these cases as a singlebyte character. */
244 mbclen = 1;
245 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 if (BE (pstr->trans != NULL, 0))
247 wc = pstr->trans[wc];
248 pstr->cur_state = prev_st;
250 else if (BE (mbclen == (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
254 break;
257 /* Write wide character and padding. */
258 pstr->wcs[byte_idx++] = wc;
259 /* Write paddings. */
260 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
261 pstr->wcs[byte_idx++] = WEOF;
263 pstr->valid_len = byte_idx;
264 pstr->valid_raw_len = byte_idx;
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
270 static reg_errcode_t
271 internal_function __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t *pstr)
274 mbstate_t prev_st;
275 int src_idx, byte_idx, end_idx, remain_len;
276 size_t mbclen;
277 #ifdef _LIBC
278 char buf[MB_LEN_MAX];
279 assert (MB_LEN_MAX >= pstr->mb_cur_max);
280 #else
281 char buf[64];
282 #endif
284 byte_idx = pstr->valid_len;
285 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291 while (byte_idx < end_idx)
293 wchar_t wc;
295 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
296 && mbsinit (&pstr->cur_state))
298 /* In case of a singlebyte character. */
299 pstr->mbs[byte_idx]
300 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
301 /* The next step uses the assumption that wchar_t is encoded
302 ASCII-safe: all ASCII values can be converted like this. */
303 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
304 ++byte_idx;
305 continue;
308 remain_len = end_idx - byte_idx;
309 prev_st = pstr->cur_state;
310 mbclen = __mbrtowc (&wc,
311 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
312 + byte_idx), remain_len, &pstr->cur_state);
313 if (BE (mbclen + 2 > 2, 1))
315 wchar_t wcu = wc;
316 if (iswlower (wc))
318 size_t mbcdlen;
320 wcu = towupper (wc);
321 mbcdlen = wcrtomb (buf, wcu, &prev_st);
322 if (BE (mbclen == mbcdlen, 1))
323 memcpy (pstr->mbs + byte_idx, buf, mbclen);
324 else
326 src_idx = byte_idx;
327 goto offsets_needed;
330 else
331 memcpy (pstr->mbs + byte_idx,
332 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
333 pstr->wcs[byte_idx++] = wcu;
334 /* Write paddings. */
335 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
336 pstr->wcs[byte_idx++] = WEOF;
338 else if (mbclen == (size_t) -1 || mbclen == 0
339 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 /* It is an invalid character, an incomplete character
342 at the end of the string, or '\0'. Just use the byte. */
343 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
344 pstr->mbs[byte_idx] = ch;
345 /* And also cast it to wide char. */
346 pstr->wcs[byte_idx++] = (wchar_t) ch;
347 if (BE (mbclen == (size_t) -1, 0))
348 pstr->cur_state = prev_st;
350 else
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr->cur_state = prev_st;
354 break;
357 pstr->valid_len = byte_idx;
358 pstr->valid_raw_len = byte_idx;
359 return REG_NOERROR;
361 else
362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 wchar_t wc;
365 const char *p;
366 offsets_needed:
367 remain_len = end_idx - byte_idx;
368 prev_st = pstr->cur_state;
369 if (BE (pstr->trans != NULL, 0))
371 int i, ch;
373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376 buf[i] = pstr->trans[ch];
378 p = (const char *) buf;
380 else
381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383 if (BE (mbclen + 2 > 2, 1))
385 wchar_t wcu = wc;
386 if (iswlower (wc))
388 size_t mbcdlen;
390 wcu = towupper (wc);
391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392 if (BE (mbclen == mbcdlen, 1))
393 memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 else if (mbcdlen != (size_t) -1)
396 size_t i;
398 if (byte_idx + mbcdlen > pstr->bufs_len)
400 pstr->cur_state = prev_st;
401 break;
404 if (pstr->offsets == NULL)
406 pstr->offsets = re_malloc (int, pstr->bufs_len);
408 if (pstr->offsets == NULL)
409 return REG_ESPACE;
411 if (!pstr->offsets_needed)
413 for (i = 0; i < (size_t) byte_idx; ++i)
414 pstr->offsets[i] = i;
415 pstr->offsets_needed = 1;
418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 pstr->wcs[byte_idx] = wcu;
420 pstr->offsets[byte_idx] = src_idx;
421 for (i = 1; i < mbcdlen; ++i)
423 pstr->offsets[byte_idx + i]
424 = src_idx + (i < mbclen ? i : mbclen - 1);
425 pstr->wcs[byte_idx + i] = WEOF;
427 pstr->len += mbcdlen - mbclen;
428 if (pstr->raw_stop > src_idx)
429 pstr->stop += mbcdlen - mbclen;
430 end_idx = (pstr->bufs_len > pstr->len)
431 ? pstr->len : pstr->bufs_len;
432 byte_idx += mbcdlen;
433 src_idx += mbclen;
434 continue;
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 else
440 memcpy (pstr->mbs + byte_idx, p, mbclen);
442 if (BE (pstr->offsets_needed != 0, 0))
444 size_t i;
445 for (i = 0; i < mbclen; ++i)
446 pstr->offsets[byte_idx + i] = src_idx + i;
448 src_idx += mbclen;
450 pstr->wcs[byte_idx++] = wcu;
451 /* Write paddings. */
452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 pstr->wcs[byte_idx++] = WEOF;
455 else if (mbclen == (size_t) -1 || mbclen == 0
456 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
461 if (BE (pstr->trans != NULL, 0))
462 ch = pstr->trans [ch];
463 pstr->mbs[byte_idx] = ch;
465 if (BE (pstr->offsets_needed != 0, 0))
466 pstr->offsets[byte_idx] = src_idx;
467 ++src_idx;
469 /* And also cast it to wide char. */
470 pstr->wcs[byte_idx++] = (wchar_t) ch;
471 if (BE (mbclen == (size_t) -1, 0))
472 pstr->cur_state = prev_st;
474 else
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr->cur_state = prev_st;
478 break;
481 pstr->valid_len = byte_idx;
482 pstr->valid_raw_len = src_idx;
483 return REG_NOERROR;
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 Return the index. */
489 static int
490 internal_function
491 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
493 mbstate_t prev_st;
494 int rawbuf_idx;
495 size_t mbclen;
496 wint_t wc = WEOF;
498 /* Skip the characters which are not necessary to check. */
499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500 rawbuf_idx < new_raw_idx;)
502 wchar_t wc2;
503 int remain_len = pstr->len - rawbuf_idx;
504 prev_st = pstr->cur_state;
505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506 remain_len, &pstr->cur_state);
507 if (BE ((ssize_t) mbclen <= 0, 0))
509 /* We treat these cases as a single byte character. */
510 if (mbclen == 0 || remain_len == 0)
511 wc = L'\0';
512 else
513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 mbclen = 1;
515 pstr->cur_state = prev_st;
517 else
518 wc = (wint_t) wc2;
519 /* Then proceed the next character. */
520 rawbuf_idx += mbclen;
522 *last_wc = wc;
523 return rawbuf_idx;
525 #endif /* RE_ENABLE_I18N */
527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
528 This function is used in case of REG_ICASE. */
530 static void
531 internal_function
532 build_upper_buffer (re_string_t *pstr)
534 int char_idx, end_idx;
535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540 if (BE (pstr->trans != NULL, 0))
541 ch = pstr->trans[ch];
542 if (islower (ch))
543 pstr->mbs[char_idx] = toupper (ch);
544 else
545 pstr->mbs[char_idx] = ch;
547 pstr->valid_len = char_idx;
548 pstr->valid_raw_len = char_idx;
551 /* Apply TRANS to the buffer in PSTR. */
553 static void
554 internal_function
555 re_string_translate_buffer (re_string_t *pstr)
557 int buf_idx, end_idx;
558 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
563 pstr->mbs[buf_idx] = pstr->trans[ch];
566 pstr->valid_len = buf_idx;
567 pstr->valid_raw_len = buf_idx;
570 /* This function re-construct the buffers.
571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572 convert to upper case in case of REG_ICASE, apply translation. */
574 static reg_errcode_t
575 internal_function __attribute_warn_unused_result__
576 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
578 int offset = idx - pstr->raw_mbs_idx;
579 if (BE (offset < 0, 0))
581 /* Reset buffer. */
582 #ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
599 if (BE (offset != 0, 1))
601 /* Should the already checked characters be kept? */
602 if (BE (offset < pstr->valid_raw_len, 1))
604 /* Yes, move them to the front of the buffer. */
605 #ifdef RE_ENABLE_I18N
606 if (BE (pstr->offsets_needed, 0))
608 int low = 0, high = pstr->valid_len, mid;
611 mid = (high + low) / 2;
612 if (pstr->offsets[mid] > offset)
613 high = mid;
614 else if (pstr->offsets[mid] < offset)
615 low = mid + 1;
616 else
617 break;
619 while (low < high);
620 if (pstr->offsets[mid] < offset)
621 ++mid;
622 pstr->tip_context = re_string_context_at (pstr, mid - 1,
623 eflags);
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr->valid_len > offset
629 && mid == offset && pstr->offsets[mid] == offset)
631 memmove (pstr->wcs, pstr->wcs + offset,
632 (pstr->valid_len - offset) * sizeof (wint_t));
633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634 pstr->valid_len -= offset;
635 pstr->valid_raw_len -= offset;
636 for (low = 0; low < pstr->valid_len; low++)
637 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
639 else
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr->len = pstr->raw_len - idx + offset;
644 pstr->stop = pstr->raw_stop - idx + offset;
645 pstr->offsets_needed = 0;
646 while (mid > 0 && pstr->offsets[mid - 1] == offset)
647 --mid;
648 while (mid < pstr->valid_len)
649 if (pstr->wcs[mid] != WEOF)
650 break;
651 else
652 ++mid;
653 if (mid == pstr->valid_len)
654 pstr->valid_len = 0;
655 else
657 pstr->valid_len = pstr->offsets[mid] - offset;
658 if (pstr->valid_len)
660 for (low = 0; low < pstr->valid_len; ++low)
661 pstr->wcs[low] = WEOF;
662 memset (pstr->mbs, 255, pstr->valid_len);
665 pstr->valid_raw_len = pstr->valid_len;
668 else
669 #endif
671 pstr->tip_context = re_string_context_at (pstr, offset - 1,
672 eflags);
673 #ifdef RE_ENABLE_I18N
674 if (pstr->mb_cur_max > 1)
675 memmove (pstr->wcs, pstr->wcs + offset,
676 (pstr->valid_len - offset) * sizeof (wint_t));
677 #endif /* RE_ENABLE_I18N */
678 if (BE (pstr->mbs_allocated, 0))
679 memmove (pstr->mbs, pstr->mbs + offset,
680 pstr->valid_len - offset);
681 pstr->valid_len -= offset;
682 pstr->valid_raw_len -= offset;
683 #if DEBUG
684 assert (pstr->valid_len > 0);
685 #endif
688 else
690 /* No, skip all characters until IDX. */
691 int prev_valid_len = pstr->valid_len;
693 #ifdef RE_ENABLE_I18N
694 if (BE (pstr->offsets_needed, 0))
696 pstr->len = pstr->raw_len - idx + offset;
697 pstr->stop = pstr->raw_stop - idx + offset;
698 pstr->offsets_needed = 0;
700 #endif
701 pstr->valid_len = 0;
702 #ifdef RE_ENABLE_I18N
703 if (pstr->mb_cur_max > 1)
705 int wcs_idx;
706 wint_t wc = WEOF;
708 if (pstr->is_utf8)
710 const unsigned char *raw, *p, *end;
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715 end = raw + (offset - pstr->mb_cur_max);
716 if (end < pstr->raw_mbs)
717 end = pstr->raw_mbs;
718 p = raw + offset - 1;
719 #ifdef _LIBC
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p) && BE (pstr->trans == NULL, 1))
724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
726 wc = (wchar_t) *p;
728 else
729 #endif
730 for (; p >= end; --p)
731 if ((*p & 0xc0) != 0x80)
733 mbstate_t cur_state;
734 wchar_t wc2;
735 int mlen = raw + pstr->len - p;
736 unsigned char buf[6];
737 size_t mbclen;
739 const unsigned char *pp = p;
740 if (BE (pstr->trans != NULL, 0))
742 int i = mlen < 6 ? mlen : 6;
743 while (--i >= 0)
744 buf[i] = pstr->trans[p[i]];
745 pp = buf;
747 /* XXX Don't use mbrtowc, we know which conversion
748 to use (UTF-8 -> UCS4). */
749 memset (&cur_state, 0, sizeof (cur_state));
750 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
751 &cur_state);
752 if (raw + offset - p <= mbclen
753 && mbclen < (size_t) -2)
755 memset (&pstr->cur_state, '\0',
756 sizeof (mbstate_t));
757 pstr->valid_len = mbclen - (raw + offset - p);
758 wc = wc2;
760 break;
764 if (wc == WEOF)
765 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766 if (wc == WEOF)
767 pstr->tip_context
768 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769 else
770 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771 && IS_WIDE_WORD_CHAR (wc))
772 ? CONTEXT_WORD
773 : ((IS_WIDE_NEWLINE (wc)
774 && pstr->newline_anchor)
775 ? CONTEXT_NEWLINE : 0));
776 if (BE (pstr->valid_len, 0))
778 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779 pstr->wcs[wcs_idx] = WEOF;
780 if (pstr->mbs_allocated)
781 memset (pstr->mbs, 255, pstr->valid_len);
783 pstr->valid_raw_len = pstr->valid_len;
785 else
786 #endif /* RE_ENABLE_I18N */
788 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789 pstr->valid_raw_len = 0;
790 if (pstr->trans)
791 c = pstr->trans[c];
792 pstr->tip_context = (bitset_contain (pstr->word_char, c)
793 ? CONTEXT_WORD
794 : ((IS_NEWLINE (c) && pstr->newline_anchor)
795 ? CONTEXT_NEWLINE : 0));
798 if (!BE (pstr->mbs_allocated, 0))
799 pstr->mbs += offset;
801 pstr->raw_mbs_idx = idx;
802 pstr->len -= offset;
803 pstr->stop -= offset;
805 /* Then build the buffers. */
806 #ifdef RE_ENABLE_I18N
807 if (pstr->mb_cur_max > 1)
809 if (pstr->icase)
811 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812 if (BE (ret != REG_NOERROR, 0))
813 return ret;
815 else
816 build_wcs_buffer (pstr);
818 else
819 #endif /* RE_ENABLE_I18N */
820 if (BE (pstr->mbs_allocated, 0))
822 if (pstr->icase)
823 build_upper_buffer (pstr);
824 else if (pstr->trans != NULL)
825 re_string_translate_buffer (pstr);
827 else
828 pstr->valid_len = pstr->len;
830 pstr->cur_idx = 0;
831 return REG_NOERROR;
834 static unsigned char
835 internal_function __attribute ((pure))
836 re_string_peek_byte_case (const re_string_t *pstr, int idx)
838 int ch, off;
840 /* Handle the common (easiest) cases first. */
841 if (BE (!pstr->mbs_allocated, 1))
842 return re_string_peek_byte (pstr, idx);
844 #ifdef RE_ENABLE_I18N
845 if (pstr->mb_cur_max > 1
846 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
847 return re_string_peek_byte (pstr, idx);
848 #endif
850 off = pstr->cur_idx + idx;
851 #ifdef RE_ENABLE_I18N
852 if (pstr->offsets_needed)
853 off = pstr->offsets[off];
854 #endif
856 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858 #ifdef RE_ENABLE_I18N
859 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
860 this function returns CAPITAL LETTER I instead of first byte of
861 DOTLESS SMALL LETTER I. The latter would confuse the parser,
862 since peek_byte_case doesn't advance cur_idx in any way. */
863 if (pstr->offsets_needed && !isascii (ch))
864 return re_string_peek_byte (pstr, idx);
865 #endif
867 return ch;
870 static unsigned char
871 internal_function
872 re_string_fetch_byte_case (re_string_t *pstr)
874 if (BE (!pstr->mbs_allocated, 1))
875 return re_string_fetch_byte (pstr);
877 #ifdef RE_ENABLE_I18N
878 if (pstr->offsets_needed)
880 int off, ch;
882 /* For tr_TR.UTF-8 [[:islower:]] there is
883 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
884 in that case the whole multi-byte character and return
885 the original letter. On the other side, with
886 [[: DOTLESS SMALL LETTER I return [[:I, as doing
887 anything else would complicate things too much. */
889 if (!re_string_first_byte (pstr, pstr->cur_idx))
890 return re_string_fetch_byte (pstr);
892 off = pstr->offsets[pstr->cur_idx];
893 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
895 if (! isascii (ch))
896 return re_string_fetch_byte (pstr);
898 re_string_skip_bytes (pstr,
899 re_string_char_size_at (pstr, pstr->cur_idx));
900 return ch;
902 #endif
904 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907 static void
908 internal_function
909 re_string_destruct (re_string_t *pstr)
911 #ifdef RE_ENABLE_I18N
912 re_free (pstr->wcs);
913 re_free (pstr->offsets);
914 #endif /* RE_ENABLE_I18N */
915 if (pstr->mbs_allocated)
916 re_free (pstr->mbs);
919 /* Return the context at IDX in INPUT. */
921 static unsigned int
922 internal_function
923 re_string_context_at (const re_string_t *input, int idx, int eflags)
925 int c;
926 if (BE (idx < 0, 0))
927 /* In this case, we use the value stored in input->tip_context,
928 since we can't know the character in input->mbs[-1] here. */
929 return input->tip_context;
930 if (BE (idx == input->len, 0))
931 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
932 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
933 #ifdef RE_ENABLE_I18N
934 if (input->mb_cur_max > 1)
936 wint_t wc;
937 int wc_idx = idx;
938 while(input->wcs[wc_idx] == WEOF)
940 #ifdef DEBUG
941 /* It must not happen. */
942 assert (wc_idx >= 0);
943 #endif
944 --wc_idx;
945 if (wc_idx < 0)
946 return input->tip_context;
948 wc = input->wcs[wc_idx];
949 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
950 return CONTEXT_WORD;
951 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
952 ? CONTEXT_NEWLINE : 0);
954 else
955 #endif
957 c = re_string_byte_at (input, idx);
958 if (bitset_contain (input->word_char, c))
959 return CONTEXT_WORD;
960 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
964 /* Functions for set operation. */
966 static reg_errcode_t
967 internal_function __attribute_warn_unused_result__
968 re_node_set_alloc (re_node_set *set, int size)
970 set->alloc = size;
971 set->nelem = 0;
972 set->elems = re_malloc (int, size);
973 if (BE (set->elems == NULL, 0))
974 return REG_ESPACE;
975 return REG_NOERROR;
978 static reg_errcode_t
979 internal_function __attribute_warn_unused_result__
980 re_node_set_init_1 (re_node_set *set, int elem)
982 set->alloc = 1;
983 set->nelem = 1;
984 set->elems = re_malloc (int, 1);
985 if (BE (set->elems == NULL, 0))
987 set->alloc = set->nelem = 0;
988 return REG_ESPACE;
990 set->elems[0] = elem;
991 return REG_NOERROR;
994 static reg_errcode_t
995 internal_function __attribute_warn_unused_result__
996 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
998 set->alloc = 2;
999 set->elems = re_malloc (int, 2);
1000 if (BE (set->elems == NULL, 0))
1001 return REG_ESPACE;
1002 if (elem1 == elem2)
1004 set->nelem = 1;
1005 set->elems[0] = elem1;
1007 else
1009 set->nelem = 2;
1010 if (elem1 < elem2)
1012 set->elems[0] = elem1;
1013 set->elems[1] = elem2;
1015 else
1017 set->elems[0] = elem2;
1018 set->elems[1] = elem1;
1021 return REG_NOERROR;
1024 static reg_errcode_t
1025 internal_function __attribute_warn_unused_result__
1026 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028 dest->nelem = src->nelem;
1029 if (src->nelem > 0)
1031 dest->alloc = dest->nelem;
1032 dest->elems = re_malloc (int, dest->alloc);
1033 if (BE (dest->elems == NULL, 0))
1035 dest->alloc = dest->nelem = 0;
1036 return REG_ESPACE;
1038 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1040 else
1041 re_node_set_init_empty (dest);
1042 return REG_NOERROR;
1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1049 static reg_errcode_t
1050 internal_function __attribute_warn_unused_result__
1051 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1052 const re_node_set *src2)
1054 int i1, i2, is, id, delta, sbase;
1055 if (src1->nelem == 0 || src2->nelem == 0)
1056 return REG_NOERROR;
1058 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059 conservative estimate. */
1060 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1063 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1064 if (BE (new_elems == NULL, 0))
1065 return REG_ESPACE;
1066 dest->elems = new_elems;
1067 dest->alloc = new_alloc;
1070 /* Find the items in the intersection of SRC1 and SRC2, and copy
1071 into the top of DEST those that are not already in DEST itself. */
1072 sbase = dest->nelem + src1->nelem + src2->nelem;
1073 i1 = src1->nelem - 1;
1074 i2 = src2->nelem - 1;
1075 id = dest->nelem - 1;
1076 for (;;)
1078 if (src1->elems[i1] == src2->elems[i2])
1080 /* Try to find the item in DEST. Maybe we could binary search? */
1081 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1082 --id;
1084 if (id < 0 || dest->elems[id] != src1->elems[i1])
1085 dest->elems[--sbase] = src1->elems[i1];
1087 if (--i1 < 0 || --i2 < 0)
1088 break;
1091 /* Lower the highest of the two items. */
1092 else if (src1->elems[i1] < src2->elems[i2])
1094 if (--i2 < 0)
1095 break;
1097 else
1099 if (--i1 < 0)
1100 break;
1104 id = dest->nelem - 1;
1105 is = dest->nelem + src1->nelem + src2->nelem - 1;
1106 delta = is - sbase + 1;
1108 /* Now copy. When DELTA becomes zero, the remaining
1109 DEST elements are already in place; this is more or
1110 less the same loop that is in re_node_set_merge. */
1111 dest->nelem += delta;
1112 if (delta > 0 && id >= 0)
1113 for (;;)
1115 if (dest->elems[is] > dest->elems[id])
1117 /* Copy from the top. */
1118 dest->elems[id + delta--] = dest->elems[is--];
1119 if (delta == 0)
1120 break;
1122 else
1124 /* Slide from the bottom. */
1125 dest->elems[id + delta] = dest->elems[id];
1126 if (--id < 0)
1127 break;
1131 /* Copy remaining SRC elements. */
1132 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1134 return REG_NOERROR;
1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1140 static reg_errcode_t
1141 internal_function __attribute_warn_unused_result__
1142 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1143 const re_node_set *src2)
1145 int i1, i2, id;
1146 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148 dest->alloc = src1->nelem + src2->nelem;
1149 dest->elems = re_malloc (int, dest->alloc);
1150 if (BE (dest->elems == NULL, 0))
1151 return REG_ESPACE;
1153 else
1155 if (src1 != NULL && src1->nelem > 0)
1156 return re_node_set_init_copy (dest, src1);
1157 else if (src2 != NULL && src2->nelem > 0)
1158 return re_node_set_init_copy (dest, src2);
1159 else
1160 re_node_set_init_empty (dest);
1161 return REG_NOERROR;
1163 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165 if (src1->elems[i1] > src2->elems[i2])
1167 dest->elems[id++] = src2->elems[i2++];
1168 continue;
1170 if (src1->elems[i1] == src2->elems[i2])
1171 ++i2;
1172 dest->elems[id++] = src1->elems[i1++];
1174 if (i1 < src1->nelem)
1176 memcpy (dest->elems + id, src1->elems + i1,
1177 (src1->nelem - i1) * sizeof (int));
1178 id += src1->nelem - i1;
1180 else if (i2 < src2->nelem)
1182 memcpy (dest->elems + id, src2->elems + i2,
1183 (src2->nelem - i2) * sizeof (int));
1184 id += src2->nelem - i2;
1186 dest->nelem = id;
1187 return REG_NOERROR;
1190 /* Calculate the union set of the sets DEST and SRC. And store it to
1191 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1193 static reg_errcode_t
1194 internal_function __attribute_warn_unused_result__
1195 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197 int is, id, sbase, delta;
1198 if (src == NULL || src->nelem == 0)
1199 return REG_NOERROR;
1200 if (dest->alloc < 2 * src->nelem + dest->nelem)
1202 int new_alloc = 2 * (src->nelem + dest->alloc);
1203 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1204 if (BE (new_buffer == NULL, 0))
1205 return REG_ESPACE;
1206 dest->elems = new_buffer;
1207 dest->alloc = new_alloc;
1210 if (BE (dest->nelem == 0, 0))
1212 dest->nelem = src->nelem;
1213 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1214 return REG_NOERROR;
1217 /* Copy into the top of DEST the items of SRC that are not
1218 found in DEST. Maybe we could binary search in DEST? */
1219 for (sbase = dest->nelem + 2 * src->nelem,
1220 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222 if (dest->elems[id] == src->elems[is])
1223 is--, id--;
1224 else if (dest->elems[id] < src->elems[is])
1225 dest->elems[--sbase] = src->elems[is--];
1226 else /* if (dest->elems[id] > src->elems[is]) */
1227 --id;
1230 if (is >= 0)
1232 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1233 sbase -= is + 1;
1234 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1237 id = dest->nelem - 1;
1238 is = dest->nelem + 2 * src->nelem - 1;
1239 delta = is - sbase + 1;
1240 if (delta == 0)
1241 return REG_NOERROR;
1243 /* Now copy. When DELTA becomes zero, the remaining
1244 DEST elements are already in place. */
1245 dest->nelem += delta;
1246 for (;;)
1248 if (dest->elems[is] > dest->elems[id])
1250 /* Copy from the top. */
1251 dest->elems[id + delta--] = dest->elems[is--];
1252 if (delta == 0)
1253 break;
1255 else
1257 /* Slide from the bottom. */
1258 dest->elems[id + delta] = dest->elems[id];
1259 if (--id < 0)
1261 /* Copy remaining SRC elements. */
1262 memcpy (dest->elems, dest->elems + sbase,
1263 delta * sizeof (int));
1264 break;
1269 return REG_NOERROR;
1272 /* Insert the new element ELEM to the re_node_set* SET.
1273 SET should not already have ELEM.
1274 return -1 if an error is occured, return 1 otherwise. */
1276 static int
1277 internal_function __attribute_warn_unused_result__
1278 re_node_set_insert (re_node_set *set, int elem)
1280 int idx;
1281 /* In case the set is empty. */
1282 if (set->alloc == 0)
1284 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1285 return 1;
1286 else
1287 return -1;
1290 if (BE (set->nelem, 0) == 0)
1292 /* We already guaranteed above that set->alloc != 0. */
1293 set->elems[0] = elem;
1294 ++set->nelem;
1295 return 1;
1298 /* Realloc if we need. */
1299 if (set->alloc == set->nelem)
1301 int *new_elems;
1302 set->alloc = set->alloc * 2;
1303 new_elems = re_realloc (set->elems, int, set->alloc);
1304 if (BE (new_elems == NULL, 0))
1305 return -1;
1306 set->elems = new_elems;
1309 /* Move the elements which follows the new element. Test the
1310 first element separately to skip a check in the inner loop. */
1311 if (elem < set->elems[0])
1313 idx = 0;
1314 for (idx = set->nelem; idx > 0; idx--)
1315 set->elems[idx] = set->elems[idx - 1];
1317 else
1319 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1320 set->elems[idx] = set->elems[idx - 1];
1323 /* Insert the new element. */
1324 set->elems[idx] = elem;
1325 ++set->nelem;
1326 return 1;
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330 SET should not already have any element greater than or equal to ELEM.
1331 Return -1 if an error is occured, return 1 otherwise. */
1333 static int
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set *set, int elem)
1337 /* Realloc if we need. */
1338 if (set->alloc == set->nelem)
1340 int *new_elems;
1341 set->alloc = (set->alloc + 1) * 2;
1342 new_elems = re_realloc (set->elems, int, set->alloc);
1343 if (BE (new_elems == NULL, 0))
1344 return -1;
1345 set->elems = new_elems;
1348 /* Insert the new element. */
1349 set->elems[set->nelem++] = elem;
1350 return 1;
1353 /* Compare two node sets SET1 and SET2.
1354 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1356 static int
1357 internal_function __attribute ((pure))
1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1360 int i;
1361 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1362 return 0;
1363 for (i = set1->nelem ; --i >= 0 ; )
1364 if (set1->elems[i] != set2->elems[i])
1365 return 0;
1366 return 1;
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1371 static int
1372 internal_function __attribute ((pure))
1373 re_node_set_contains (const re_node_set *set, int elem)
1375 unsigned int idx, right, mid;
1376 if (set->nelem <= 0)
1377 return 0;
1379 /* Binary search the element. */
1380 idx = 0;
1381 right = set->nelem - 1;
1382 while (idx < right)
1384 mid = (idx + right) / 2;
1385 if (set->elems[mid] < elem)
1386 idx = mid + 1;
1387 else
1388 right = mid;
1390 return set->elems[idx] == elem ? idx + 1 : 0;
1393 static void
1394 internal_function
1395 re_node_set_remove_at (re_node_set *set, int idx)
1397 if (idx < 0 || idx >= set->nelem)
1398 return;
1399 --set->nelem;
1400 for (; idx < set->nelem; idx++)
1401 set->elems[idx] = set->elems[idx + 1];
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406 Or return -1, if an error will be occured. */
1408 static int
1409 internal_function
1410 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1412 int type = token.type;
1413 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1415 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1416 int *new_nexts, *new_indices;
1417 re_node_set *new_edests, *new_eclosures;
1418 re_token_t *new_nodes;
1420 /* Avoid overflows in realloc. */
1421 const size_t max_object_size = MAX (sizeof (re_token_t),
1422 MAX (sizeof (re_node_set),
1423 sizeof (int)));
1424 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1425 return -1;
1427 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1428 if (BE (new_nodes == NULL, 0))
1429 return -1;
1430 dfa->nodes = new_nodes;
1431 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1432 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1433 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1434 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1435 if (BE (new_nexts == NULL || new_indices == NULL
1436 || new_edests == NULL || new_eclosures == NULL, 0))
1437 return -1;
1438 dfa->nexts = new_nexts;
1439 dfa->org_indices = new_indices;
1440 dfa->edests = new_edests;
1441 dfa->eclosures = new_eclosures;
1442 dfa->nodes_alloc = new_nodes_alloc;
1444 dfa->nodes[dfa->nodes_len] = token;
1445 dfa->nodes[dfa->nodes_len].constraint = 0;
1446 #ifdef RE_ENABLE_I18N
1447 dfa->nodes[dfa->nodes_len].accept_mb =
1448 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449 #endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1456 static inline unsigned int
1457 internal_function
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 unsigned int hash = nodes->nelem + context;
1461 int i;
1462 for (i = 0 ; i < nodes->nelem ; i++)
1463 hash += nodes->elems[i];
1464 return hash;
1467 /* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1474 optimization. */
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 const re_node_set *nodes)
1481 unsigned int hash;
1482 re_dfastate_t *new_state;
1483 struct re_state_table_entry *spot;
1484 int i;
1485 if (BE (nodes->nelem == 0, 0))
1487 *err = REG_NOERROR;
1488 return NULL;
1490 hash = calc_state_hash (nodes, 0);
1491 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1493 for (i = 0 ; i < spot->num ; i++)
1495 re_dfastate_t *state = spot->array[i];
1496 if (hash != state->hash)
1497 continue;
1498 if (re_node_set_compare (&state->nodes, nodes))
1499 return state;
1502 /* There are no appropriate state in the dfa, create the new one. */
1503 new_state = create_ci_newstate (dfa, nodes, hash);
1504 if (BE (new_state == NULL, 0))
1505 *err = REG_ESPACE;
1507 return new_state;
1510 /* Search for the state whose node_set is equivalent to NODES and
1511 whose context is equivalent to CONTEXT.
1512 Return the pointer to the state, if we found it in the DFA.
1513 Otherwise create the new one and return it. In case of an error
1514 return NULL and set the error code in ERR.
1515 Note: - We assume NULL as the invalid state, then it is possible that
1516 return value is NULL and ERR is REG_NOERROR.
1517 - We never return non-NULL value in case of any errors, it is for
1518 optimization. */
1520 static re_dfastate_t *
1521 internal_function __attribute_warn_unused_result__
1522 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1523 const re_node_set *nodes, unsigned int context)
1525 unsigned int hash;
1526 re_dfastate_t *new_state;
1527 struct re_state_table_entry *spot;
1528 int i;
1529 if (nodes->nelem == 0)
1531 *err = REG_NOERROR;
1532 return NULL;
1534 hash = calc_state_hash (nodes, context);
1535 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1537 for (i = 0 ; i < spot->num ; i++)
1539 re_dfastate_t *state = spot->array[i];
1540 if (state->hash == hash
1541 && state->context == context
1542 && re_node_set_compare (state->entrance_nodes, nodes))
1543 return state;
1545 /* There are no appropriate state in `dfa', create the new one. */
1546 new_state = create_cd_newstate (dfa, nodes, context, hash);
1547 if (BE (new_state == NULL, 0))
1548 *err = REG_ESPACE;
1550 return new_state;
1553 /* Finish initialization of the new state NEWSTATE, and using its hash value
1554 HASH put in the appropriate bucket of DFA's state table. Return value
1555 indicates the error code if failed. */
1557 static reg_errcode_t
1558 __attribute_warn_unused_result__
1559 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1560 unsigned int hash)
1562 struct re_state_table_entry *spot;
1563 reg_errcode_t err;
1564 int i;
1566 newstate->hash = hash;
1567 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1568 if (BE (err != REG_NOERROR, 0))
1569 return REG_ESPACE;
1570 for (i = 0; i < newstate->nodes.nelem; i++)
1572 int elem = newstate->nodes.elems[i];
1573 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1574 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1575 return REG_ESPACE;
1578 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1579 if (BE (spot->alloc <= spot->num, 0))
1581 int new_alloc = 2 * spot->num + 2;
1582 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1583 new_alloc);
1584 if (BE (new_array == NULL, 0))
1585 return REG_ESPACE;
1586 spot->array = new_array;
1587 spot->alloc = new_alloc;
1589 spot->array[spot->num++] = newstate;
1590 return REG_NOERROR;
1593 static void
1594 free_state (re_dfastate_t *state)
1596 re_node_set_free (&state->non_eps_nodes);
1597 re_node_set_free (&state->inveclosure);
1598 if (state->entrance_nodes != &state->nodes)
1600 re_node_set_free (state->entrance_nodes);
1601 re_free (state->entrance_nodes);
1603 re_node_set_free (&state->nodes);
1604 re_free (state->word_trtable);
1605 re_free (state->trtable);
1606 re_free (state);
1609 /* Create the new state which is independ of contexts.
1610 Return the new state if succeeded, otherwise return NULL. */
1612 static re_dfastate_t *
1613 internal_function __attribute_warn_unused_result__
1614 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1615 unsigned int hash)
1617 int i;
1618 reg_errcode_t err;
1619 re_dfastate_t *newstate;
1621 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1622 if (BE (newstate == NULL, 0))
1623 return NULL;
1624 err = re_node_set_init_copy (&newstate->nodes, nodes);
1625 if (BE (err != REG_NOERROR, 0))
1627 re_free (newstate);
1628 return NULL;
1631 newstate->entrance_nodes = &newstate->nodes;
1632 for (i = 0 ; i < nodes->nelem ; i++)
1634 re_token_t *node = dfa->nodes + nodes->elems[i];
1635 re_token_type_t type = node->type;
1636 if (type == CHARACTER && !node->constraint)
1637 continue;
1638 #ifdef RE_ENABLE_I18N
1639 newstate->accept_mb |= node->accept_mb;
1640 #endif /* RE_ENABLE_I18N */
1642 /* If the state has the halt node, the state is a halt state. */
1643 if (type == END_OF_RE)
1644 newstate->halt = 1;
1645 else if (type == OP_BACK_REF)
1646 newstate->has_backref = 1;
1647 else if (type == ANCHOR || node->constraint)
1648 newstate->has_constraint = 1;
1650 err = register_state (dfa, newstate, hash);
1651 if (BE (err != REG_NOERROR, 0))
1653 free_state (newstate);
1654 newstate = NULL;
1656 return newstate;
1659 /* Create the new state which is depend on the context CONTEXT.
1660 Return the new state if succeeded, otherwise return NULL. */
1662 static re_dfastate_t *
1663 internal_function __attribute_warn_unused_result__
1664 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1665 unsigned int context, unsigned int hash)
1667 int i, nctx_nodes = 0;
1668 reg_errcode_t err;
1669 re_dfastate_t *newstate;
1671 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1672 if (BE (newstate == NULL, 0))
1673 return NULL;
1674 err = re_node_set_init_copy (&newstate->nodes, nodes);
1675 if (BE (err != REG_NOERROR, 0))
1677 re_free (newstate);
1678 return NULL;
1681 newstate->context = context;
1682 newstate->entrance_nodes = &newstate->nodes;
1684 for (i = 0 ; i < nodes->nelem ; i++)
1686 re_token_t *node = dfa->nodes + nodes->elems[i];
1687 re_token_type_t type = node->type;
1688 unsigned int constraint = node->constraint;
1690 if (type == CHARACTER && !constraint)
1691 continue;
1692 #ifdef RE_ENABLE_I18N
1693 newstate->accept_mb |= node->accept_mb;
1694 #endif /* RE_ENABLE_I18N */
1696 /* If the state has the halt node, the state is a halt state. */
1697 if (type == END_OF_RE)
1698 newstate->halt = 1;
1699 else if (type == OP_BACK_REF)
1700 newstate->has_backref = 1;
1702 if (constraint)
1704 if (newstate->entrance_nodes == &newstate->nodes)
1706 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1707 if (BE (newstate->entrance_nodes == NULL, 0))
1709 free_state (newstate);
1710 return NULL;
1712 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1713 != REG_NOERROR)
1714 return NULL;
1715 nctx_nodes = 0;
1716 newstate->has_constraint = 1;
1719 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1721 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1722 ++nctx_nodes;
1726 err = register_state (dfa, newstate, hash);
1727 if (BE (err != REG_NOERROR, 0))
1729 free_state (newstate);
1730 newstate = NULL;
1732 return newstate;