Cleanups in ldconfig's chroot handling
[glibc.git] / posix / regex_internal.c
blob285ae3b38ed0a61c98d95d13303575ef933e7c8b
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function __attribute_warn_unused_result__
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43 reg_errcode_t ret;
44 int init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function __attribute_warn_unused_result__
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
88 while (1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
106 else
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
124 return REG_NOERROR;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function __attribute_warn_unused_result__
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs;
138 /* Avoid overflow in realloc. */
139 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
149 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
155 #endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
169 static void
170 internal_function
171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, int icase,
173 const re_dfa_t *dfa)
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase ? 1 : 0;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
188 #ifdef RE_ENABLE_I18N
190 /* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
201 static void
202 internal_function
203 build_wcs_buffer (re_string_t *pstr)
205 #ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208 #else
209 unsigned char buf[64];
210 #endif
211 mbstate_t prev_st;
212 int byte_idx, end_idx, remain_len;
213 size_t mbclen;
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220 wchar_t wc;
221 const char *p;
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
228 int i, ch;
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 p = (const char *) buf;
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -1 || mbclen == 0
241 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
243 /* We treat these cases as a singlebyte character. */
244 mbclen = 1;
245 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 if (BE (pstr->trans != NULL, 0))
247 wc = pstr->trans[wc];
248 pstr->cur_state = prev_st;
250 else if (BE (mbclen == (size_t) -2, 0))
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
254 break;
257 /* Write wide character and padding. */
258 pstr->wcs[byte_idx++] = wc;
259 /* Write paddings. */
260 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
261 pstr->wcs[byte_idx++] = WEOF;
263 pstr->valid_len = byte_idx;
264 pstr->valid_raw_len = byte_idx;
267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
268 but for REG_ICASE. */
270 static reg_errcode_t
271 internal_function __attribute_warn_unused_result__
272 build_wcs_upper_buffer (re_string_t *pstr)
274 mbstate_t prev_st;
275 int src_idx, byte_idx, end_idx, remain_len;
276 size_t mbclen;
277 #ifdef _LIBC
278 char buf[MB_LEN_MAX];
279 assert (MB_LEN_MAX >= pstr->mb_cur_max);
280 #else
281 char buf[64];
282 #endif
284 byte_idx = pstr->valid_len;
285 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287 /* The following optimization assumes that ASCII characters can be
288 mapped to wide characters with a simple cast. */
289 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291 while (byte_idx < end_idx)
293 wchar_t wc;
295 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
296 && mbsinit (&pstr->cur_state))
298 /* In case of a singlebyte character. */
299 pstr->mbs[byte_idx]
300 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
301 /* The next step uses the assumption that wchar_t is encoded
302 ASCII-safe: all ASCII values can be converted like this. */
303 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
304 ++byte_idx;
305 continue;
308 remain_len = end_idx - byte_idx;
309 prev_st = pstr->cur_state;
310 mbclen = __mbrtowc (&wc,
311 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
312 + byte_idx), remain_len, &pstr->cur_state);
313 if (BE (mbclen + 2 > 2, 1))
315 wchar_t wcu = wc;
316 if (iswlower (wc))
318 size_t mbcdlen;
320 wcu = towupper (wc);
321 mbcdlen = wcrtomb (buf, wcu, &prev_st);
322 if (BE (mbclen == mbcdlen, 1))
323 memcpy (pstr->mbs + byte_idx, buf, mbclen);
324 else
326 src_idx = byte_idx;
327 goto offsets_needed;
330 else
331 memcpy (pstr->mbs + byte_idx,
332 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
333 pstr->wcs[byte_idx++] = wcu;
334 /* Write paddings. */
335 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
336 pstr->wcs[byte_idx++] = WEOF;
338 else if (mbclen == (size_t) -1 || mbclen == 0
339 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 /* It is an invalid character, an incomplete character
342 at the end of the string, or '\0'. Just use the byte. */
343 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
344 pstr->mbs[byte_idx] = ch;
345 /* And also cast it to wide char. */
346 pstr->wcs[byte_idx++] = (wchar_t) ch;
347 if (BE (mbclen == (size_t) -1, 0))
348 pstr->cur_state = prev_st;
350 else
352 /* The buffer doesn't have enough space, finish to build. */
353 pstr->cur_state = prev_st;
354 break;
357 pstr->valid_len = byte_idx;
358 pstr->valid_raw_len = byte_idx;
359 return REG_NOERROR;
361 else
362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 wchar_t wc;
365 const char *p;
366 offsets_needed:
367 remain_len = end_idx - byte_idx;
368 prev_st = pstr->cur_state;
369 if (BE (pstr->trans != NULL, 0))
371 int i, ch;
373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376 buf[i] = pstr->trans[ch];
378 p = (const char *) buf;
380 else
381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383 if (BE (mbclen + 2 > 2, 1))
385 wchar_t wcu = wc;
386 if (iswlower (wc))
388 size_t mbcdlen;
390 wcu = towupper (wc);
391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392 if (BE (mbclen == mbcdlen, 1))
393 memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 else if (mbcdlen != (size_t) -1)
396 size_t i;
398 if (byte_idx + mbcdlen > pstr->bufs_len)
400 pstr->cur_state = prev_st;
401 break;
404 if (pstr->offsets == NULL)
406 pstr->offsets = re_malloc (int, pstr->bufs_len);
408 if (pstr->offsets == NULL)
409 return REG_ESPACE;
411 if (!pstr->offsets_needed)
413 for (i = 0; i < (size_t) byte_idx; ++i)
414 pstr->offsets[i] = i;
415 pstr->offsets_needed = 1;
418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 pstr->wcs[byte_idx] = wcu;
420 pstr->offsets[byte_idx] = src_idx;
421 for (i = 1; i < mbcdlen; ++i)
423 pstr->offsets[byte_idx + i]
424 = src_idx + (i < mbclen ? i : mbclen - 1);
425 pstr->wcs[byte_idx + i] = WEOF;
427 pstr->len += mbcdlen - mbclen;
428 if (pstr->raw_stop > src_idx)
429 pstr->stop += mbcdlen - mbclen;
430 end_idx = (pstr->bufs_len > pstr->len)
431 ? pstr->len : pstr->bufs_len;
432 byte_idx += mbcdlen;
433 src_idx += mbclen;
434 continue;
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 else
440 memcpy (pstr->mbs + byte_idx, p, mbclen);
442 if (BE (pstr->offsets_needed != 0, 0))
444 size_t i;
445 for (i = 0; i < mbclen; ++i)
446 pstr->offsets[byte_idx + i] = src_idx + i;
448 src_idx += mbclen;
450 pstr->wcs[byte_idx++] = wcu;
451 /* Write paddings. */
452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 pstr->wcs[byte_idx++] = WEOF;
455 else if (mbclen == (size_t) -1 || mbclen == 0
456 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
461 if (BE (pstr->trans != NULL, 0))
462 ch = pstr->trans [ch];
463 pstr->mbs[byte_idx] = ch;
465 if (BE (pstr->offsets_needed != 0, 0))
466 pstr->offsets[byte_idx] = src_idx;
467 ++src_idx;
469 /* And also cast it to wide char. */
470 pstr->wcs[byte_idx++] = (wchar_t) ch;
471 if (BE (mbclen == (size_t) -1, 0))
472 pstr->cur_state = prev_st;
474 else
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr->cur_state = prev_st;
478 break;
481 pstr->valid_len = byte_idx;
482 pstr->valid_raw_len = src_idx;
483 return REG_NOERROR;
486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 Return the index. */
489 static int
490 internal_function
491 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
493 mbstate_t prev_st;
494 int rawbuf_idx;
495 size_t mbclen;
496 wint_t wc = WEOF;
498 /* Skip the characters which are not necessary to check. */
499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500 rawbuf_idx < new_raw_idx;)
502 wchar_t wc2;
503 int remain_len = pstr->len - rawbuf_idx;
504 prev_st = pstr->cur_state;
505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506 remain_len, &pstr->cur_state);
507 if (BE ((ssize_t) mbclen <= 0, 0))
509 /* We treat these cases as a single byte character. */
510 if (mbclen == 0 || remain_len == 0)
511 wc = L'\0';
512 else
513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 mbclen = 1;
515 pstr->cur_state = prev_st;
517 else
518 wc = (wint_t) wc2;
519 /* Then proceed the next character. */
520 rawbuf_idx += mbclen;
522 *last_wc = wc;
523 return rawbuf_idx;
525 #endif /* RE_ENABLE_I18N */
527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
528 This function is used in case of REG_ICASE. */
530 static void
531 internal_function
532 build_upper_buffer (re_string_t *pstr)
534 int char_idx, end_idx;
535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540 if (BE (pstr->trans != NULL, 0))
541 ch = pstr->trans[ch];
542 if (islower (ch))
543 pstr->mbs[char_idx] = toupper (ch);
544 else
545 pstr->mbs[char_idx] = ch;
547 pstr->valid_len = char_idx;
548 pstr->valid_raw_len = char_idx;
551 /* Apply TRANS to the buffer in PSTR. */
553 static void
554 internal_function
555 re_string_translate_buffer (re_string_t *pstr)
557 int buf_idx, end_idx;
558 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
563 pstr->mbs[buf_idx] = pstr->trans[ch];
566 pstr->valid_len = buf_idx;
567 pstr->valid_raw_len = buf_idx;
570 /* This function re-construct the buffers.
571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572 convert to upper case in case of REG_ICASE, apply translation. */
574 static reg_errcode_t
575 internal_function __attribute_warn_unused_result__
576 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
578 int offset = idx - pstr->raw_mbs_idx;
579 if (BE (offset < 0, 0))
581 /* Reset buffer. */
582 #ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585 #endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
599 if (BE (offset != 0, 1))
601 /* Should the already checked characters be kept? */
602 if (BE (offset < pstr->valid_raw_len, 1))
604 /* Yes, move them to the front of the buffer. */
605 #ifdef RE_ENABLE_I18N
606 if (BE (pstr->offsets_needed, 0))
608 int low = 0, high = pstr->valid_len, mid;
611 mid = (high + low) / 2;
612 if (pstr->offsets[mid] > offset)
613 high = mid;
614 else if (pstr->offsets[mid] < offset)
615 low = mid + 1;
616 else
617 break;
619 while (low < high);
620 if (pstr->offsets[mid] < offset)
621 ++mid;
622 pstr->tip_context = re_string_context_at (pstr, mid - 1,
623 eflags);
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr->valid_len > offset
629 && mid == offset && pstr->offsets[mid] == offset)
631 memmove (pstr->wcs, pstr->wcs + offset,
632 (pstr->valid_len - offset) * sizeof (wint_t));
633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634 pstr->valid_len -= offset;
635 pstr->valid_raw_len -= offset;
636 for (low = 0; low < pstr->valid_len; low++)
637 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
639 else
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr->len = pstr->raw_len - idx + offset;
644 pstr->stop = pstr->raw_stop - idx + offset;
645 pstr->offsets_needed = 0;
646 while (mid > 0 && pstr->offsets[mid - 1] == offset)
647 --mid;
648 while (mid < pstr->valid_len)
649 if (pstr->wcs[mid] != WEOF)
650 break;
651 else
652 ++mid;
653 if (mid == pstr->valid_len)
654 pstr->valid_len = 0;
655 else
657 pstr->valid_len = pstr->offsets[mid] - offset;
658 if (pstr->valid_len)
660 for (low = 0; low < pstr->valid_len; ++low)
661 pstr->wcs[low] = WEOF;
662 memset (pstr->mbs, 255, pstr->valid_len);
665 pstr->valid_raw_len = pstr->valid_len;
668 else
669 #endif
671 pstr->tip_context = re_string_context_at (pstr, offset - 1,
672 eflags);
673 #ifdef RE_ENABLE_I18N
674 if (pstr->mb_cur_max > 1)
675 memmove (pstr->wcs, pstr->wcs + offset,
676 (pstr->valid_len - offset) * sizeof (wint_t));
677 #endif /* RE_ENABLE_I18N */
678 if (BE (pstr->mbs_allocated, 0))
679 memmove (pstr->mbs, pstr->mbs + offset,
680 pstr->valid_len - offset);
681 pstr->valid_len -= offset;
682 pstr->valid_raw_len -= offset;
683 #if DEBUG
684 assert (pstr->valid_len > 0);
685 #endif
688 else
690 /* No, skip all characters until IDX. */
691 int prev_valid_len = pstr->valid_len;
693 #ifdef RE_ENABLE_I18N
694 if (BE (pstr->offsets_needed, 0))
696 pstr->len = pstr->raw_len - idx + offset;
697 pstr->stop = pstr->raw_stop - idx + offset;
698 pstr->offsets_needed = 0;
700 #endif
701 pstr->valid_len = 0;
702 #ifdef RE_ENABLE_I18N
703 if (pstr->mb_cur_max > 1)
705 int wcs_idx;
706 wint_t wc = WEOF;
708 if (pstr->is_utf8)
710 const unsigned char *raw, *p, *end;
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715 end = raw + (offset - pstr->mb_cur_max);
716 if (end < pstr->raw_mbs)
717 end = pstr->raw_mbs;
718 p = raw + offset - 1;
719 #ifdef _LIBC
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p) && BE (pstr->trans == NULL, 1))
724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
726 wc = (wchar_t) *p;
728 else
729 #endif
730 for (; p >= end; --p)
731 if ((*p & 0xc0) != 0x80)
733 mbstate_t cur_state;
734 wchar_t wc2;
735 int mlen = raw + pstr->len - p;
736 unsigned char buf[6];
737 size_t mbclen;
739 if (BE (pstr->trans != NULL, 0))
741 int i = mlen < 6 ? mlen : 6;
742 while (--i >= 0)
743 buf[i] = pstr->trans[p[i]];
745 /* XXX Don't use mbrtowc, we know which conversion
746 to use (UTF-8 -> UCS4). */
747 memset (&cur_state, 0, sizeof (cur_state));
748 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
749 &cur_state);
750 if (raw + offset - p <= mbclen
751 && mbclen < (size_t) -2)
753 memset (&pstr->cur_state, '\0',
754 sizeof (mbstate_t));
755 pstr->valid_len = mbclen - (raw + offset - p);
756 wc = wc2;
758 break;
762 if (wc == WEOF)
763 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
764 if (wc == WEOF)
765 pstr->tip_context
766 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
767 else
768 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
769 && IS_WIDE_WORD_CHAR (wc))
770 ? CONTEXT_WORD
771 : ((IS_WIDE_NEWLINE (wc)
772 && pstr->newline_anchor)
773 ? CONTEXT_NEWLINE : 0));
774 if (BE (pstr->valid_len, 0))
776 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
777 pstr->wcs[wcs_idx] = WEOF;
778 if (pstr->mbs_allocated)
779 memset (pstr->mbs, 255, pstr->valid_len);
781 pstr->valid_raw_len = pstr->valid_len;
783 else
784 #endif /* RE_ENABLE_I18N */
786 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
787 pstr->valid_raw_len = 0;
788 if (pstr->trans)
789 c = pstr->trans[c];
790 pstr->tip_context = (bitset_contain (pstr->word_char, c)
791 ? CONTEXT_WORD
792 : ((IS_NEWLINE (c) && pstr->newline_anchor)
793 ? CONTEXT_NEWLINE : 0));
796 if (!BE (pstr->mbs_allocated, 0))
797 pstr->mbs += offset;
799 pstr->raw_mbs_idx = idx;
800 pstr->len -= offset;
801 pstr->stop -= offset;
803 /* Then build the buffers. */
804 #ifdef RE_ENABLE_I18N
805 if (pstr->mb_cur_max > 1)
807 if (pstr->icase)
809 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
810 if (BE (ret != REG_NOERROR, 0))
811 return ret;
813 else
814 build_wcs_buffer (pstr);
816 else
817 #endif /* RE_ENABLE_I18N */
818 if (BE (pstr->mbs_allocated, 0))
820 if (pstr->icase)
821 build_upper_buffer (pstr);
822 else if (pstr->trans != NULL)
823 re_string_translate_buffer (pstr);
825 else
826 pstr->valid_len = pstr->len;
828 pstr->cur_idx = 0;
829 return REG_NOERROR;
832 static unsigned char
833 internal_function __attribute ((pure))
834 re_string_peek_byte_case (const re_string_t *pstr, int idx)
836 int ch, off;
838 /* Handle the common (easiest) cases first. */
839 if (BE (!pstr->mbs_allocated, 1))
840 return re_string_peek_byte (pstr, idx);
842 #ifdef RE_ENABLE_I18N
843 if (pstr->mb_cur_max > 1
844 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
845 return re_string_peek_byte (pstr, idx);
846 #endif
848 off = pstr->cur_idx + idx;
849 #ifdef RE_ENABLE_I18N
850 if (pstr->offsets_needed)
851 off = pstr->offsets[off];
852 #endif
854 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
856 #ifdef RE_ENABLE_I18N
857 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858 this function returns CAPITAL LETTER I instead of first byte of
859 DOTLESS SMALL LETTER I. The latter would confuse the parser,
860 since peek_byte_case doesn't advance cur_idx in any way. */
861 if (pstr->offsets_needed && !isascii (ch))
862 return re_string_peek_byte (pstr, idx);
863 #endif
865 return ch;
868 static unsigned char
869 internal_function __attribute ((pure))
870 re_string_fetch_byte_case (re_string_t *pstr)
872 if (BE (!pstr->mbs_allocated, 1))
873 return re_string_fetch_byte (pstr);
875 #ifdef RE_ENABLE_I18N
876 if (pstr->offsets_needed)
878 int off, ch;
880 /* For tr_TR.UTF-8 [[:islower:]] there is
881 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
882 in that case the whole multi-byte character and return
883 the original letter. On the other side, with
884 [[: DOTLESS SMALL LETTER I return [[:I, as doing
885 anything else would complicate things too much. */
887 if (!re_string_first_byte (pstr, pstr->cur_idx))
888 return re_string_fetch_byte (pstr);
890 off = pstr->offsets[pstr->cur_idx];
891 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893 if (! isascii (ch))
894 return re_string_fetch_byte (pstr);
896 re_string_skip_bytes (pstr,
897 re_string_char_size_at (pstr, pstr->cur_idx));
898 return ch;
900 #endif
902 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
905 static void
906 internal_function
907 re_string_destruct (re_string_t *pstr)
909 #ifdef RE_ENABLE_I18N
910 re_free (pstr->wcs);
911 re_free (pstr->offsets);
912 #endif /* RE_ENABLE_I18N */
913 if (pstr->mbs_allocated)
914 re_free (pstr->mbs);
917 /* Return the context at IDX in INPUT. */
919 static unsigned int
920 internal_function
921 re_string_context_at (const re_string_t *input, int idx, int eflags)
923 int c;
924 if (BE (idx < 0, 0))
925 /* In this case, we use the value stored in input->tip_context,
926 since we can't know the character in input->mbs[-1] here. */
927 return input->tip_context;
928 if (BE (idx == input->len, 0))
929 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
930 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
931 #ifdef RE_ENABLE_I18N
932 if (input->mb_cur_max > 1)
934 wint_t wc;
935 int wc_idx = idx;
936 while(input->wcs[wc_idx] == WEOF)
938 #ifdef DEBUG
939 /* It must not happen. */
940 assert (wc_idx >= 0);
941 #endif
942 --wc_idx;
943 if (wc_idx < 0)
944 return input->tip_context;
946 wc = input->wcs[wc_idx];
947 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
948 return CONTEXT_WORD;
949 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
950 ? CONTEXT_NEWLINE : 0);
952 else
953 #endif
955 c = re_string_byte_at (input, idx);
956 if (bitset_contain (input->word_char, c))
957 return CONTEXT_WORD;
958 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962 /* Functions for set operation. */
964 static reg_errcode_t
965 internal_function __attribute_warn_unused_result__
966 re_node_set_alloc (re_node_set *set, int size)
968 set->alloc = size;
969 set->nelem = 0;
970 set->elems = re_malloc (int, size);
971 if (BE (set->elems == NULL, 0))
972 return REG_ESPACE;
973 return REG_NOERROR;
976 static reg_errcode_t
977 internal_function __attribute_warn_unused_result__
978 re_node_set_init_1 (re_node_set *set, int elem)
980 set->alloc = 1;
981 set->nelem = 1;
982 set->elems = re_malloc (int, 1);
983 if (BE (set->elems == NULL, 0))
985 set->alloc = set->nelem = 0;
986 return REG_ESPACE;
988 set->elems[0] = elem;
989 return REG_NOERROR;
992 static reg_errcode_t
993 internal_function __attribute_warn_unused_result__
994 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
996 set->alloc = 2;
997 set->elems = re_malloc (int, 2);
998 if (BE (set->elems == NULL, 0))
999 return REG_ESPACE;
1000 if (elem1 == elem2)
1002 set->nelem = 1;
1003 set->elems[0] = elem1;
1005 else
1007 set->nelem = 2;
1008 if (elem1 < elem2)
1010 set->elems[0] = elem1;
1011 set->elems[1] = elem2;
1013 else
1015 set->elems[0] = elem2;
1016 set->elems[1] = elem1;
1019 return REG_NOERROR;
1022 static reg_errcode_t
1023 internal_function __attribute_warn_unused_result__
1024 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 dest->nelem = src->nelem;
1027 if (src->nelem > 0)
1029 dest->alloc = dest->nelem;
1030 dest->elems = re_malloc (int, dest->alloc);
1031 if (BE (dest->elems == NULL, 0))
1033 dest->alloc = dest->nelem = 0;
1034 return REG_ESPACE;
1036 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1038 else
1039 re_node_set_init_empty (dest);
1040 return REG_NOERROR;
1043 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1044 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1045 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1047 static reg_errcode_t
1048 internal_function __attribute_warn_unused_result__
1049 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1050 const re_node_set *src2)
1052 int i1, i2, is, id, delta, sbase;
1053 if (src1->nelem == 0 || src2->nelem == 0)
1054 return REG_NOERROR;
1056 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1057 conservative estimate. */
1058 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1061 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1062 if (BE (new_elems == NULL, 0))
1063 return REG_ESPACE;
1064 dest->elems = new_elems;
1065 dest->alloc = new_alloc;
1068 /* Find the items in the intersection of SRC1 and SRC2, and copy
1069 into the top of DEST those that are not already in DEST itself. */
1070 sbase = dest->nelem + src1->nelem + src2->nelem;
1071 i1 = src1->nelem - 1;
1072 i2 = src2->nelem - 1;
1073 id = dest->nelem - 1;
1074 for (;;)
1076 if (src1->elems[i1] == src2->elems[i2])
1078 /* Try to find the item in DEST. Maybe we could binary search? */
1079 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1080 --id;
1082 if (id < 0 || dest->elems[id] != src1->elems[i1])
1083 dest->elems[--sbase] = src1->elems[i1];
1085 if (--i1 < 0 || --i2 < 0)
1086 break;
1089 /* Lower the highest of the two items. */
1090 else if (src1->elems[i1] < src2->elems[i2])
1092 if (--i2 < 0)
1093 break;
1095 else
1097 if (--i1 < 0)
1098 break;
1102 id = dest->nelem - 1;
1103 is = dest->nelem + src1->nelem + src2->nelem - 1;
1104 delta = is - sbase + 1;
1106 /* Now copy. When DELTA becomes zero, the remaining
1107 DEST elements are already in place; this is more or
1108 less the same loop that is in re_node_set_merge. */
1109 dest->nelem += delta;
1110 if (delta > 0 && id >= 0)
1111 for (;;)
1113 if (dest->elems[is] > dest->elems[id])
1115 /* Copy from the top. */
1116 dest->elems[id + delta--] = dest->elems[is--];
1117 if (delta == 0)
1118 break;
1120 else
1122 /* Slide from the bottom. */
1123 dest->elems[id + delta] = dest->elems[id];
1124 if (--id < 0)
1125 break;
1129 /* Copy remaining SRC elements. */
1130 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1132 return REG_NOERROR;
1135 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1136 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1138 static reg_errcode_t
1139 internal_function __attribute_warn_unused_result__
1140 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1141 const re_node_set *src2)
1143 int i1, i2, id;
1144 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146 dest->alloc = src1->nelem + src2->nelem;
1147 dest->elems = re_malloc (int, dest->alloc);
1148 if (BE (dest->elems == NULL, 0))
1149 return REG_ESPACE;
1151 else
1153 if (src1 != NULL && src1->nelem > 0)
1154 return re_node_set_init_copy (dest, src1);
1155 else if (src2 != NULL && src2->nelem > 0)
1156 return re_node_set_init_copy (dest, src2);
1157 else
1158 re_node_set_init_empty (dest);
1159 return REG_NOERROR;
1161 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163 if (src1->elems[i1] > src2->elems[i2])
1165 dest->elems[id++] = src2->elems[i2++];
1166 continue;
1168 if (src1->elems[i1] == src2->elems[i2])
1169 ++i2;
1170 dest->elems[id++] = src1->elems[i1++];
1172 if (i1 < src1->nelem)
1174 memcpy (dest->elems + id, src1->elems + i1,
1175 (src1->nelem - i1) * sizeof (int));
1176 id += src1->nelem - i1;
1178 else if (i2 < src2->nelem)
1180 memcpy (dest->elems + id, src2->elems + i2,
1181 (src2->nelem - i2) * sizeof (int));
1182 id += src2->nelem - i2;
1184 dest->nelem = id;
1185 return REG_NOERROR;
1188 /* Calculate the union set of the sets DEST and SRC. And store it to
1189 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1191 static reg_errcode_t
1192 internal_function __attribute_warn_unused_result__
1193 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 int is, id, sbase, delta;
1196 if (src == NULL || src->nelem == 0)
1197 return REG_NOERROR;
1198 if (dest->alloc < 2 * src->nelem + dest->nelem)
1200 int new_alloc = 2 * (src->nelem + dest->alloc);
1201 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1202 if (BE (new_buffer == NULL, 0))
1203 return REG_ESPACE;
1204 dest->elems = new_buffer;
1205 dest->alloc = new_alloc;
1208 if (BE (dest->nelem == 0, 0))
1210 dest->nelem = src->nelem;
1211 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1212 return REG_NOERROR;
1215 /* Copy into the top of DEST the items of SRC that are not
1216 found in DEST. Maybe we could binary search in DEST? */
1217 for (sbase = dest->nelem + 2 * src->nelem,
1218 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220 if (dest->elems[id] == src->elems[is])
1221 is--, id--;
1222 else if (dest->elems[id] < src->elems[is])
1223 dest->elems[--sbase] = src->elems[is--];
1224 else /* if (dest->elems[id] > src->elems[is]) */
1225 --id;
1228 if (is >= 0)
1230 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1231 sbase -= is + 1;
1232 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1235 id = dest->nelem - 1;
1236 is = dest->nelem + 2 * src->nelem - 1;
1237 delta = is - sbase + 1;
1238 if (delta == 0)
1239 return REG_NOERROR;
1241 /* Now copy. When DELTA becomes zero, the remaining
1242 DEST elements are already in place. */
1243 dest->nelem += delta;
1244 for (;;)
1246 if (dest->elems[is] > dest->elems[id])
1248 /* Copy from the top. */
1249 dest->elems[id + delta--] = dest->elems[is--];
1250 if (delta == 0)
1251 break;
1253 else
1255 /* Slide from the bottom. */
1256 dest->elems[id + delta] = dest->elems[id];
1257 if (--id < 0)
1259 /* Copy remaining SRC elements. */
1260 memcpy (dest->elems, dest->elems + sbase,
1261 delta * sizeof (int));
1262 break;
1267 return REG_NOERROR;
1270 /* Insert the new element ELEM to the re_node_set* SET.
1271 SET should not already have ELEM.
1272 return -1 if an error is occured, return 1 otherwise. */
1274 static int
1275 internal_function __attribute_warn_unused_result__
1276 re_node_set_insert (re_node_set *set, int elem)
1278 int idx;
1279 /* In case the set is empty. */
1280 if (set->alloc == 0)
1282 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1283 return 1;
1284 else
1285 return -1;
1288 if (BE (set->nelem, 0) == 0)
1290 /* We already guaranteed above that set->alloc != 0. */
1291 set->elems[0] = elem;
1292 ++set->nelem;
1293 return 1;
1296 /* Realloc if we need. */
1297 if (set->alloc == set->nelem)
1299 int *new_elems;
1300 set->alloc = set->alloc * 2;
1301 new_elems = re_realloc (set->elems, int, set->alloc);
1302 if (BE (new_elems == NULL, 0))
1303 return -1;
1304 set->elems = new_elems;
1307 /* Move the elements which follows the new element. Test the
1308 first element separately to skip a check in the inner loop. */
1309 if (elem < set->elems[0])
1311 idx = 0;
1312 for (idx = set->nelem; idx > 0; idx--)
1313 set->elems[idx] = set->elems[idx - 1];
1315 else
1317 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1318 set->elems[idx] = set->elems[idx - 1];
1321 /* Insert the new element. */
1322 set->elems[idx] = elem;
1323 ++set->nelem;
1324 return 1;
1327 /* Insert the new element ELEM to the re_node_set* SET.
1328 SET should not already have any element greater than or equal to ELEM.
1329 Return -1 if an error is occured, return 1 otherwise. */
1331 static int
1332 internal_function __attribute_warn_unused_result__
1333 re_node_set_insert_last (re_node_set *set, int elem)
1335 /* Realloc if we need. */
1336 if (set->alloc == set->nelem)
1338 int *new_elems;
1339 set->alloc = (set->alloc + 1) * 2;
1340 new_elems = re_realloc (set->elems, int, set->alloc);
1341 if (BE (new_elems == NULL, 0))
1342 return -1;
1343 set->elems = new_elems;
1346 /* Insert the new element. */
1347 set->elems[set->nelem++] = elem;
1348 return 1;
1351 /* Compare two node sets SET1 and SET2.
1352 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1354 static int
1355 internal_function __attribute ((pure))
1356 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 int i;
1359 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1360 return 0;
1361 for (i = set1->nelem ; --i >= 0 ; )
1362 if (set1->elems[i] != set2->elems[i])
1363 return 0;
1364 return 1;
1367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369 static int
1370 internal_function __attribute ((pure))
1371 re_node_set_contains (const re_node_set *set, int elem)
1373 unsigned int idx, right, mid;
1374 if (set->nelem <= 0)
1375 return 0;
1377 /* Binary search the element. */
1378 idx = 0;
1379 right = set->nelem - 1;
1380 while (idx < right)
1382 mid = (idx + right) / 2;
1383 if (set->elems[mid] < elem)
1384 idx = mid + 1;
1385 else
1386 right = mid;
1388 return set->elems[idx] == elem ? idx + 1 : 0;
1391 static void
1392 internal_function
1393 re_node_set_remove_at (re_node_set *set, int idx)
1395 if (idx < 0 || idx >= set->nelem)
1396 return;
1397 --set->nelem;
1398 for (; idx < set->nelem; idx++)
1399 set->elems[idx] = set->elems[idx + 1];
1403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404 Or return -1, if an error will be occured. */
1406 static int
1407 internal_function
1408 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 int type = token.type;
1411 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414 int *new_nexts, *new_indices;
1415 re_node_set *new_edests, *new_eclosures;
1416 re_token_t *new_nodes;
1418 /* Avoid overflows in realloc. */
1419 const size_t max_object_size = MAX (sizeof (re_token_t),
1420 MAX (sizeof (re_node_set),
1421 sizeof (int)));
1422 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1423 return -1;
1425 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426 if (BE (new_nodes == NULL, 0))
1427 return -1;
1428 dfa->nodes = new_nodes;
1429 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1430 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1431 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433 if (BE (new_nexts == NULL || new_indices == NULL
1434 || new_edests == NULL || new_eclosures == NULL, 0))
1435 return -1;
1436 dfa->nexts = new_nexts;
1437 dfa->org_indices = new_indices;
1438 dfa->edests = new_edests;
1439 dfa->eclosures = new_eclosures;
1440 dfa->nodes_alloc = new_nodes_alloc;
1442 dfa->nodes[dfa->nodes_len] = token;
1443 dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445 dfa->nodes[dfa->nodes_len].accept_mb =
1446 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1447 #endif
1448 dfa->nexts[dfa->nodes_len] = -1;
1449 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1450 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1451 return dfa->nodes_len++;
1454 static inline unsigned int
1455 internal_function
1456 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 unsigned int hash = nodes->nelem + context;
1459 int i;
1460 for (i = 0 ; i < nodes->nelem ; i++)
1461 hash += nodes->elems[i];
1462 return hash;
1465 /* Search for the state whose node_set is equivalent to NODES.
1466 Return the pointer to the state, if we found it in the DFA.
1467 Otherwise create the new one and return it. In case of an error
1468 return NULL and set the error code in ERR.
1469 Note: - We assume NULL as the invalid state, then it is possible that
1470 return value is NULL and ERR is REG_NOERROR.
1471 - We never return non-NULL value in case of any errors, it is for
1472 optimization. */
1474 static re_dfastate_t *
1475 internal_function __attribute_warn_unused_result__
1476 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1477 const re_node_set *nodes)
1479 unsigned int hash;
1480 re_dfastate_t *new_state;
1481 struct re_state_table_entry *spot;
1482 int i;
1483 if (BE (nodes->nelem == 0, 0))
1485 *err = REG_NOERROR;
1486 return NULL;
1488 hash = calc_state_hash (nodes, 0);
1489 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491 for (i = 0 ; i < spot->num ; i++)
1493 re_dfastate_t *state = spot->array[i];
1494 if (hash != state->hash)
1495 continue;
1496 if (re_node_set_compare (&state->nodes, nodes))
1497 return state;
1500 /* There are no appropriate state in the dfa, create the new one. */
1501 new_state = create_ci_newstate (dfa, nodes, hash);
1502 if (BE (new_state == NULL, 0))
1503 *err = REG_ESPACE;
1505 return new_state;
1508 /* Search for the state whose node_set is equivalent to NODES and
1509 whose context is equivalent to CONTEXT.
1510 Return the pointer to the state, if we found it in the DFA.
1511 Otherwise create the new one and return it. In case of an error
1512 return NULL and set the error code in ERR.
1513 Note: - We assume NULL as the invalid state, then it is possible that
1514 return value is NULL and ERR is REG_NOERROR.
1515 - We never return non-NULL value in case of any errors, it is for
1516 optimization. */
1518 static re_dfastate_t *
1519 internal_function __attribute_warn_unused_result__
1520 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1521 const re_node_set *nodes, unsigned int context)
1523 unsigned int hash;
1524 re_dfastate_t *new_state;
1525 struct re_state_table_entry *spot;
1526 int i;
1527 if (nodes->nelem == 0)
1529 *err = REG_NOERROR;
1530 return NULL;
1532 hash = calc_state_hash (nodes, context);
1533 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1535 for (i = 0 ; i < spot->num ; i++)
1537 re_dfastate_t *state = spot->array[i];
1538 if (state->hash == hash
1539 && state->context == context
1540 && re_node_set_compare (state->entrance_nodes, nodes))
1541 return state;
1543 /* There are no appropriate state in `dfa', create the new one. */
1544 new_state = create_cd_newstate (dfa, nodes, context, hash);
1545 if (BE (new_state == NULL, 0))
1546 *err = REG_ESPACE;
1548 return new_state;
1551 /* Finish initialization of the new state NEWSTATE, and using its hash value
1552 HASH put in the appropriate bucket of DFA's state table. Return value
1553 indicates the error code if failed. */
1555 static reg_errcode_t
1556 __attribute_warn_unused_result__
1557 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1558 unsigned int hash)
1560 struct re_state_table_entry *spot;
1561 reg_errcode_t err;
1562 int i;
1564 newstate->hash = hash;
1565 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1566 if (BE (err != REG_NOERROR, 0))
1567 return REG_ESPACE;
1568 for (i = 0; i < newstate->nodes.nelem; i++)
1570 int elem = newstate->nodes.elems[i];
1571 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1572 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1573 return REG_ESPACE;
1576 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1577 if (BE (spot->alloc <= spot->num, 0))
1579 int new_alloc = 2 * spot->num + 2;
1580 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1581 new_alloc);
1582 if (BE (new_array == NULL, 0))
1583 return REG_ESPACE;
1584 spot->array = new_array;
1585 spot->alloc = new_alloc;
1587 spot->array[spot->num++] = newstate;
1588 return REG_NOERROR;
1591 static void
1592 free_state (re_dfastate_t *state)
1594 re_node_set_free (&state->non_eps_nodes);
1595 re_node_set_free (&state->inveclosure);
1596 if (state->entrance_nodes != &state->nodes)
1598 re_node_set_free (state->entrance_nodes);
1599 re_free (state->entrance_nodes);
1601 re_node_set_free (&state->nodes);
1602 re_free (state->word_trtable);
1603 re_free (state->trtable);
1604 re_free (state);
1607 /* Create the new state which is independ of contexts.
1608 Return the new state if succeeded, otherwise return NULL. */
1610 static re_dfastate_t *
1611 internal_function __attribute_warn_unused_result__
1612 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1613 unsigned int hash)
1615 int i;
1616 reg_errcode_t err;
1617 re_dfastate_t *newstate;
1619 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1620 if (BE (newstate == NULL, 0))
1621 return NULL;
1622 err = re_node_set_init_copy (&newstate->nodes, nodes);
1623 if (BE (err != REG_NOERROR, 0))
1625 re_free (newstate);
1626 return NULL;
1629 newstate->entrance_nodes = &newstate->nodes;
1630 for (i = 0 ; i < nodes->nelem ; i++)
1632 re_token_t *node = dfa->nodes + nodes->elems[i];
1633 re_token_type_t type = node->type;
1634 if (type == CHARACTER && !node->constraint)
1635 continue;
1636 #ifdef RE_ENABLE_I18N
1637 newstate->accept_mb |= node->accept_mb;
1638 #endif /* RE_ENABLE_I18N */
1640 /* If the state has the halt node, the state is a halt state. */
1641 if (type == END_OF_RE)
1642 newstate->halt = 1;
1643 else if (type == OP_BACK_REF)
1644 newstate->has_backref = 1;
1645 else if (type == ANCHOR || node->constraint)
1646 newstate->has_constraint = 1;
1648 err = register_state (dfa, newstate, hash);
1649 if (BE (err != REG_NOERROR, 0))
1651 free_state (newstate);
1652 newstate = NULL;
1654 return newstate;
1657 /* Create the new state which is depend on the context CONTEXT.
1658 Return the new state if succeeded, otherwise return NULL. */
1660 static re_dfastate_t *
1661 internal_function __attribute_warn_unused_result__
1662 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1663 unsigned int context, unsigned int hash)
1665 int i, nctx_nodes = 0;
1666 reg_errcode_t err;
1667 re_dfastate_t *newstate;
1669 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1670 if (BE (newstate == NULL, 0))
1671 return NULL;
1672 err = re_node_set_init_copy (&newstate->nodes, nodes);
1673 if (BE (err != REG_NOERROR, 0))
1675 re_free (newstate);
1676 return NULL;
1679 newstate->context = context;
1680 newstate->entrance_nodes = &newstate->nodes;
1682 for (i = 0 ; i < nodes->nelem ; i++)
1684 re_token_t *node = dfa->nodes + nodes->elems[i];
1685 re_token_type_t type = node->type;
1686 unsigned int constraint = node->constraint;
1688 if (type == CHARACTER && !constraint)
1689 continue;
1690 #ifdef RE_ENABLE_I18N
1691 newstate->accept_mb |= node->accept_mb;
1692 #endif /* RE_ENABLE_I18N */
1694 /* If the state has the halt node, the state is a halt state. */
1695 if (type == END_OF_RE)
1696 newstate->halt = 1;
1697 else if (type == OP_BACK_REF)
1698 newstate->has_backref = 1;
1700 if (constraint)
1702 if (newstate->entrance_nodes == &newstate->nodes)
1704 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1705 if (BE (newstate->entrance_nodes == NULL, 0))
1707 free_state (newstate);
1708 return NULL;
1710 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1711 != REG_NOERROR)
1712 return NULL;
1713 nctx_nodes = 0;
1714 newstate->has_constraint = 1;
1717 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1719 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1720 ++nctx_nodes;
1724 err = register_state (dfa, newstate, hash);
1725 if (BE (err != REG_NOERROR, 0))
1727 free_state (newstate);
1728 newstate = NULL;
1730 return newstate;