Merge branch 'maint'
[git/gitweb.git] / compat / regex / regex_internal.c
blob193854cf5b609d714b03218bd507af840d478c6e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 #ifdef GAWK
34 #undef MAX /* safety */
35 static int
36 MAX(size_t a, size_t b)
38 return (a > b ? a : b);
40 #endif
42 /* Functions for string operation. */
44 /* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
47 static reg_errcode_t
48 internal_function
49 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
50 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
52 reg_errcode_t ret;
53 int init_buf_len;
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len < dfa->mb_cur_max)
57 init_len = dfa->mb_cur_max;
58 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59 re_string_construct_common (str, len, pstr, trans, icase, dfa);
61 ret = re_string_realloc_buffers (pstr, init_buf_len);
62 if (BE (ret != REG_NOERROR, 0))
63 return ret;
65 pstr->word_char = dfa->word_char;
66 pstr->word_ops_used = dfa->word_ops_used;
67 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69 pstr->valid_raw_len = pstr->valid_len;
70 return REG_NOERROR;
73 /* This function allocate the buffers, and initialize them. */
75 static reg_errcode_t
76 internal_function
77 re_string_construct (re_string_t *pstr, const char *str, int len,
78 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
80 reg_errcode_t ret;
81 memset (pstr, '\0', sizeof (re_string_t));
82 re_string_construct_common (str, len, pstr, trans, icase, dfa);
84 if (len > 0)
86 ret = re_string_realloc_buffers (pstr, len + 1);
87 if (BE (ret != REG_NOERROR, 0))
88 return ret;
90 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
92 if (icase)
94 #ifdef RE_ENABLE_I18N
95 if (dfa->mb_cur_max > 1)
97 while (1)
99 ret = build_wcs_upper_buffer (pstr);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 if (pstr->valid_raw_len >= len)
103 break;
104 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 break;
106 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 if (BE (ret != REG_NOERROR, 0))
108 return ret;
111 else
112 #endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr);
115 else
117 #ifdef RE_ENABLE_I18N
118 if (dfa->mb_cur_max > 1)
119 build_wcs_buffer (pstr);
120 else
121 #endif /* RE_ENABLE_I18N */
123 if (trans != NULL)
124 re_string_translate_buffer (pstr);
125 else
127 pstr->valid_len = pstr->bufs_len;
128 pstr->valid_raw_len = pstr->bufs_len;
133 return REG_NOERROR;
136 /* Helper functions for re_string_allocate, and re_string_construct. */
138 static reg_errcode_t
139 internal_function
140 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
142 #ifdef RE_ENABLE_I18N
143 if (pstr->mb_cur_max > 1)
145 wint_t *new_wcs;
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
149 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
150 return REG_ESPACE;
152 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_wcs == NULL, 0))
154 return REG_ESPACE;
155 pstr->wcs = new_wcs;
156 if (pstr->offsets != NULL)
158 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
159 if (BE (new_offsets == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_offsets;
164 #endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
167 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_mbs == NULL, 0))
170 return REG_ESPACE;
171 pstr->mbs = new_mbs;
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
178 static void
179 internal_function
180 re_string_construct_common (const char *str, int len, re_string_t *pstr,
181 RE_TRANSLATE_TYPE trans, int icase,
182 const re_dfa_t *dfa)
184 pstr->raw_mbs = (const unsigned char *) str;
185 pstr->len = len;
186 pstr->raw_len = len;
187 pstr->trans = trans;
188 pstr->icase = icase ? 1 : 0;
189 pstr->mbs_allocated = (trans != NULL || icase);
190 pstr->mb_cur_max = dfa->mb_cur_max;
191 pstr->is_utf8 = dfa->is_utf8;
192 pstr->map_notascii = dfa->map_notascii;
193 pstr->stop = pstr->len;
194 pstr->raw_stop = pstr->stop;
197 #ifdef RE_ENABLE_I18N
199 /* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
210 static void
211 internal_function
212 build_wcs_buffer (re_string_t *pstr)
214 #ifdef _LIBC
215 unsigned char buf[MB_LEN_MAX];
216 assert (MB_LEN_MAX >= pstr->mb_cur_max);
217 #else
218 unsigned char buf[64];
219 #endif
220 mbstate_t prev_st;
221 int byte_idx, end_idx, remain_len;
222 size_t mbclen;
224 /* Build the buffers from pstr->valid_len to either pstr->len or
225 pstr->bufs_len. */
226 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
227 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229 wchar_t wc;
230 const char *p;
232 remain_len = end_idx - byte_idx;
233 prev_st = pstr->cur_state;
234 /* Apply the translation if we need. */
235 if (BE (pstr->trans != NULL, 0))
237 int i, ch;
239 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
242 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244 p = (const char *) buf;
246 else
247 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
248 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
249 if (BE (mbclen == (size_t) -2, 0))
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr->cur_state = prev_st;
253 break;
255 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
257 /* We treat these cases as a singlebyte character. */
258 mbclen = 1;
259 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
260 if (BE (pstr->trans != NULL, 0))
261 wc = pstr->trans[wc];
262 pstr->cur_state = prev_st;
265 /* Write wide character and padding. */
266 pstr->wcs[byte_idx++] = wc;
267 /* Write paddings. */
268 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
269 pstr->wcs[byte_idx++] = WEOF;
271 pstr->valid_len = byte_idx;
272 pstr->valid_raw_len = byte_idx;
275 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
276 but for REG_ICASE. */
278 static reg_errcode_t
279 internal_function
280 build_wcs_upper_buffer (re_string_t *pstr)
282 mbstate_t prev_st;
283 int src_idx, byte_idx, end_idx, remain_len;
284 size_t mbclen;
285 #ifdef _LIBC
286 char buf[MB_LEN_MAX];
287 assert (MB_LEN_MAX >= pstr->mb_cur_max);
288 #else
289 char buf[64];
290 #endif
292 byte_idx = pstr->valid_len;
293 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295 /* The following optimization assumes that ASCII characters can be
296 mapped to wide characters with a simple cast. */
297 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299 while (byte_idx < end_idx)
301 wchar_t wc;
303 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
304 && mbsinit (&pstr->cur_state))
306 /* In case of a singlebyte character. */
307 pstr->mbs[byte_idx]
308 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
309 /* The next step uses the assumption that wchar_t is encoded
310 ASCII-safe: all ASCII values can be converted like this. */
311 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
312 ++byte_idx;
313 continue;
316 remain_len = end_idx - byte_idx;
317 prev_st = pstr->cur_state;
318 mbclen = __mbrtowc (&wc,
319 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
320 + byte_idx), remain_len, &pstr->cur_state);
321 if (BE (mbclen + 2 > 2, 1))
323 wchar_t wcu = wc;
324 if (iswlower (wc))
326 size_t mbcdlen;
328 wcu = towupper (wc);
329 mbcdlen = wcrtomb (buf, wcu, &prev_st);
330 if (BE (mbclen == mbcdlen, 1))
331 memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 else
334 src_idx = byte_idx;
335 goto offsets_needed;
338 else
339 memcpy (pstr->mbs + byte_idx,
340 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 pstr->wcs[byte_idx++] = wcu;
342 /* Write paddings. */
343 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 pstr->wcs[byte_idx++] = WEOF;
346 else if (mbclen == (size_t) -1 || mbclen == 0)
348 /* It is an invalid character or '\0'. Just use the byte. */
349 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 pstr->mbs[byte_idx] = ch;
351 /* And also cast it to wide char. */
352 pstr->wcs[byte_idx++] = (wchar_t) ch;
353 if (BE (mbclen == (size_t) -1, 0))
354 pstr->cur_state = prev_st;
356 else
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr->cur_state = prev_st;
360 break;
363 pstr->valid_len = byte_idx;
364 pstr->valid_raw_len = byte_idx;
365 return REG_NOERROR;
367 else
368 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
370 wchar_t wc;
371 const char *p;
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
377 int i, ch;
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
384 p = (const char *) buf;
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen + 2 > 2, 1))
391 wchar_t wcu = wc;
392 if (iswlower (wc))
394 size_t mbcdlen;
396 wcu = towupper (wc);
397 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
398 if (BE (mbclen == mbcdlen, 1))
399 memcpy (pstr->mbs + byte_idx, buf, mbclen);
400 else if (mbcdlen != (size_t) -1)
402 size_t i;
404 if (byte_idx + mbcdlen > pstr->bufs_len)
406 pstr->cur_state = prev_st;
407 break;
410 if (pstr->offsets == NULL)
412 pstr->offsets = re_malloc (int, pstr->bufs_len);
414 if (pstr->offsets == NULL)
415 return REG_ESPACE;
417 if (!pstr->offsets_needed)
419 for (i = 0; i < (size_t) byte_idx; ++i)
420 pstr->offsets[i] = i;
421 pstr->offsets_needed = 1;
424 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
425 pstr->wcs[byte_idx] = wcu;
426 pstr->offsets[byte_idx] = src_idx;
427 for (i = 1; i < mbcdlen; ++i)
429 pstr->offsets[byte_idx + i]
430 = src_idx + (i < mbclen ? i : mbclen - 1);
431 pstr->wcs[byte_idx + i] = WEOF;
433 pstr->len += mbcdlen - mbclen;
434 if (pstr->raw_stop > src_idx)
435 pstr->stop += mbcdlen - mbclen;
436 end_idx = (pstr->bufs_len > pstr->len)
437 ? pstr->len : pstr->bufs_len;
438 byte_idx += mbcdlen;
439 src_idx += mbclen;
440 continue;
442 else
443 memcpy (pstr->mbs + byte_idx, p, mbclen);
445 else
446 memcpy (pstr->mbs + byte_idx, p, mbclen);
448 if (BE (pstr->offsets_needed != 0, 0))
450 size_t i;
451 for (i = 0; i < mbclen; ++i)
452 pstr->offsets[byte_idx + i] = src_idx + i;
454 src_idx += mbclen;
456 pstr->wcs[byte_idx++] = wcu;
457 /* Write paddings. */
458 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
459 pstr->wcs[byte_idx++] = WEOF;
461 else if (mbclen == (size_t) -1 || mbclen == 0)
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
466 if (BE (pstr->trans != NULL, 0))
467 ch = pstr->trans [ch];
468 pstr->mbs[byte_idx] = ch;
470 if (BE (pstr->offsets_needed != 0, 0))
471 pstr->offsets[byte_idx] = src_idx;
472 ++src_idx;
474 /* And also cast it to wide char. */
475 pstr->wcs[byte_idx++] = (wchar_t) ch;
476 if (BE (mbclen == (size_t) -1, 0))
477 pstr->cur_state = prev_st;
479 else
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr->cur_state = prev_st;
483 break;
486 pstr->valid_len = byte_idx;
487 pstr->valid_raw_len = src_idx;
488 return REG_NOERROR;
491 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
492 Return the index. */
494 static int
495 internal_function
496 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
498 mbstate_t prev_st;
499 int rawbuf_idx;
500 size_t mbclen;
501 wint_t wc = WEOF;
503 /* Skip the characters which are not necessary to check. */
504 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
505 rawbuf_idx < new_raw_idx;)
507 wchar_t wc2;
508 int remain_len = pstr->len - rawbuf_idx;
509 prev_st = pstr->cur_state;
510 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
511 remain_len, &pstr->cur_state);
512 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
514 /* We treat these cases as a single byte character. */
515 if (mbclen == 0 || remain_len == 0)
516 wc = L'\0';
517 else
518 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
519 mbclen = 1;
520 pstr->cur_state = prev_st;
522 else
523 wc = (wint_t) wc2;
524 /* Then proceed the next character. */
525 rawbuf_idx += mbclen;
527 *last_wc = (wint_t) wc;
528 return rawbuf_idx;
530 #endif /* RE_ENABLE_I18N */
532 /* Build the buffer PSTR->MBS, and apply the translation if we need.
533 This function is used in case of REG_ICASE. */
535 static void
536 internal_function
537 build_upper_buffer (re_string_t *pstr)
539 int char_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
545 if (BE (pstr->trans != NULL, 0))
546 ch = pstr->trans[ch];
547 if (islower (ch))
548 pstr->mbs[char_idx] = toupper (ch);
549 else
550 pstr->mbs[char_idx] = ch;
552 pstr->valid_len = char_idx;
553 pstr->valid_raw_len = char_idx;
556 /* Apply TRANS to the buffer in PSTR. */
558 static void
559 internal_function
560 re_string_translate_buffer (re_string_t *pstr)
562 int buf_idx, end_idx;
563 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
565 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
567 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
568 pstr->mbs[buf_idx] = pstr->trans[ch];
571 pstr->valid_len = buf_idx;
572 pstr->valid_raw_len = buf_idx;
575 /* This function re-construct the buffers.
576 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
577 convert to upper case in case of REG_ICASE, apply translation. */
579 static reg_errcode_t
580 internal_function
581 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
583 int offset = idx - pstr->raw_mbs_idx;
584 if (BE (offset < 0, 0))
586 /* Reset buffer. */
587 #ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590 #endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
604 if (BE (offset != 0, 1))
606 /* Should the already checked characters be kept? */
607 if (BE (offset < pstr->valid_raw_len, 1))
609 /* Yes, move them to the front of the buffer. */
610 #ifdef RE_ENABLE_I18N
611 if (BE (pstr->offsets_needed, 0))
613 int low = 0, high = pstr->valid_len, mid;
616 mid = (high + low) / 2;
617 if (pstr->offsets[mid] > offset)
618 high = mid;
619 else if (pstr->offsets[mid] < offset)
620 low = mid + 1;
621 else
622 break;
624 while (low < high);
625 if (pstr->offsets[mid] < offset)
626 ++mid;
627 pstr->tip_context = re_string_context_at (pstr, mid - 1,
628 eflags);
629 /* This can be quite complicated, so handle specially
630 only the common and easy case where the character with
631 different length representation of lower and upper
632 case is present at or after offset. */
633 if (pstr->valid_len > offset
634 && mid == offset && pstr->offsets[mid] == offset)
636 memmove (pstr->wcs, pstr->wcs + offset,
637 (pstr->valid_len - offset) * sizeof (wint_t));
638 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639 pstr->valid_len -= offset;
640 pstr->valid_raw_len -= offset;
641 for (low = 0; low < pstr->valid_len; low++)
642 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644 else
646 /* Otherwise, just find out how long the partial multibyte
647 character at offset is and fill it with WEOF/255. */
648 pstr->len = pstr->raw_len - idx + offset;
649 pstr->stop = pstr->raw_stop - idx + offset;
650 pstr->offsets_needed = 0;
651 while (mid > 0 && pstr->offsets[mid - 1] == offset)
652 --mid;
653 while (mid < pstr->valid_len)
654 if (pstr->wcs[mid] != WEOF)
655 break;
656 else
657 ++mid;
658 if (mid == pstr->valid_len)
659 pstr->valid_len = 0;
660 else
662 pstr->valid_len = pstr->offsets[mid] - offset;
663 if (pstr->valid_len)
665 for (low = 0; low < pstr->valid_len; ++low)
666 pstr->wcs[low] = WEOF;
667 memset (pstr->mbs, 255, pstr->valid_len);
670 pstr->valid_raw_len = pstr->valid_len;
673 else
674 #endif
676 pstr->tip_context = re_string_context_at (pstr, offset - 1,
677 eflags);
678 #ifdef RE_ENABLE_I18N
679 if (pstr->mb_cur_max > 1)
680 memmove (pstr->wcs, pstr->wcs + offset,
681 (pstr->valid_len - offset) * sizeof (wint_t));
682 #endif /* RE_ENABLE_I18N */
683 if (BE (pstr->mbs_allocated, 0))
684 memmove (pstr->mbs, pstr->mbs + offset,
685 pstr->valid_len - offset);
686 pstr->valid_len -= offset;
687 pstr->valid_raw_len -= offset;
688 #if DEBUG
689 assert (pstr->valid_len > 0);
690 #endif
693 else
695 #ifdef RE_ENABLE_I18N
696 /* No, skip all characters until IDX. */
697 int prev_valid_len = pstr->valid_len;
699 if (BE (pstr->offsets_needed, 0))
701 pstr->len = pstr->raw_len - idx + offset;
702 pstr->stop = pstr->raw_stop - idx + offset;
703 pstr->offsets_needed = 0;
705 #endif
706 pstr->valid_len = 0;
707 #ifdef RE_ENABLE_I18N
708 if (pstr->mb_cur_max > 1)
710 int wcs_idx;
711 wint_t wc = WEOF;
713 if (pstr->is_utf8)
715 const unsigned char *raw, *p, *end;
717 /* Special case UTF-8. Multi-byte chars start with any
718 byte other than 0x80 - 0xbf. */
719 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720 end = raw + (offset - pstr->mb_cur_max);
721 if (end < pstr->raw_mbs)
722 end = pstr->raw_mbs;
723 p = raw + offset - 1;
724 #ifdef _LIBC
725 /* We know the wchar_t encoding is UCS4, so for the simple
726 case, ASCII characters, skip the conversion step. */
727 if (isascii (*p) && BE (pstr->trans == NULL, 1))
729 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730 /* pstr->valid_len = 0; */
731 wc = (wchar_t) *p;
733 else
734 #endif
735 for (; p >= end; --p)
736 if ((*p & 0xc0) != 0x80)
738 mbstate_t cur_state;
739 wchar_t wc2;
740 int mlen = raw + pstr->len - p;
741 unsigned char buf[6];
742 size_t mbclen;
744 if (BE (pstr->trans != NULL, 0))
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
750 /* XXX Don't use mbrtowc, we know which conversion
751 to use (UTF-8 -> UCS4). */
752 memset (&cur_state, 0, sizeof (cur_state));
753 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
754 &cur_state);
755 if (raw + offset - p <= mbclen
756 && mbclen < (size_t) -2)
758 memset (&pstr->cur_state, '\0',
759 sizeof (mbstate_t));
760 pstr->valid_len = mbclen - (raw + offset - p);
761 wc = wc2;
763 break;
767 if (wc == WEOF)
768 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
769 if (wc == WEOF)
770 pstr->tip_context
771 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
772 else
773 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
774 && IS_WIDE_WORD_CHAR (wc))
775 ? CONTEXT_WORD
776 : ((IS_WIDE_NEWLINE (wc)
777 && pstr->newline_anchor)
778 ? CONTEXT_NEWLINE : 0));
779 if (BE (pstr->valid_len, 0))
781 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
782 pstr->wcs[wcs_idx] = WEOF;
783 if (pstr->mbs_allocated)
784 memset (pstr->mbs, 255, pstr->valid_len);
786 pstr->valid_raw_len = pstr->valid_len;
788 else
789 #endif /* RE_ENABLE_I18N */
791 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
792 pstr->valid_raw_len = 0;
793 if (pstr->trans)
794 c = pstr->trans[c];
795 pstr->tip_context = (bitset_contain (pstr->word_char, c)
796 ? CONTEXT_WORD
797 : ((IS_NEWLINE (c) && pstr->newline_anchor)
798 ? CONTEXT_NEWLINE : 0));
801 if (!BE (pstr->mbs_allocated, 0))
802 pstr->mbs += offset;
804 pstr->raw_mbs_idx = idx;
805 pstr->len -= offset;
806 pstr->stop -= offset;
808 /* Then build the buffers. */
809 #ifdef RE_ENABLE_I18N
810 if (pstr->mb_cur_max > 1)
812 if (pstr->icase)
814 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815 if (BE (ret != REG_NOERROR, 0))
816 return ret;
818 else
819 build_wcs_buffer (pstr);
821 else
822 #endif /* RE_ENABLE_I18N */
823 if (BE (pstr->mbs_allocated, 0))
825 if (pstr->icase)
826 build_upper_buffer (pstr);
827 else if (pstr->trans != NULL)
828 re_string_translate_buffer (pstr);
830 else
831 pstr->valid_len = pstr->len;
833 pstr->cur_idx = 0;
834 return REG_NOERROR;
837 static unsigned char
838 internal_function __attribute ((pure))
839 re_string_peek_byte_case (const re_string_t *pstr, int idx)
841 int ch, off;
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr->mbs_allocated, 1))
845 return re_string_peek_byte (pstr, idx);
847 #ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851 #endif
853 off = pstr->cur_idx + idx;
854 #ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857 #endif
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
861 #ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868 #endif
870 return ch;
873 static unsigned char
874 internal_function __attribute ((pure))
875 re_string_fetch_byte_case (re_string_t *pstr)
877 if (BE (!pstr->mbs_allocated, 1))
878 return re_string_fetch_byte (pstr);
880 #ifdef RE_ENABLE_I18N
881 if (pstr->offsets_needed)
883 int off, ch;
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
905 #endif
907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
910 static void
911 internal_function
912 re_string_destruct (re_string_t *pstr)
914 #ifdef RE_ENABLE_I18N
915 re_free (pstr->wcs);
916 re_free (pstr->offsets);
917 #endif /* RE_ENABLE_I18N */
918 if (pstr->mbs_allocated)
919 re_free (pstr->mbs);
922 /* Return the context at IDX in INPUT. */
924 static unsigned int
925 internal_function
926 re_string_context_at (const re_string_t *input, int idx, int eflags)
928 int c;
929 if (BE (idx < 0, 0))
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
933 if (BE (idx == input->len, 0))
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936 #ifdef RE_ENABLE_I18N
937 if (input->mb_cur_max > 1)
939 wint_t wc;
940 int wc_idx = idx;
941 while(input->wcs[wc_idx] == WEOF)
943 #ifdef DEBUG
944 /* It must not happen. */
945 assert (wc_idx >= 0);
946 #endif
947 --wc_idx;
948 if (wc_idx < 0)
949 return input->tip_context;
951 wc = input->wcs[wc_idx];
952 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
953 return CONTEXT_WORD;
954 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
955 ? CONTEXT_NEWLINE : 0);
957 else
958 #endif
960 c = re_string_byte_at (input, idx);
961 if (bitset_contain (input->word_char, c))
962 return CONTEXT_WORD;
963 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967 /* Functions for set operation. */
969 static reg_errcode_t
970 internal_function
971 re_node_set_alloc (re_node_set *set, int size)
974 * ADR: valgrind says size can be 0, which then doesn't
975 * free the block of size 0. Harumph. This seems
976 * to work ok, though.
978 if (size == 0)
980 memset(set, 0, sizeof(*set));
981 return REG_NOERROR;
983 set->alloc = size;
984 set->nelem = 0;
985 set->elems = re_malloc (int, size);
986 if (BE (set->elems == NULL, 0))
987 return REG_ESPACE;
988 return REG_NOERROR;
991 static reg_errcode_t
992 internal_function
993 re_node_set_init_1 (re_node_set *set, int elem)
995 set->alloc = 1;
996 set->nelem = 1;
997 set->elems = re_malloc (int, 1);
998 if (BE (set->elems == NULL, 0))
1000 set->alloc = set->nelem = 0;
1001 return REG_ESPACE;
1003 set->elems[0] = elem;
1004 return REG_NOERROR;
1007 static reg_errcode_t
1008 internal_function
1009 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1011 set->alloc = 2;
1012 set->elems = re_malloc (int, 2);
1013 if (BE (set->elems == NULL, 0))
1014 return REG_ESPACE;
1015 if (elem1 == elem2)
1017 set->nelem = 1;
1018 set->elems[0] = elem1;
1020 else
1022 set->nelem = 2;
1023 if (elem1 < elem2)
1025 set->elems[0] = elem1;
1026 set->elems[1] = elem2;
1028 else
1030 set->elems[0] = elem2;
1031 set->elems[1] = elem1;
1034 return REG_NOERROR;
1037 static reg_errcode_t
1038 internal_function
1039 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1041 dest->nelem = src->nelem;
1042 if (src->nelem > 0)
1044 dest->alloc = dest->nelem;
1045 dest->elems = re_malloc (int, dest->alloc);
1046 if (BE (dest->elems == NULL, 0))
1048 dest->alloc = dest->nelem = 0;
1049 return REG_ESPACE;
1051 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1053 else
1054 re_node_set_init_empty (dest);
1055 return REG_NOERROR;
1058 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1059 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1060 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1062 static reg_errcode_t
1063 internal_function
1064 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1065 const re_node_set *src2)
1067 int i1, i2, is, id, delta, sbase;
1068 if (src1->nelem == 0 || src2->nelem == 0)
1069 return REG_NOERROR;
1071 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1072 conservative estimate. */
1073 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1075 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1076 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1077 if (BE (new_elems == NULL, 0))
1078 return REG_ESPACE;
1079 dest->elems = new_elems;
1080 dest->alloc = new_alloc;
1083 /* Find the items in the intersection of SRC1 and SRC2, and copy
1084 into the top of DEST those that are not already in DEST itself. */
1085 sbase = dest->nelem + src1->nelem + src2->nelem;
1086 i1 = src1->nelem - 1;
1087 i2 = src2->nelem - 1;
1088 id = dest->nelem - 1;
1089 for (;;)
1091 if (src1->elems[i1] == src2->elems[i2])
1093 /* Try to find the item in DEST. Maybe we could binary search? */
1094 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1095 --id;
1097 if (id < 0 || dest->elems[id] != src1->elems[i1])
1098 dest->elems[--sbase] = src1->elems[i1];
1100 if (--i1 < 0 || --i2 < 0)
1101 break;
1104 /* Lower the highest of the two items. */
1105 else if (src1->elems[i1] < src2->elems[i2])
1107 if (--i2 < 0)
1108 break;
1110 else
1112 if (--i1 < 0)
1113 break;
1117 id = dest->nelem - 1;
1118 is = dest->nelem + src1->nelem + src2->nelem - 1;
1119 delta = is - sbase + 1;
1121 /* Now copy. When DELTA becomes zero, the remaining
1122 DEST elements are already in place; this is more or
1123 less the same loop that is in re_node_set_merge. */
1124 dest->nelem += delta;
1125 if (delta > 0 && id >= 0)
1126 for (;;)
1128 if (dest->elems[is] > dest->elems[id])
1130 /* Copy from the top. */
1131 dest->elems[id + delta--] = dest->elems[is--];
1132 if (delta == 0)
1133 break;
1135 else
1137 /* Slide from the bottom. */
1138 dest->elems[id + delta] = dest->elems[id];
1139 if (--id < 0)
1140 break;
1144 /* Copy remaining SRC elements. */
1145 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1147 return REG_NOERROR;
1150 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1151 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1153 static reg_errcode_t
1154 internal_function
1155 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1156 const re_node_set *src2)
1158 int i1, i2, id;
1159 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1161 dest->alloc = src1->nelem + src2->nelem;
1162 dest->elems = re_malloc (int, dest->alloc);
1163 if (BE (dest->elems == NULL, 0))
1164 return REG_ESPACE;
1166 else
1168 if (src1 != NULL && src1->nelem > 0)
1169 return re_node_set_init_copy (dest, src1);
1170 else if (src2 != NULL && src2->nelem > 0)
1171 return re_node_set_init_copy (dest, src2);
1172 else
1173 re_node_set_init_empty (dest);
1174 return REG_NOERROR;
1176 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1178 if (src1->elems[i1] > src2->elems[i2])
1180 dest->elems[id++] = src2->elems[i2++];
1181 continue;
1183 if (src1->elems[i1] == src2->elems[i2])
1184 ++i2;
1185 dest->elems[id++] = src1->elems[i1++];
1187 if (i1 < src1->nelem)
1189 memcpy (dest->elems + id, src1->elems + i1,
1190 (src1->nelem - i1) * sizeof (int));
1191 id += src1->nelem - i1;
1193 else if (i2 < src2->nelem)
1195 memcpy (dest->elems + id, src2->elems + i2,
1196 (src2->nelem - i2) * sizeof (int));
1197 id += src2->nelem - i2;
1199 dest->nelem = id;
1200 return REG_NOERROR;
1203 /* Calculate the union set of the sets DEST and SRC. And store it to
1204 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1206 static reg_errcode_t
1207 internal_function
1208 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1210 int is, id, sbase, delta;
1211 if (src == NULL || src->nelem == 0)
1212 return REG_NOERROR;
1213 if (dest->alloc < 2 * src->nelem + dest->nelem)
1215 int new_alloc = 2 * (src->nelem + dest->alloc);
1216 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1217 if (BE (new_buffer == NULL, 0))
1218 return REG_ESPACE;
1219 dest->elems = new_buffer;
1220 dest->alloc = new_alloc;
1223 if (BE (dest->nelem == 0, 0))
1225 dest->nelem = src->nelem;
1226 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1227 return REG_NOERROR;
1230 /* Copy into the top of DEST the items of SRC that are not
1231 found in DEST. Maybe we could binary search in DEST? */
1232 for (sbase = dest->nelem + 2 * src->nelem,
1233 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1235 if (dest->elems[id] == src->elems[is])
1236 is--, id--;
1237 else if (dest->elems[id] < src->elems[is])
1238 dest->elems[--sbase] = src->elems[is--];
1239 else /* if (dest->elems[id] > src->elems[is]) */
1240 --id;
1243 if (is >= 0)
1245 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1246 sbase -= is + 1;
1247 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1250 id = dest->nelem - 1;
1251 is = dest->nelem + 2 * src->nelem - 1;
1252 delta = is - sbase + 1;
1253 if (delta == 0)
1254 return REG_NOERROR;
1256 /* Now copy. When DELTA becomes zero, the remaining
1257 DEST elements are already in place. */
1258 dest->nelem += delta;
1259 for (;;)
1261 if (dest->elems[is] > dest->elems[id])
1263 /* Copy from the top. */
1264 dest->elems[id + delta--] = dest->elems[is--];
1265 if (delta == 0)
1266 break;
1268 else
1270 /* Slide from the bottom. */
1271 dest->elems[id + delta] = dest->elems[id];
1272 if (--id < 0)
1274 /* Copy remaining SRC elements. */
1275 memcpy (dest->elems, dest->elems + sbase,
1276 delta * sizeof (int));
1277 break;
1282 return REG_NOERROR;
1285 /* Insert the new element ELEM to the re_node_set* SET.
1286 SET should not already have ELEM.
1287 return -1 if an error is occured, return 1 otherwise. */
1289 static int
1290 internal_function
1291 re_node_set_insert (re_node_set *set, int elem)
1293 int idx;
1294 /* In case the set is empty. */
1295 if (set->alloc == 0)
1297 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1298 return 1;
1299 else
1300 return -1;
1303 if (BE (set->nelem, 0) == 0)
1305 /* We already guaranteed above that set->alloc != 0. */
1306 set->elems[0] = elem;
1307 ++set->nelem;
1308 return 1;
1311 /* Realloc if we need. */
1312 if (set->alloc == set->nelem)
1314 int *new_elems;
1315 set->alloc = set->alloc * 2;
1316 new_elems = re_realloc (set->elems, int, set->alloc);
1317 if (BE (new_elems == NULL, 0))
1318 return -1;
1319 set->elems = new_elems;
1322 /* Move the elements which follows the new element. Test the
1323 first element separately to skip a check in the inner loop. */
1324 if (elem < set->elems[0])
1326 idx = 0;
1327 for (idx = set->nelem; idx > 0; idx--)
1328 set->elems[idx] = set->elems[idx - 1];
1330 else
1332 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1333 set->elems[idx] = set->elems[idx - 1];
1336 /* Insert the new element. */
1337 set->elems[idx] = elem;
1338 ++set->nelem;
1339 return 1;
1342 /* Insert the new element ELEM to the re_node_set* SET.
1343 SET should not already have any element greater than or equal to ELEM.
1344 Return -1 if an error is occured, return 1 otherwise. */
1346 static int
1347 internal_function
1348 re_node_set_insert_last (re_node_set *set, int elem)
1350 /* Realloc if we need. */
1351 if (set->alloc == set->nelem)
1353 int *new_elems;
1354 set->alloc = (set->alloc + 1) * 2;
1355 new_elems = re_realloc (set->elems, int, set->alloc);
1356 if (BE (new_elems == NULL, 0))
1357 return -1;
1358 set->elems = new_elems;
1361 /* Insert the new element. */
1362 set->elems[set->nelem++] = elem;
1363 return 1;
1366 /* Compare two node sets SET1 and SET2.
1367 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1369 static int
1370 internal_function __attribute ((pure))
1371 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1373 int i;
1374 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1375 return 0;
1376 for (i = set1->nelem ; --i >= 0 ; )
1377 if (set1->elems[i] != set2->elems[i])
1378 return 0;
1379 return 1;
1382 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1384 static int
1385 internal_function __attribute ((pure))
1386 re_node_set_contains (const re_node_set *set, int elem)
1388 unsigned int idx, right, mid;
1389 if (set->nelem <= 0)
1390 return 0;
1392 /* Binary search the element. */
1393 idx = 0;
1394 right = set->nelem - 1;
1395 while (idx < right)
1397 mid = (idx + right) / 2;
1398 if (set->elems[mid] < elem)
1399 idx = mid + 1;
1400 else
1401 right = mid;
1403 return set->elems[idx] == elem ? idx + 1 : 0;
1406 static void
1407 internal_function
1408 re_node_set_remove_at (re_node_set *set, int idx)
1410 if (idx < 0 || idx >= set->nelem)
1411 return;
1412 --set->nelem;
1413 for (; idx < set->nelem; idx++)
1414 set->elems[idx] = set->elems[idx + 1];
1418 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1419 Or return -1, if an error will be occured. */
1421 static int
1422 internal_function
1423 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1425 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1427 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1428 int *new_nexts, *new_indices;
1429 re_node_set *new_edests, *new_eclosures;
1430 re_token_t *new_nodes;
1432 /* Avoid overflows in realloc. */
1433 const size_t max_object_size = MAX (sizeof (re_token_t),
1434 MAX (sizeof (re_node_set),
1435 sizeof (int)));
1436 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1437 return -1;
1439 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1440 if (BE (new_nodes == NULL, 0))
1441 return -1;
1442 dfa->nodes = new_nodes;
1443 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1444 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1445 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1446 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1447 if (BE (new_nexts == NULL || new_indices == NULL
1448 || new_edests == NULL || new_eclosures == NULL, 0))
1449 return -1;
1450 dfa->nexts = new_nexts;
1451 dfa->org_indices = new_indices;
1452 dfa->edests = new_edests;
1453 dfa->eclosures = new_eclosures;
1454 dfa->nodes_alloc = new_nodes_alloc;
1456 dfa->nodes[dfa->nodes_len] = token;
1457 dfa->nodes[dfa->nodes_len].constraint = 0;
1458 #ifdef RE_ENABLE_I18N
1459 dfa->nodes[dfa->nodes_len].accept_mb =
1460 (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1461 #endif
1462 dfa->nexts[dfa->nodes_len] = -1;
1463 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1464 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1465 return dfa->nodes_len++;
1468 static inline unsigned int
1469 internal_function
1470 calc_state_hash (const re_node_set *nodes, unsigned int context)
1472 unsigned int hash = nodes->nelem + context;
1473 int i;
1474 for (i = 0 ; i < nodes->nelem ; i++)
1475 hash += nodes->elems[i];
1476 return hash;
1479 /* Search for the state whose node_set is equivalent to NODES.
1480 Return the pointer to the state, if we found it in the DFA.
1481 Otherwise create the new one and return it. In case of an error
1482 return NULL and set the error code in ERR.
1483 Note: - We assume NULL as the invalid state, then it is possible that
1484 return value is NULL and ERR is REG_NOERROR.
1485 - We never return non-NULL value in case of any errors, it is for
1486 optimization. */
1488 static re_dfastate_t *
1489 internal_function
1490 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1491 const re_node_set *nodes)
1493 unsigned int hash;
1494 re_dfastate_t *new_state;
1495 struct re_state_table_entry *spot;
1496 int i;
1497 if (BE (nodes->nelem == 0, 0))
1499 *err = REG_NOERROR;
1500 return NULL;
1502 hash = calc_state_hash (nodes, 0);
1503 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1505 for (i = 0 ; i < spot->num ; i++)
1507 re_dfastate_t *state = spot->array[i];
1508 if (hash != state->hash)
1509 continue;
1510 if (re_node_set_compare (&state->nodes, nodes))
1511 return state;
1514 /* There are no appropriate state in the dfa, create the new one. */
1515 new_state = create_ci_newstate (dfa, nodes, hash);
1516 if (BE (new_state == NULL, 0))
1517 *err = REG_ESPACE;
1519 return new_state;
1522 /* Search for the state whose node_set is equivalent to NODES and
1523 whose context is equivalent to CONTEXT.
1524 Return the pointer to the state, if we found it in the DFA.
1525 Otherwise create the new one and return it. In case of an error
1526 return NULL and set the error code in ERR.
1527 Note: - We assume NULL as the invalid state, then it is possible that
1528 return value is NULL and ERR is REG_NOERROR.
1529 - We never return non-NULL value in case of any errors, it is for
1530 optimization. */
1532 static re_dfastate_t *
1533 internal_function
1534 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535 const re_node_set *nodes, unsigned int context)
1537 unsigned int hash;
1538 re_dfastate_t *new_state;
1539 struct re_state_table_entry *spot;
1540 int i;
1541 if (nodes->nelem == 0)
1543 *err = REG_NOERROR;
1544 return NULL;
1546 hash = calc_state_hash (nodes, context);
1547 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1549 for (i = 0 ; i < spot->num ; i++)
1551 re_dfastate_t *state = spot->array[i];
1552 if (state->hash == hash
1553 && state->context == context
1554 && re_node_set_compare (state->entrance_nodes, nodes))
1555 return state;
1557 /* There are no appropriate state in `dfa', create the new one. */
1558 new_state = create_cd_newstate (dfa, nodes, context, hash);
1559 if (BE (new_state == NULL, 0))
1560 *err = REG_ESPACE;
1562 return new_state;
1565 /* Finish initialization of the new state NEWSTATE, and using its hash value
1566 HASH put in the appropriate bucket of DFA's state table. Return value
1567 indicates the error code if failed. */
1569 static reg_errcode_t
1570 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1571 unsigned int hash)
1573 struct re_state_table_entry *spot;
1574 reg_errcode_t err;
1575 int i;
1577 newstate->hash = hash;
1578 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1579 if (BE (err != REG_NOERROR, 0))
1580 return REG_ESPACE;
1581 for (i = 0; i < newstate->nodes.nelem; i++)
1583 int elem = newstate->nodes.elems[i];
1584 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1585 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1586 return REG_ESPACE;
1589 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1590 if (BE (spot->alloc <= spot->num, 0))
1592 int new_alloc = 2 * spot->num + 2;
1593 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1594 new_alloc);
1595 if (BE (new_array == NULL, 0))
1596 return REG_ESPACE;
1597 spot->array = new_array;
1598 spot->alloc = new_alloc;
1600 spot->array[spot->num++] = newstate;
1601 return REG_NOERROR;
1604 static void
1605 free_state (re_dfastate_t *state)
1607 re_node_set_free (&state->non_eps_nodes);
1608 re_node_set_free (&state->inveclosure);
1609 if (state->entrance_nodes != &state->nodes)
1611 re_node_set_free (state->entrance_nodes);
1612 re_free (state->entrance_nodes);
1614 re_node_set_free (&state->nodes);
1615 re_free (state->word_trtable);
1616 re_free (state->trtable);
1617 re_free (state);
1620 /* Create the new state which is independ of contexts.
1621 Return the new state if succeeded, otherwise return NULL. */
1623 static re_dfastate_t *
1624 internal_function
1625 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1626 unsigned int hash)
1628 int i;
1629 reg_errcode_t err;
1630 re_dfastate_t *newstate;
1632 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1633 if (BE (newstate == NULL, 0))
1634 return NULL;
1635 err = re_node_set_init_copy (&newstate->nodes, nodes);
1636 if (BE (err != REG_NOERROR, 0))
1638 re_free (newstate);
1639 return NULL;
1642 newstate->entrance_nodes = &newstate->nodes;
1643 for (i = 0 ; i < nodes->nelem ; i++)
1645 re_token_t *node = dfa->nodes + nodes->elems[i];
1646 re_token_type_t type = node->type;
1647 if (type == CHARACTER && !node->constraint)
1648 continue;
1649 #ifdef RE_ENABLE_I18N
1650 newstate->accept_mb |= node->accept_mb;
1651 #endif /* RE_ENABLE_I18N */
1653 /* If the state has the halt node, the state is a halt state. */
1654 if (type == END_OF_RE)
1655 newstate->halt = 1;
1656 else if (type == OP_BACK_REF)
1657 newstate->has_backref = 1;
1658 else if (type == ANCHOR || node->constraint)
1659 newstate->has_constraint = 1;
1661 err = register_state (dfa, newstate, hash);
1662 if (BE (err != REG_NOERROR, 0))
1664 free_state (newstate);
1665 newstate = NULL;
1667 return newstate;
1670 /* Create the new state which is depend on the context CONTEXT.
1671 Return the new state if succeeded, otherwise return NULL. */
1673 static re_dfastate_t *
1674 internal_function
1675 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1676 unsigned int context, unsigned int hash)
1678 int i, nctx_nodes = 0;
1679 reg_errcode_t err;
1680 re_dfastate_t *newstate;
1682 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1683 if (BE (newstate == NULL, 0))
1684 return NULL;
1685 err = re_node_set_init_copy (&newstate->nodes, nodes);
1686 if (BE (err != REG_NOERROR, 0))
1688 re_free (newstate);
1689 return NULL;
1692 newstate->context = context;
1693 newstate->entrance_nodes = &newstate->nodes;
1695 for (i = 0 ; i < nodes->nelem ; i++)
1697 re_token_t *node = dfa->nodes + nodes->elems[i];
1698 re_token_type_t type = node->type;
1699 unsigned int constraint = node->constraint;
1701 if (type == CHARACTER && !constraint)
1702 continue;
1703 #ifdef RE_ENABLE_I18N
1704 newstate->accept_mb |= node->accept_mb;
1705 #endif /* RE_ENABLE_I18N */
1707 /* If the state has the halt node, the state is a halt state. */
1708 if (type == END_OF_RE)
1709 newstate->halt = 1;
1710 else if (type == OP_BACK_REF)
1711 newstate->has_backref = 1;
1713 if (constraint)
1715 if (newstate->entrance_nodes == &newstate->nodes)
1717 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718 if (BE (newstate->entrance_nodes == NULL, 0))
1720 free_state (newstate);
1721 return NULL;
1723 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1724 != REG_NOERROR)
1725 return NULL;
1726 nctx_nodes = 0;
1727 newstate->has_constraint = 1;
1730 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733 ++nctx_nodes;
1737 err = register_state (dfa, newstate, hash);
1738 if (BE (err != REG_NOERROR, 0))
1740 free_state (newstate);
1741 newstate = NULL;
1743 return newstate;