Merge branch 'rj/add-i-leak-fix'
[git.git] / compat / regex / regex_internal.c
blobec51cf34461ef2eeafea7087b2abe9cafc260932
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str, int len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 unsigned int hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 unsigned int hash) internal_function;
32 #ifdef GAWK
33 #undef MAX /* safety */
34 static int
35 MAX(size_t a, size_t b)
37 return (a > b ? a : b);
39 #endif
41 /* Functions for string operation. */
43 /* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
46 static reg_errcode_t
47 internal_function
48 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
49 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
51 reg_errcode_t ret;
52 int init_buf_len;
54 /* Ensure at least one character fits into the buffers. */
55 if (init_len < dfa->mb_cur_max)
56 init_len = dfa->mb_cur_max;
57 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
58 re_string_construct_common (str, len, pstr, trans, icase, dfa);
60 ret = re_string_realloc_buffers (pstr, init_buf_len);
61 if (BE (ret != REG_NOERROR, 0))
62 return ret;
64 pstr->word_char = dfa->word_char;
65 pstr->word_ops_used = dfa->word_ops_used;
66 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
67 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
68 pstr->valid_raw_len = pstr->valid_len;
69 return REG_NOERROR;
72 /* This function allocate the buffers, and initialize them. */
74 static reg_errcode_t
75 internal_function
76 re_string_construct (re_string_t *pstr, const char *str, int len,
77 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
79 reg_errcode_t ret;
80 memset (pstr, '\0', sizeof (re_string_t));
81 re_string_construct_common (str, len, pstr, trans, icase, dfa);
83 if (len > 0)
85 ret = re_string_realloc_buffers (pstr, len + 1);
86 if (BE (ret != REG_NOERROR, 0))
87 return ret;
89 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91 if (icase)
93 #ifdef RE_ENABLE_I18N
94 if (dfa->mb_cur_max > 1)
96 while (1)
98 ret = build_wcs_upper_buffer (pstr);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
101 if (pstr->valid_raw_len >= len)
102 break;
103 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
104 break;
105 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
106 if (BE (ret != REG_NOERROR, 0))
107 return ret;
110 else
111 #endif /* RE_ENABLE_I18N */
112 build_upper_buffer (pstr);
114 else
116 #ifdef RE_ENABLE_I18N
117 if (dfa->mb_cur_max > 1)
118 build_wcs_buffer (pstr);
119 else
120 #endif /* RE_ENABLE_I18N */
122 if (trans != NULL)
123 re_string_translate_buffer (pstr);
124 else
126 pstr->valid_len = pstr->bufs_len;
127 pstr->valid_raw_len = pstr->bufs_len;
132 return REG_NOERROR;
135 /* Helper functions for re_string_allocate, and re_string_construct. */
137 static reg_errcode_t
138 internal_function
139 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
141 #ifdef RE_ENABLE_I18N
142 if (pstr->mb_cur_max > 1)
144 wint_t *new_wcs;
146 /* Avoid overflow in realloc. */
147 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
148 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
149 return REG_ESPACE;
151 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_wcs == NULL, 0))
153 return REG_ESPACE;
154 pstr->wcs = new_wcs;
155 if (pstr->offsets != NULL)
157 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_offsets == NULL, 0))
159 return REG_ESPACE;
160 pstr->offsets = new_offsets;
163 #endif /* RE_ENABLE_I18N */
164 if (pstr->mbs_allocated)
166 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
167 new_buf_len);
168 if (BE (new_mbs == NULL, 0))
169 return REG_ESPACE;
170 pstr->mbs = new_mbs;
172 pstr->bufs_len = new_buf_len;
173 return REG_NOERROR;
177 static void
178 internal_function
179 re_string_construct_common (const char *str, int len, re_string_t *pstr,
180 RE_TRANSLATE_TYPE trans, int icase,
181 const re_dfa_t *dfa)
183 pstr->raw_mbs = (const unsigned char *) str;
184 pstr->len = len;
185 pstr->raw_len = len;
186 pstr->trans = trans;
187 pstr->icase = icase ? 1 : 0;
188 pstr->mbs_allocated = (trans != NULL || icase);
189 pstr->mb_cur_max = dfa->mb_cur_max;
190 pstr->is_utf8 = dfa->is_utf8;
191 pstr->map_notascii = dfa->map_notascii;
192 pstr->stop = pstr->len;
193 pstr->raw_stop = pstr->stop;
196 #ifdef RE_ENABLE_I18N
198 /* Build wide character buffer PSTR->WCS.
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
204 a first byte of a multibyte character.
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
209 static void
210 internal_function
211 build_wcs_buffer (re_string_t *pstr)
213 #ifdef _LIBC
214 unsigned char buf[MB_LEN_MAX];
215 assert (MB_LEN_MAX >= pstr->mb_cur_max);
216 #else
217 unsigned char buf[64];
218 #endif
219 mbstate_t prev_st;
220 int byte_idx, end_idx, remain_len;
221 size_t mbclen;
223 /* Build the buffers from pstr->valid_len to either pstr->len or
224 pstr->bufs_len. */
225 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
228 wchar_t wc;
229 const char *p;
231 remain_len = end_idx - byte_idx;
232 prev_st = pstr->cur_state;
233 /* Apply the translation if we need. */
234 if (BE (pstr->trans != NULL, 0))
236 int i, ch;
238 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
240 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
243 p = (const char *) buf;
245 else
246 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248 if (BE (mbclen == (size_t) -2, 0))
250 /* The buffer doesn't have enough space, finish to build. */
251 pstr->cur_state = prev_st;
252 break;
254 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
256 /* We treat these cases as a singlebyte character. */
257 mbclen = 1;
258 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
259 if (BE (pstr->trans != NULL, 0))
260 wc = pstr->trans[wc];
261 pstr->cur_state = prev_st;
264 /* Write wide character and padding. */
265 pstr->wcs[byte_idx++] = wc;
266 /* Write paddings. */
267 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 pstr->wcs[byte_idx++] = WEOF;
270 pstr->valid_len = byte_idx;
271 pstr->valid_raw_len = byte_idx;
274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
277 static reg_errcode_t
278 internal_function
279 build_wcs_upper_buffer (re_string_t *pstr)
281 mbstate_t prev_st;
282 int src_idx, byte_idx, end_idx, remain_len;
283 size_t mbclen;
284 #ifdef _LIBC
285 char buf[MB_LEN_MAX];
286 assert (MB_LEN_MAX >= pstr->mb_cur_max);
287 #else
288 char buf[64];
289 #endif
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
298 while (byte_idx < end_idx)
300 wchar_t wc;
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
317 mbclen = __mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
320 if (BE (mbclen + 2 > 2, 1))
322 wchar_t wcu = wc;
323 if (iswlower (wc))
325 size_t mbcdlen;
327 wcu = towupper (wc);
328 mbcdlen = wcrtomb (buf, wcu, &prev_st);
329 if (BE (mbclen == mbcdlen, 1))
330 memcpy (pstr->mbs + byte_idx, buf, mbclen);
331 else
333 src_idx = byte_idx;
334 goto offsets_needed;
337 else
338 memcpy (pstr->mbs + byte_idx,
339 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
340 pstr->wcs[byte_idx++] = wcu;
341 /* Write paddings. */
342 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
343 pstr->wcs[byte_idx++] = WEOF;
345 else if (mbclen == (size_t) -1 || mbclen == 0)
347 /* It is an invalid character or '\0'. Just use the byte. */
348 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
349 pstr->mbs[byte_idx] = ch;
350 /* And also cast it to wide char. */
351 pstr->wcs[byte_idx++] = (wchar_t) ch;
352 if (BE (mbclen == (size_t) -1, 0))
353 pstr->cur_state = prev_st;
355 else
357 /* The buffer doesn't have enough space, finish to build. */
358 pstr->cur_state = prev_st;
359 break;
362 pstr->valid_len = byte_idx;
363 pstr->valid_raw_len = byte_idx;
364 return REG_NOERROR;
366 else
367 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369 wchar_t wc;
370 const char *p;
371 offsets_needed:
372 remain_len = end_idx - byte_idx;
373 prev_st = pstr->cur_state;
374 if (BE (pstr->trans != NULL, 0))
376 int i, ch;
378 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
381 buf[i] = pstr->trans[ch];
383 p = (const char *) buf;
385 else
386 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
387 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
388 if (BE (mbclen + 2 > 2, 1))
390 wchar_t wcu = wc;
391 if (iswlower (wc))
393 size_t mbcdlen;
395 wcu = towupper (wc);
396 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
397 if (BE (mbclen == mbcdlen, 1))
398 memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 else if (mbcdlen != (size_t) -1)
401 size_t i;
403 if (byte_idx + mbcdlen > pstr->bufs_len)
405 pstr->cur_state = prev_st;
406 break;
409 if (pstr->offsets == NULL)
411 pstr->offsets = re_malloc (int, pstr->bufs_len);
413 if (pstr->offsets == NULL)
414 return REG_ESPACE;
416 if (!pstr->offsets_needed)
418 for (i = 0; i < (size_t) byte_idx; ++i)
419 pstr->offsets[i] = i;
420 pstr->offsets_needed = 1;
423 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 pstr->wcs[byte_idx] = wcu;
425 pstr->offsets[byte_idx] = src_idx;
426 for (i = 1; i < mbcdlen; ++i)
428 pstr->offsets[byte_idx + i]
429 = src_idx + (i < mbclen ? i : mbclen - 1);
430 pstr->wcs[byte_idx + i] = WEOF;
432 pstr->len += mbcdlen - mbclen;
433 if (pstr->raw_stop > src_idx)
434 pstr->stop += mbcdlen - mbclen;
435 end_idx = (pstr->bufs_len > pstr->len)
436 ? pstr->len : pstr->bufs_len;
437 byte_idx += mbcdlen;
438 src_idx += mbclen;
439 continue;
441 else
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
444 else
445 memcpy (pstr->mbs + byte_idx, p, mbclen);
447 if (BE (pstr->offsets_needed != 0, 0))
449 size_t i;
450 for (i = 0; i < mbclen; ++i)
451 pstr->offsets[byte_idx + i] = src_idx + i;
453 src_idx += mbclen;
455 pstr->wcs[byte_idx++] = wcu;
456 /* Write paddings. */
457 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 pstr->wcs[byte_idx++] = WEOF;
460 else if (mbclen == (size_t) -1 || mbclen == 0)
462 /* It is an invalid character or '\0'. Just use the byte. */
463 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465 if (BE (pstr->trans != NULL, 0))
466 ch = pstr->trans [ch];
467 pstr->mbs[byte_idx] = ch;
469 if (BE (pstr->offsets_needed != 0, 0))
470 pstr->offsets[byte_idx] = src_idx;
471 ++src_idx;
473 /* And also cast it to wide char. */
474 pstr->wcs[byte_idx++] = (wchar_t) ch;
475 if (BE (mbclen == (size_t) -1, 0))
476 pstr->cur_state = prev_st;
478 else
480 /* The buffer doesn't have enough space, finish to build. */
481 pstr->cur_state = prev_st;
482 break;
485 pstr->valid_len = byte_idx;
486 pstr->valid_raw_len = src_idx;
487 return REG_NOERROR;
490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
491 Return the index. */
493 static int
494 internal_function
495 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
497 mbstate_t prev_st;
498 int rawbuf_idx;
499 size_t mbclen;
500 wint_t wc = WEOF;
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504 rawbuf_idx < new_raw_idx;)
506 wchar_t wc2;
507 int remain_len = pstr->len - rawbuf_idx;
508 prev_st = pstr->cur_state;
509 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
513 /* We treat these cases as a single byte character. */
514 if (mbclen == 0 || remain_len == 0)
515 wc = L'\0';
516 else
517 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
518 mbclen = 1;
519 pstr->cur_state = prev_st;
521 else
522 wc = (wint_t) wc2;
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
526 *last_wc = (wint_t) wc;
527 return rawbuf_idx;
529 #endif /* RE_ENABLE_I18N */
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
534 static void
535 internal_function
536 build_upper_buffer (re_string_t *pstr)
538 int char_idx, end_idx;
539 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
543 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544 if (BE (pstr->trans != NULL, 0))
545 ch = pstr->trans[ch];
546 if (islower (ch))
547 pstr->mbs[char_idx] = toupper (ch);
548 else
549 pstr->mbs[char_idx] = ch;
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
555 /* Apply TRANS to the buffer in PSTR. */
557 static void
558 internal_function
559 re_string_translate_buffer (re_string_t *pstr)
561 int buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
574 /* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
578 static reg_errcode_t
579 internal_function
580 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
582 int offset = idx - pstr->raw_mbs_idx;
583 if (BE (offset < 0, 0))
585 /* Reset buffer. */
586 #ifdef RE_ENABLE_I18N
587 if (pstr->mb_cur_max > 1)
588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
592 pstr->valid_len = 0;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598 if (!pstr->mbs_allocated)
599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
600 offset = idx;
603 if (BE (offset != 0, 1))
605 /* Should the already checked characters be kept? */
606 if (BE (offset < pstr->valid_raw_len, 1))
608 /* Yes, move them to the front of the buffer. */
609 #ifdef RE_ENABLE_I18N
610 if (BE (pstr->offsets_needed, 0))
612 int low = 0, high = pstr->valid_len, mid;
615 mid = low + (high - low) / 2;
616 if (pstr->offsets[mid] > offset)
617 high = mid;
618 else if (pstr->offsets[mid] < offset)
619 low = mid + 1;
620 else
621 break;
623 while (low < high);
624 if (pstr->offsets[mid] < offset)
625 ++mid;
626 pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 eflags);
628 /* This can be quite complicated, so handle specially
629 only the common and easy case where the character with
630 different length representation of lower and upper
631 case is present at or after offset. */
632 if (pstr->valid_len > offset
633 && mid == offset && pstr->offsets[mid] == offset)
635 memmove (pstr->wcs, pstr->wcs + offset,
636 (pstr->valid_len - offset) * sizeof (wint_t));
637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 pstr->valid_len -= offset;
639 pstr->valid_raw_len -= offset;
640 for (low = 0; low < pstr->valid_len; low++)
641 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 else
645 /* Otherwise, just find out how long the partial multibyte
646 character at offset is and fill it with WEOF/255. */
647 pstr->len = pstr->raw_len - idx + offset;
648 pstr->stop = pstr->raw_stop - idx + offset;
649 pstr->offsets_needed = 0;
650 while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 --mid;
652 while (mid < pstr->valid_len)
653 if (pstr->wcs[mid] != WEOF)
654 break;
655 else
656 ++mid;
657 if (mid == pstr->valid_len)
658 pstr->valid_len = 0;
659 else
661 pstr->valid_len = pstr->offsets[mid] - offset;
662 if (pstr->valid_len)
664 for (low = 0; low < pstr->valid_len; ++low)
665 pstr->wcs[low] = WEOF;
666 memset (pstr->mbs, 255, pstr->valid_len);
669 pstr->valid_raw_len = pstr->valid_len;
672 else
673 #endif
675 pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 eflags);
677 #ifdef RE_ENABLE_I18N
678 if (pstr->mb_cur_max > 1)
679 memmove (pstr->wcs, pstr->wcs + offset,
680 (pstr->valid_len - offset) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 if (BE (pstr->mbs_allocated, 0))
683 memmove (pstr->mbs, pstr->mbs + offset,
684 pstr->valid_len - offset);
685 pstr->valid_len -= offset;
686 pstr->valid_raw_len -= offset;
687 #if DEBUG
688 assert (pstr->valid_len > 0);
689 #endif
692 else
694 #ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 int prev_valid_len = pstr->valid_len;
698 if (BE (pstr->offsets_needed, 0))
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
704 #endif
705 pstr->valid_len = 0;
706 #ifdef RE_ENABLE_I18N
707 if (pstr->mb_cur_max > 1)
709 int wcs_idx;
710 wint_t wc = WEOF;
712 if (pstr->is_utf8)
714 const unsigned char *raw, *p, *end;
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 end = raw + (offset - pstr->mb_cur_max);
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
722 p = raw + offset - 1;
723 #ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p) && BE (pstr->trans == NULL, 1))
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
730 wc = (wchar_t) *p;
732 else
733 #endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
737 mbstate_t cur_state;
738 wchar_t wc2;
739 int mlen = raw + pstr->len - p;
740 unsigned char buf[6];
741 size_t mbclen;
743 if (BE (pstr->trans != NULL, 0))
745 int i = mlen < 6 ? mlen : 6;
746 while (--i >= 0)
747 buf[i] = pstr->trans[p[i]];
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
752 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
753 &cur_state);
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
762 break;
766 if (wc == WEOF)
767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 if (wc == WEOF)
769 pstr->tip_context
770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
778 if (BE (pstr->valid_len, 0))
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
785 pstr->valid_raw_len = pstr->valid_len;
787 else
788 #endif /* RE_ENABLE_I18N */
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 pstr->valid_raw_len = 0;
792 if (pstr->trans)
793 c = pstr->trans[c];
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 ? CONTEXT_NEWLINE : 0));
800 if (!BE (pstr->mbs_allocated, 0))
801 pstr->mbs += offset;
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
805 pstr->stop -= offset;
807 /* Then build the buffers. */
808 #ifdef RE_ENABLE_I18N
809 if (pstr->mb_cur_max > 1)
811 if (pstr->icase)
813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
817 else
818 build_wcs_buffer (pstr);
820 else
821 #endif /* RE_ENABLE_I18N */
822 if (BE (pstr->mbs_allocated, 0))
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
829 else
830 pstr->valid_len = pstr->len;
832 pstr->cur_idx = 0;
833 return REG_NOERROR;
836 static unsigned char
837 internal_function __attribute ((pure))
838 re_string_peek_byte_case (const re_string_t *pstr, int idx)
840 int ch, off;
842 /* Handle the common (easiest) cases first. */
843 if (BE (!pstr->mbs_allocated, 1))
844 return re_string_peek_byte (pstr, idx);
846 #ifdef RE_ENABLE_I18N
847 if (pstr->mb_cur_max > 1
848 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
849 return re_string_peek_byte (pstr, idx);
850 #endif
852 off = pstr->cur_idx + idx;
853 #ifdef RE_ENABLE_I18N
854 if (pstr->offsets_needed)
855 off = pstr->offsets[off];
856 #endif
858 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860 #ifdef RE_ENABLE_I18N
861 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862 this function returns CAPITAL LETTER I instead of first byte of
863 DOTLESS SMALL LETTER I. The latter would confuse the parser,
864 since peek_byte_case doesn't advance cur_idx in any way. */
865 if (pstr->offsets_needed && !isascii (ch))
866 return re_string_peek_byte (pstr, idx);
867 #endif
869 return ch;
872 static unsigned char
873 internal_function __attribute ((pure))
874 re_string_fetch_byte_case (re_string_t *pstr)
876 if (BE (!pstr->mbs_allocated, 1))
877 return re_string_fetch_byte (pstr);
879 #ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
882 int off, ch;
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
891 if (!re_string_first_byte (pstr, pstr->cur_idx))
892 return re_string_fetch_byte (pstr);
894 off = pstr->offsets[pstr->cur_idx];
895 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897 if (! isascii (ch))
898 return re_string_fetch_byte (pstr);
900 re_string_skip_bytes (pstr,
901 re_string_char_size_at (pstr, pstr->cur_idx));
902 return ch;
904 #endif
906 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
909 static void
910 internal_function
911 re_string_destruct (re_string_t *pstr)
913 #ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916 #endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
921 /* Return the context at IDX in INPUT. */
923 static unsigned int
924 internal_function
925 re_string_context_at (const re_string_t *input, int idx, int eflags)
927 int c;
928 if (BE (idx < 0, 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (BE (idx == input->len, 0))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935 #ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
938 wint_t wc;
939 int wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
942 #ifdef DEBUG
943 /* It must not happen. */
944 assert (wc_idx >= 0);
945 #endif
946 --wc_idx;
947 if (wc_idx < 0)
948 return input->tip_context;
950 wc = input->wcs[wc_idx];
951 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
956 else
957 #endif
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
966 /* Functions for set operation. */
968 static reg_errcode_t
969 internal_function
970 re_node_set_alloc (re_node_set *set, int size)
973 * ADR: valgrind says size can be 0, which then doesn't
974 * free the block of size 0. Harumph. This seems
975 * to work ok, though.
977 if (size == 0)
979 memset(set, 0, sizeof(*set));
980 return REG_NOERROR;
982 set->alloc = size;
983 set->nelem = 0;
984 set->elems = re_malloc (int, size);
985 if (BE (set->elems == NULL, 0))
986 return REG_ESPACE;
987 return REG_NOERROR;
990 static reg_errcode_t
991 internal_function
992 re_node_set_init_1 (re_node_set *set, int elem)
994 set->alloc = 1;
995 set->nelem = 1;
996 set->elems = re_malloc (int, 1);
997 if (BE (set->elems == NULL, 0))
999 set->alloc = set->nelem = 0;
1000 return REG_ESPACE;
1002 set->elems[0] = elem;
1003 return REG_NOERROR;
1006 static reg_errcode_t
1007 internal_function
1008 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1010 set->alloc = 2;
1011 set->elems = re_malloc (int, 2);
1012 if (BE (set->elems == NULL, 0))
1013 return REG_ESPACE;
1014 if (elem1 == elem2)
1016 set->nelem = 1;
1017 set->elems[0] = elem1;
1019 else
1021 set->nelem = 2;
1022 if (elem1 < elem2)
1024 set->elems[0] = elem1;
1025 set->elems[1] = elem2;
1027 else
1029 set->elems[0] = elem2;
1030 set->elems[1] = elem1;
1033 return REG_NOERROR;
1036 static reg_errcode_t
1037 internal_function
1038 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1040 dest->nelem = src->nelem;
1041 if (src->nelem > 0)
1043 dest->alloc = dest->nelem;
1044 dest->elems = re_malloc (int, dest->alloc);
1045 if (BE (dest->elems == NULL, 0))
1047 dest->alloc = dest->nelem = 0;
1048 return REG_ESPACE;
1050 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1052 else
1053 re_node_set_init_empty (dest);
1054 return REG_NOERROR;
1057 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1061 static reg_errcode_t
1062 internal_function
1063 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1064 const re_node_set *src2)
1066 int i1, i2, is, id, delta, sbase;
1067 if (src1->nelem == 0 || src2->nelem == 0)
1068 return REG_NOERROR;
1070 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071 conservative estimate. */
1072 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1074 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1075 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1076 if (BE (new_elems == NULL, 0))
1077 return REG_ESPACE;
1078 dest->elems = new_elems;
1079 dest->alloc = new_alloc;
1082 /* Find the items in the intersection of SRC1 and SRC2, and copy
1083 into the top of DEST those that are not already in DEST itself. */
1084 sbase = dest->nelem + src1->nelem + src2->nelem;
1085 i1 = src1->nelem - 1;
1086 i2 = src2->nelem - 1;
1087 id = dest->nelem - 1;
1088 for (;;)
1090 if (src1->elems[i1] == src2->elems[i2])
1092 /* Try to find the item in DEST. Maybe we could binary search? */
1093 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1094 --id;
1096 if (id < 0 || dest->elems[id] != src1->elems[i1])
1097 dest->elems[--sbase] = src1->elems[i1];
1099 if (--i1 < 0 || --i2 < 0)
1100 break;
1103 /* Lower the highest of the two items. */
1104 else if (src1->elems[i1] < src2->elems[i2])
1106 if (--i2 < 0)
1107 break;
1109 else
1111 if (--i1 < 0)
1112 break;
1116 id = dest->nelem - 1;
1117 is = dest->nelem + src1->nelem + src2->nelem - 1;
1118 delta = is - sbase + 1;
1120 /* Now copy. When DELTA becomes zero, the remaining
1121 DEST elements are already in place; this is more or
1122 less the same loop that is in re_node_set_merge. */
1123 dest->nelem += delta;
1124 if (delta > 0 && id >= 0)
1125 for (;;)
1127 if (dest->elems[is] > dest->elems[id])
1129 /* Copy from the top. */
1130 dest->elems[id + delta--] = dest->elems[is--];
1131 if (delta == 0)
1132 break;
1134 else
1136 /* Slide from the bottom. */
1137 dest->elems[id + delta] = dest->elems[id];
1138 if (--id < 0)
1139 break;
1143 /* Copy remaining SRC elements. */
1144 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1146 return REG_NOERROR;
1149 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1152 static reg_errcode_t
1153 internal_function
1154 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1155 const re_node_set *src2)
1157 int i1, i2, id;
1158 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1160 dest->alloc = src1->nelem + src2->nelem;
1161 dest->elems = re_malloc (int, dest->alloc);
1162 if (BE (dest->elems == NULL, 0))
1163 return REG_ESPACE;
1165 else
1167 if (src1 != NULL && src1->nelem > 0)
1168 return re_node_set_init_copy (dest, src1);
1169 else if (src2 != NULL && src2->nelem > 0)
1170 return re_node_set_init_copy (dest, src2);
1171 else
1172 re_node_set_init_empty (dest);
1173 return REG_NOERROR;
1175 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1177 if (src1->elems[i1] > src2->elems[i2])
1179 dest->elems[id++] = src2->elems[i2++];
1180 continue;
1182 if (src1->elems[i1] == src2->elems[i2])
1183 ++i2;
1184 dest->elems[id++] = src1->elems[i1++];
1186 if (i1 < src1->nelem)
1188 memcpy (dest->elems + id, src1->elems + i1,
1189 (src1->nelem - i1) * sizeof (int));
1190 id += src1->nelem - i1;
1192 else if (i2 < src2->nelem)
1194 memcpy (dest->elems + id, src2->elems + i2,
1195 (src2->nelem - i2) * sizeof (int));
1196 id += src2->nelem - i2;
1198 dest->nelem = id;
1199 return REG_NOERROR;
1202 /* Calculate the union set of the sets DEST and SRC. And store it to
1203 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1205 static reg_errcode_t
1206 internal_function
1207 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1209 int is, id, sbase, delta;
1210 if (src == NULL || src->nelem == 0)
1211 return REG_NOERROR;
1212 if (dest->alloc < 2 * src->nelem + dest->nelem)
1214 int new_alloc = 2 * (src->nelem + dest->alloc);
1215 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1216 if (BE (new_buffer == NULL, 0))
1217 return REG_ESPACE;
1218 dest->elems = new_buffer;
1219 dest->alloc = new_alloc;
1222 if (BE (dest->nelem == 0, 0))
1224 dest->nelem = src->nelem;
1225 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1226 return REG_NOERROR;
1229 /* Copy into the top of DEST the items of SRC that are not
1230 found in DEST. Maybe we could binary search in DEST? */
1231 for (sbase = dest->nelem + 2 * src->nelem,
1232 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1234 if (dest->elems[id] == src->elems[is])
1235 is--, id--;
1236 else if (dest->elems[id] < src->elems[is])
1237 dest->elems[--sbase] = src->elems[is--];
1238 else /* if (dest->elems[id] > src->elems[is]) */
1239 --id;
1242 if (is >= 0)
1244 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1245 sbase -= is + 1;
1246 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1249 id = dest->nelem - 1;
1250 is = dest->nelem + 2 * src->nelem - 1;
1251 delta = is - sbase + 1;
1252 if (delta == 0)
1253 return REG_NOERROR;
1255 /* Now copy. When DELTA becomes zero, the remaining
1256 DEST elements are already in place. */
1257 dest->nelem += delta;
1258 for (;;)
1260 if (dest->elems[is] > dest->elems[id])
1262 /* Copy from the top. */
1263 dest->elems[id + delta--] = dest->elems[is--];
1264 if (delta == 0)
1265 break;
1267 else
1269 /* Slide from the bottom. */
1270 dest->elems[id + delta] = dest->elems[id];
1271 if (--id < 0)
1273 /* Copy remaining SRC elements. */
1274 memcpy (dest->elems, dest->elems + sbase,
1275 delta * sizeof (int));
1276 break;
1281 return REG_NOERROR;
1284 /* Insert the new element ELEM to the re_node_set* SET.
1285 SET should not already have ELEM.
1286 return -1 if an error has occurred, return 1 otherwise. */
1288 static int
1289 internal_function
1290 re_node_set_insert (re_node_set *set, int elem)
1292 int idx;
1293 /* In case the set is empty. */
1294 if (set->alloc == 0)
1296 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1297 return 1;
1298 else
1299 return -1;
1302 if (BE (set->nelem, 0) == 0)
1304 /* We already guaranteed above that set->alloc != 0. */
1305 set->elems[0] = elem;
1306 ++set->nelem;
1307 return 1;
1310 /* Realloc if we need. */
1311 if (set->alloc == set->nelem)
1313 int *new_elems;
1314 set->alloc = set->alloc * 2;
1315 new_elems = re_realloc (set->elems, int, set->alloc);
1316 if (BE (new_elems == NULL, 0))
1317 return -1;
1318 set->elems = new_elems;
1321 /* Move the elements which follows the new element. Test the
1322 first element separately to skip a check in the inner loop. */
1323 if (elem < set->elems[0])
1325 idx = 0;
1326 for (idx = set->nelem; idx > 0; idx--)
1327 set->elems[idx] = set->elems[idx - 1];
1329 else
1331 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1332 set->elems[idx] = set->elems[idx - 1];
1335 /* Insert the new element. */
1336 set->elems[idx] = elem;
1337 ++set->nelem;
1338 return 1;
1341 /* Insert the new element ELEM to the re_node_set* SET.
1342 SET should not already have any element greater than or equal to ELEM.
1343 Return -1 if an error has occurred, return 1 otherwise. */
1345 static int
1346 internal_function
1347 re_node_set_insert_last (re_node_set *set, int elem)
1349 /* Realloc if we need. */
1350 if (set->alloc == set->nelem)
1352 int *new_elems;
1353 set->alloc = (set->alloc + 1) * 2;
1354 new_elems = re_realloc (set->elems, int, set->alloc);
1355 if (BE (new_elems == NULL, 0))
1356 return -1;
1357 set->elems = new_elems;
1360 /* Insert the new element. */
1361 set->elems[set->nelem++] = elem;
1362 return 1;
1365 /* Compare two node sets SET1 and SET2.
1366 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1368 static int
1369 internal_function __attribute ((pure))
1370 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1372 int i;
1373 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1374 return 0;
1375 for (i = set1->nelem ; --i >= 0 ; )
1376 if (set1->elems[i] != set2->elems[i])
1377 return 0;
1378 return 1;
1381 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1383 static int
1384 internal_function __attribute ((pure))
1385 re_node_set_contains (const re_node_set *set, int elem)
1387 unsigned int idx, right, mid;
1388 if (set->nelem <= 0)
1389 return 0;
1391 /* Binary search the element. */
1392 idx = 0;
1393 right = set->nelem - 1;
1394 while (idx < right)
1396 mid = idx + (right - idx) / 2;
1397 if (set->elems[mid] < elem)
1398 idx = mid + 1;
1399 else
1400 right = mid;
1402 return set->elems[idx] == elem ? idx + 1 : 0;
1405 static void
1406 internal_function
1407 re_node_set_remove_at (re_node_set *set, int idx)
1409 if (idx < 0 || idx >= set->nelem)
1410 return;
1411 --set->nelem;
1412 for (; idx < set->nelem; idx++)
1413 set->elems[idx] = set->elems[idx + 1];
1417 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418 Or return -1, if an error has occurred. */
1420 static int
1421 internal_function
1422 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1424 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1426 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1427 int *new_nexts, *new_indices;
1428 re_node_set *new_edests, *new_eclosures;
1429 re_token_t *new_nodes;
1431 /* Avoid overflows in realloc. */
1432 const size_t max_object_size = MAX (sizeof (re_token_t),
1433 MAX (sizeof (re_node_set),
1434 sizeof (int)));
1435 if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1436 return -1;
1438 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1439 if (BE (new_nodes == NULL, 0))
1440 return -1;
1441 dfa->nodes = new_nodes;
1442 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1443 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1444 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1445 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1446 if (BE (new_nexts == NULL || new_indices == NULL
1447 || new_edests == NULL || new_eclosures == NULL, 0))
1448 return -1;
1449 dfa->nexts = new_nexts;
1450 dfa->org_indices = new_indices;
1451 dfa->edests = new_edests;
1452 dfa->eclosures = new_eclosures;
1453 dfa->nodes_alloc = new_nodes_alloc;
1455 dfa->nodes[dfa->nodes_len] = token;
1456 dfa->nodes[dfa->nodes_len].constraint = 0;
1457 #ifdef RE_ENABLE_I18N
1458 dfa->nodes[dfa->nodes_len].accept_mb =
1459 (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1460 #endif
1461 dfa->nexts[dfa->nodes_len] = -1;
1462 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1463 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1464 return dfa->nodes_len++;
1467 static inline unsigned int
1468 internal_function
1469 calc_state_hash (const re_node_set *nodes, unsigned int context)
1471 unsigned int hash = nodes->nelem + context;
1472 int i;
1473 for (i = 0 ; i < nodes->nelem ; i++)
1474 hash += nodes->elems[i];
1475 return hash;
1478 /* Search for the state whose node_set is equivalent to NODES.
1479 Return the pointer to the state, if we found it in the DFA.
1480 Otherwise create the new one and return it. In case of an error
1481 return NULL and set the error code in ERR.
1482 Note: - We assume NULL as the invalid state, then it is possible that
1483 return value is NULL and ERR is REG_NOERROR.
1484 - We never return non-NULL value in case of any errors, it is for
1485 optimization. */
1487 static re_dfastate_t *
1488 internal_function
1489 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490 const re_node_set *nodes)
1492 unsigned int hash;
1493 re_dfastate_t *new_state;
1494 struct re_state_table_entry *spot;
1495 int i;
1496 if (BE (nodes->nelem == 0, 0))
1498 *err = REG_NOERROR;
1499 return NULL;
1501 hash = calc_state_hash (nodes, 0);
1502 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504 for (i = 0 ; i < spot->num ; i++)
1506 re_dfastate_t *state = spot->array[i];
1507 if (hash != state->hash)
1508 continue;
1509 if (re_node_set_compare (&state->nodes, nodes))
1510 return state;
1513 /* There are no appropriate state in the dfa, create the new one. */
1514 new_state = create_ci_newstate (dfa, nodes, hash);
1515 if (BE (new_state == NULL, 0))
1516 *err = REG_ESPACE;
1518 return new_state;
1521 /* Search for the state whose node_set is equivalent to NODES and
1522 whose context is equivalent to CONTEXT.
1523 Return the pointer to the state, if we found it in the DFA.
1524 Otherwise create the new one and return it. In case of an error
1525 return NULL and set the error code in ERR.
1526 Note: - We assume NULL as the invalid state, then it is possible that
1527 return value is NULL and ERR is REG_NOERROR.
1528 - We never return non-NULL value in case of any errors, it is for
1529 optimization. */
1531 static re_dfastate_t *
1532 internal_function
1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534 const re_node_set *nodes, unsigned int context)
1536 unsigned int hash;
1537 re_dfastate_t *new_state;
1538 struct re_state_table_entry *spot;
1539 int i;
1540 if (nodes->nelem == 0)
1542 *err = REG_NOERROR;
1543 return NULL;
1545 hash = calc_state_hash (nodes, context);
1546 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1548 for (i = 0 ; i < spot->num ; i++)
1550 re_dfastate_t *state = spot->array[i];
1551 if (state->hash == hash
1552 && state->context == context
1553 && re_node_set_compare (state->entrance_nodes, nodes))
1554 return state;
1556 /* There are no appropriate state in `dfa', create the new one. */
1557 new_state = create_cd_newstate (dfa, nodes, context, hash);
1558 if (BE (new_state == NULL, 0))
1559 *err = REG_ESPACE;
1561 return new_state;
1564 /* Finish initialization of the new state NEWSTATE, and using its hash value
1565 HASH put in the appropriate bucket of DFA's state table. Return value
1566 indicates the error code if failed. */
1568 static reg_errcode_t
1569 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570 unsigned int hash)
1572 struct re_state_table_entry *spot;
1573 reg_errcode_t err;
1574 int i;
1576 newstate->hash = hash;
1577 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578 if (BE (err != REG_NOERROR, 0))
1579 return REG_ESPACE;
1580 for (i = 0; i < newstate->nodes.nelem; i++)
1582 int elem = newstate->nodes.elems[i];
1583 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584 if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1585 return REG_ESPACE;
1588 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589 if (BE (spot->alloc <= spot->num, 0))
1591 int new_alloc = 2 * spot->num + 2;
1592 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593 new_alloc);
1594 if (BE (new_array == NULL, 0))
1595 return REG_ESPACE;
1596 spot->array = new_array;
1597 spot->alloc = new_alloc;
1599 spot->array[spot->num++] = newstate;
1600 return REG_NOERROR;
1603 static void
1604 free_state (re_dfastate_t *state)
1606 re_node_set_free (&state->non_eps_nodes);
1607 re_node_set_free (&state->inveclosure);
1608 if (state->entrance_nodes != &state->nodes)
1610 re_node_set_free (state->entrance_nodes);
1611 re_free (state->entrance_nodes);
1613 re_node_set_free (&state->nodes);
1614 re_free (state->word_trtable);
1615 re_free (state->trtable);
1616 re_free (state);
1619 /* Create the new state which is independent of contexts.
1620 Return the new state if succeeded, otherwise return NULL. */
1622 static re_dfastate_t *
1623 internal_function
1624 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625 unsigned int hash)
1627 int i;
1628 reg_errcode_t err;
1629 re_dfastate_t *newstate;
1631 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632 if (BE (newstate == NULL, 0))
1633 return NULL;
1634 err = re_node_set_init_copy (&newstate->nodes, nodes);
1635 if (BE (err != REG_NOERROR, 0))
1637 re_free (newstate);
1638 return NULL;
1641 newstate->entrance_nodes = &newstate->nodes;
1642 for (i = 0 ; i < nodes->nelem ; i++)
1644 re_token_t *node = dfa->nodes + nodes->elems[i];
1645 re_token_type_t type = node->type;
1646 if (type == CHARACTER && !node->constraint)
1647 continue;
1648 #ifdef RE_ENABLE_I18N
1649 newstate->accept_mb |= node->accept_mb;
1650 #endif /* RE_ENABLE_I18N */
1652 /* If the state has the halt node, the state is a halt state. */
1653 if (type == END_OF_RE)
1654 newstate->halt = 1;
1655 else if (type == OP_BACK_REF)
1656 newstate->has_backref = 1;
1657 else if (type == ANCHOR || node->constraint)
1658 newstate->has_constraint = 1;
1660 err = register_state (dfa, newstate, hash);
1661 if (BE (err != REG_NOERROR, 0))
1663 free_state (newstate);
1664 newstate = NULL;
1666 return newstate;
1669 /* Create the new state which is depend on the context CONTEXT.
1670 Return the new state if succeeded, otherwise return NULL. */
1672 static re_dfastate_t *
1673 internal_function
1674 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675 unsigned int context, unsigned int hash)
1677 int i, nctx_nodes = 0;
1678 reg_errcode_t err;
1679 re_dfastate_t *newstate;
1681 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682 if (BE (newstate == NULL, 0))
1683 return NULL;
1684 err = re_node_set_init_copy (&newstate->nodes, nodes);
1685 if (BE (err != REG_NOERROR, 0))
1687 re_free (newstate);
1688 return NULL;
1691 newstate->context = context;
1692 newstate->entrance_nodes = &newstate->nodes;
1694 for (i = 0 ; i < nodes->nelem ; i++)
1696 re_token_t *node = dfa->nodes + nodes->elems[i];
1697 re_token_type_t type = node->type;
1698 unsigned int constraint = node->constraint;
1700 if (type == CHARACTER && !constraint)
1701 continue;
1702 #ifdef RE_ENABLE_I18N
1703 newstate->accept_mb |= node->accept_mb;
1704 #endif /* RE_ENABLE_I18N */
1706 /* If the state has the halt node, the state is a halt state. */
1707 if (type == END_OF_RE)
1708 newstate->halt = 1;
1709 else if (type == OP_BACK_REF)
1710 newstate->has_backref = 1;
1712 if (constraint)
1714 if (newstate->entrance_nodes == &newstate->nodes)
1716 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717 if (BE (newstate->entrance_nodes == NULL, 0))
1719 free_state (newstate);
1720 return NULL;
1722 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723 != REG_NOERROR)
1724 return NULL;
1725 nctx_nodes = 0;
1726 newstate->has_constraint = 1;
1729 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732 ++nctx_nodes;
1736 err = register_state (dfa, newstate, hash);
1737 if (BE (err != REG_NOERROR, 0))
1739 free_state (newstate);
1740 newstate = NULL;
1742 return newstate;