* locale/langinfo.h: nl_langinfo_l is in POSIX 2008.
[glibc.git] / posix / regex_internal.c
blobc9da2b961cc34a183921b428150f41b8a6251b16
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
38 static reg_errcode_t
39 internal_function
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43 reg_errcode_t ret;
44 int init_buf_len;
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
64 /* This function allocate the buffers, and initialize them. */
66 static reg_errcode_t
67 internal_function
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75 if (len > 0)
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83 if (icase)
85 #ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
88 while (1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
102 else
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
106 else
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112 #endif /* RE_ENABLE_I18N */
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
124 return REG_NOERROR;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
129 static reg_errcode_t
130 internal_function
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137 if (BE (new_wcs == NULL, 0))
138 return REG_ESPACE;
139 pstr->wcs = new_wcs;
140 if (pstr->offsets != NULL)
142 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143 if (BE (new_offsets == NULL, 0))
144 return REG_ESPACE;
145 pstr->offsets = new_offsets;
148 #endif /* RE_ENABLE_I18N */
149 if (pstr->mbs_allocated)
151 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 new_buf_len);
153 if (BE (new_mbs == NULL, 0))
154 return REG_ESPACE;
155 pstr->mbs = new_mbs;
157 pstr->bufs_len = new_buf_len;
158 return REG_NOERROR;
162 static void
163 internal_function
164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
165 RE_TRANSLATE_TYPE trans, int icase,
166 const re_dfa_t *dfa)
168 pstr->raw_mbs = (const unsigned char *) str;
169 pstr->len = len;
170 pstr->raw_len = len;
171 pstr->trans = trans;
172 pstr->icase = icase ? 1 : 0;
173 pstr->mbs_allocated = (trans != NULL || icase);
174 pstr->mb_cur_max = dfa->mb_cur_max;
175 pstr->is_utf8 = dfa->is_utf8;
176 pstr->map_notascii = dfa->map_notascii;
177 pstr->stop = pstr->len;
178 pstr->raw_stop = pstr->stop;
181 #ifdef RE_ENABLE_I18N
183 /* Build wide character buffer PSTR->WCS.
184 If the byte sequence of the string are:
185 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186 Then wide character buffer will be:
187 <wc1> , WEOF , <wc2> , WEOF , <wc3>
188 We use WEOF for padding, they indicate that the position isn't
189 a first byte of a multibyte character.
191 Note that this function assumes PSTR->VALID_LEN elements are already
192 built and starts from PSTR->VALID_LEN. */
194 static void
195 internal_function
196 build_wcs_buffer (re_string_t *pstr)
198 #ifdef _LIBC
199 unsigned char buf[MB_LEN_MAX];
200 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 #else
202 unsigned char buf[64];
203 #endif
204 mbstate_t prev_st;
205 int byte_idx, end_idx, remain_len;
206 size_t mbclen;
208 /* Build the buffers from pstr->valid_len to either pstr->len or
209 pstr->bufs_len. */
210 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
213 wchar_t wc;
214 const char *p;
216 remain_len = end_idx - byte_idx;
217 prev_st = pstr->cur_state;
218 /* Apply the translation if we need. */
219 if (BE (pstr->trans != NULL, 0))
221 int i, ch;
223 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
225 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
228 p = (const char *) buf;
230 else
231 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233 if (BE (mbclen == (size_t) -2, 0))
235 /* The buffer doesn't have enough space, finish to build. */
236 pstr->cur_state = prev_st;
237 break;
239 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
241 /* We treat these cases as a singlebyte character. */
242 mbclen = 1;
243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 if (BE (pstr->trans != NULL, 0))
245 wc = pstr->trans[wc];
246 pstr->cur_state = prev_st;
249 /* Write wide character and padding. */
250 pstr->wcs[byte_idx++] = wc;
251 /* Write paddings. */
252 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253 pstr->wcs[byte_idx++] = WEOF;
255 pstr->valid_len = byte_idx;
256 pstr->valid_raw_len = byte_idx;
259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260 but for REG_ICASE. */
262 static reg_errcode_t
263 internal_function
264 build_wcs_upper_buffer (re_string_t *pstr)
266 mbstate_t prev_st;
267 int src_idx, byte_idx, end_idx, remain_len;
268 size_t mbclen;
269 #ifdef _LIBC
270 char buf[MB_LEN_MAX];
271 assert (MB_LEN_MAX >= pstr->mb_cur_max);
272 #else
273 char buf[64];
274 #endif
276 byte_idx = pstr->valid_len;
277 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
279 /* The following optimization assumes that ASCII characters can be
280 mapped to wide characters with a simple cast. */
281 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
283 while (byte_idx < end_idx)
285 wchar_t wc;
287 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288 && mbsinit (&pstr->cur_state))
290 /* In case of a singlebyte character. */
291 pstr->mbs[byte_idx]
292 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293 /* The next step uses the assumption that wchar_t is encoded
294 ASCII-safe: all ASCII values can be converted like this. */
295 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
296 ++byte_idx;
297 continue;
300 remain_len = end_idx - byte_idx;
301 prev_st = pstr->cur_state;
302 mbclen = __mbrtowc (&wc,
303 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304 + byte_idx), remain_len, &pstr->cur_state);
305 if (BE (mbclen + 2 > 2, 1))
307 wchar_t wcu = wc;
308 if (iswlower (wc))
310 size_t mbcdlen;
312 wcu = towupper (wc);
313 mbcdlen = wcrtomb (buf, wcu, &prev_st);
314 if (BE (mbclen == mbcdlen, 1))
315 memcpy (pstr->mbs + byte_idx, buf, mbclen);
316 else
318 src_idx = byte_idx;
319 goto offsets_needed;
322 else
323 memcpy (pstr->mbs + byte_idx,
324 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325 pstr->wcs[byte_idx++] = wcu;
326 /* Write paddings. */
327 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328 pstr->wcs[byte_idx++] = WEOF;
330 else if (mbclen == (size_t) -1 || mbclen == 0)
332 /* It is an invalid character or '\0'. Just use the byte. */
333 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334 pstr->mbs[byte_idx] = ch;
335 /* And also cast it to wide char. */
336 pstr->wcs[byte_idx++] = (wchar_t) ch;
337 if (BE (mbclen == (size_t) -1, 0))
338 pstr->cur_state = prev_st;
340 else
342 /* The buffer doesn't have enough space, finish to build. */
343 pstr->cur_state = prev_st;
344 break;
347 pstr->valid_len = byte_idx;
348 pstr->valid_raw_len = byte_idx;
349 return REG_NOERROR;
351 else
352 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
354 wchar_t wc;
355 const char *p;
356 offsets_needed:
357 remain_len = end_idx - byte_idx;
358 prev_st = pstr->cur_state;
359 if (BE (pstr->trans != NULL, 0))
361 int i, ch;
363 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
365 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366 buf[i] = pstr->trans[ch];
368 p = (const char *) buf;
370 else
371 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373 if (BE (mbclen + 2 > 2, 1))
375 wchar_t wcu = wc;
376 if (iswlower (wc))
378 size_t mbcdlen;
380 wcu = towupper (wc);
381 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382 if (BE (mbclen == mbcdlen, 1))
383 memcpy (pstr->mbs + byte_idx, buf, mbclen);
384 else if (mbcdlen != (size_t) -1)
386 size_t i;
388 if (byte_idx + mbcdlen > pstr->bufs_len)
390 pstr->cur_state = prev_st;
391 break;
394 if (pstr->offsets == NULL)
396 pstr->offsets = re_malloc (int, pstr->bufs_len);
398 if (pstr->offsets == NULL)
399 return REG_ESPACE;
401 if (!pstr->offsets_needed)
403 for (i = 0; i < (size_t) byte_idx; ++i)
404 pstr->offsets[i] = i;
405 pstr->offsets_needed = 1;
408 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409 pstr->wcs[byte_idx] = wcu;
410 pstr->offsets[byte_idx] = src_idx;
411 for (i = 1; i < mbcdlen; ++i)
413 pstr->offsets[byte_idx + i]
414 = src_idx + (i < mbclen ? i : mbclen - 1);
415 pstr->wcs[byte_idx + i] = WEOF;
417 pstr->len += mbcdlen - mbclen;
418 if (pstr->raw_stop > src_idx)
419 pstr->stop += mbcdlen - mbclen;
420 end_idx = (pstr->bufs_len > pstr->len)
421 ? pstr->len : pstr->bufs_len;
422 byte_idx += mbcdlen;
423 src_idx += mbclen;
424 continue;
426 else
427 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 else
430 memcpy (pstr->mbs + byte_idx, p, mbclen);
432 if (BE (pstr->offsets_needed != 0, 0))
434 size_t i;
435 for (i = 0; i < mbclen; ++i)
436 pstr->offsets[byte_idx + i] = src_idx + i;
438 src_idx += mbclen;
440 pstr->wcs[byte_idx++] = wcu;
441 /* Write paddings. */
442 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443 pstr->wcs[byte_idx++] = WEOF;
445 else if (mbclen == (size_t) -1 || mbclen == 0)
447 /* It is an invalid character or '\0'. Just use the byte. */
448 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
450 if (BE (pstr->trans != NULL, 0))
451 ch = pstr->trans [ch];
452 pstr->mbs[byte_idx] = ch;
454 if (BE (pstr->offsets_needed != 0, 0))
455 pstr->offsets[byte_idx] = src_idx;
456 ++src_idx;
458 /* And also cast it to wide char. */
459 pstr->wcs[byte_idx++] = (wchar_t) ch;
460 if (BE (mbclen == (size_t) -1, 0))
461 pstr->cur_state = prev_st;
463 else
465 /* The buffer doesn't have enough space, finish to build. */
466 pstr->cur_state = prev_st;
467 break;
470 pstr->valid_len = byte_idx;
471 pstr->valid_raw_len = src_idx;
472 return REG_NOERROR;
475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
476 Return the index. */
478 static int
479 internal_function
480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
482 mbstate_t prev_st;
483 int rawbuf_idx;
484 size_t mbclen;
485 wchar_t wc = WEOF;
487 /* Skip the characters which are not necessary to check. */
488 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489 rawbuf_idx < new_raw_idx;)
491 int remain_len;
492 remain_len = pstr->len - rawbuf_idx;
493 prev_st = pstr->cur_state;
494 mbclen = __mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495 remain_len, &pstr->cur_state);
496 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
498 /* We treat these cases as a single byte character. */
499 if (mbclen == 0 || remain_len == 0)
500 wc = L'\0';
501 else
502 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
503 mbclen = 1;
504 pstr->cur_state = prev_st;
506 /* Then proceed the next character. */
507 rawbuf_idx += mbclen;
509 *last_wc = (wint_t) wc;
510 return rawbuf_idx;
512 #endif /* RE_ENABLE_I18N */
514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
515 This function is used in case of REG_ICASE. */
517 static void
518 internal_function
519 build_upper_buffer (re_string_t *pstr)
521 int char_idx, end_idx;
522 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
524 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
526 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
527 if (BE (pstr->trans != NULL, 0))
528 ch = pstr->trans[ch];
529 if (islower (ch))
530 pstr->mbs[char_idx] = toupper (ch);
531 else
532 pstr->mbs[char_idx] = ch;
534 pstr->valid_len = char_idx;
535 pstr->valid_raw_len = char_idx;
538 /* Apply TRANS to the buffer in PSTR. */
540 static void
541 internal_function
542 re_string_translate_buffer (re_string_t *pstr)
544 int buf_idx, end_idx;
545 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
547 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
549 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
550 pstr->mbs[buf_idx] = pstr->trans[ch];
553 pstr->valid_len = buf_idx;
554 pstr->valid_raw_len = buf_idx;
557 /* This function re-construct the buffers.
558 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
559 convert to upper case in case of REG_ICASE, apply translation. */
561 static reg_errcode_t
562 internal_function
563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
565 int offset = idx - pstr->raw_mbs_idx;
566 if (BE (offset < 0, 0))
568 /* Reset buffer. */
569 #ifdef RE_ENABLE_I18N
570 if (pstr->mb_cur_max > 1)
571 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
572 #endif /* RE_ENABLE_I18N */
573 pstr->len = pstr->raw_len;
574 pstr->stop = pstr->raw_stop;
575 pstr->valid_len = 0;
576 pstr->raw_mbs_idx = 0;
577 pstr->valid_raw_len = 0;
578 pstr->offsets_needed = 0;
579 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
580 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
581 if (!pstr->mbs_allocated)
582 pstr->mbs = (unsigned char *) pstr->raw_mbs;
583 offset = idx;
586 if (BE (offset != 0, 1))
588 /* Should the already checked characters be kept? */
589 if (BE (offset < pstr->valid_raw_len, 1))
591 /* Yes, move them to the front of the buffer. */
592 #ifdef RE_ENABLE_I18N
593 if (BE (pstr->offsets_needed, 0))
595 int low = 0, high = pstr->valid_len, mid;
598 mid = (high + low) / 2;
599 if (pstr->offsets[mid] > offset)
600 high = mid;
601 else if (pstr->offsets[mid] < offset)
602 low = mid + 1;
603 else
604 break;
606 while (low < high);
607 if (pstr->offsets[mid] < offset)
608 ++mid;
609 pstr->tip_context = re_string_context_at (pstr, mid - 1,
610 eflags);
611 /* This can be quite complicated, so handle specially
612 only the common and easy case where the character with
613 different length representation of lower and upper
614 case is present at or after offset. */
615 if (pstr->valid_len > offset
616 && mid == offset && pstr->offsets[mid] == offset)
618 memmove (pstr->wcs, pstr->wcs + offset,
619 (pstr->valid_len - offset) * sizeof (wint_t));
620 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
621 pstr->valid_len -= offset;
622 pstr->valid_raw_len -= offset;
623 for (low = 0; low < pstr->valid_len; low++)
624 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
626 else
628 /* Otherwise, just find out how long the partial multibyte
629 character at offset is and fill it with WEOF/255. */
630 pstr->len = pstr->raw_len - idx + offset;
631 pstr->stop = pstr->raw_stop - idx + offset;
632 pstr->offsets_needed = 0;
633 while (mid > 0 && pstr->offsets[mid - 1] == offset)
634 --mid;
635 while (mid < pstr->valid_len)
636 if (pstr->wcs[mid] != WEOF)
637 break;
638 else
639 ++mid;
640 if (mid == pstr->valid_len)
641 pstr->valid_len = 0;
642 else
644 pstr->valid_len = pstr->offsets[mid] - offset;
645 if (pstr->valid_len)
647 for (low = 0; low < pstr->valid_len; ++low)
648 pstr->wcs[low] = WEOF;
649 memset (pstr->mbs, 255, pstr->valid_len);
652 pstr->valid_raw_len = pstr->valid_len;
655 else
656 #endif
658 pstr->tip_context = re_string_context_at (pstr, offset - 1,
659 eflags);
660 #ifdef RE_ENABLE_I18N
661 if (pstr->mb_cur_max > 1)
662 memmove (pstr->wcs, pstr->wcs + offset,
663 (pstr->valid_len - offset) * sizeof (wint_t));
664 #endif /* RE_ENABLE_I18N */
665 if (BE (pstr->mbs_allocated, 0))
666 memmove (pstr->mbs, pstr->mbs + offset,
667 pstr->valid_len - offset);
668 pstr->valid_len -= offset;
669 pstr->valid_raw_len -= offset;
670 #if DEBUG
671 assert (pstr->valid_len > 0);
672 #endif
675 else
677 /* No, skip all characters until IDX. */
678 int prev_valid_len = pstr->valid_len;
680 #ifdef RE_ENABLE_I18N
681 if (BE (pstr->offsets_needed, 0))
683 pstr->len = pstr->raw_len - idx + offset;
684 pstr->stop = pstr->raw_stop - idx + offset;
685 pstr->offsets_needed = 0;
687 #endif
688 pstr->valid_len = 0;
689 #ifdef RE_ENABLE_I18N
690 if (pstr->mb_cur_max > 1)
692 int wcs_idx;
693 wint_t wc = WEOF;
695 if (pstr->is_utf8)
697 const unsigned char *raw, *p, *q, *end;
699 /* Special case UTF-8. Multi-byte chars start with any
700 byte other than 0x80 - 0xbf. */
701 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
702 end = raw + (offset - pstr->mb_cur_max);
703 if (end < pstr->raw_mbs)
704 end = pstr->raw_mbs;
705 p = raw + offset - 1;
706 #ifdef _LIBC
707 /* We know the wchar_t encoding is UCS4, so for the simple
708 case, ASCII characters, skip the conversion step. */
709 if (isascii (*p) && BE (pstr->trans == NULL, 1))
711 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
712 /* pstr->valid_len = 0; */
713 wc = (wchar_t) *p;
715 else
716 #endif
717 for (; p >= end; --p)
718 if ((*p & 0xc0) != 0x80)
720 mbstate_t cur_state;
721 wchar_t wc2;
722 int mlen = raw + pstr->len - p;
723 unsigned char buf[6];
724 size_t mbclen;
726 q = p;
727 if (BE (pstr->trans != NULL, 0))
729 int i = mlen < 6 ? mlen : 6;
730 while (--i >= 0)
731 buf[i] = pstr->trans[p[i]];
732 q = buf;
734 /* XXX Don't use mbrtowc, we know which conversion
735 to use (UTF-8 -> UCS4). */
736 memset (&cur_state, 0, sizeof (cur_state));
737 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
738 &cur_state);
739 if (raw + offset - p <= mbclen
740 && mbclen < (size_t) -2)
742 memset (&pstr->cur_state, '\0',
743 sizeof (mbstate_t));
744 pstr->valid_len = mbclen - (raw + offset - p);
745 wc = wc2;
747 break;
751 if (wc == WEOF)
752 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
753 if (wc == WEOF)
754 pstr->tip_context
755 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
756 else
757 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
758 && IS_WIDE_WORD_CHAR (wc))
759 ? CONTEXT_WORD
760 : ((IS_WIDE_NEWLINE (wc)
761 && pstr->newline_anchor)
762 ? CONTEXT_NEWLINE : 0));
763 if (BE (pstr->valid_len, 0))
765 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
766 pstr->wcs[wcs_idx] = WEOF;
767 if (pstr->mbs_allocated)
768 memset (pstr->mbs, 255, pstr->valid_len);
770 pstr->valid_raw_len = pstr->valid_len;
772 else
773 #endif /* RE_ENABLE_I18N */
775 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
776 pstr->valid_raw_len = 0;
777 if (pstr->trans)
778 c = pstr->trans[c];
779 pstr->tip_context = (bitset_contain (pstr->word_char, c)
780 ? CONTEXT_WORD
781 : ((IS_NEWLINE (c) && pstr->newline_anchor)
782 ? CONTEXT_NEWLINE : 0));
785 if (!BE (pstr->mbs_allocated, 0))
786 pstr->mbs += offset;
788 pstr->raw_mbs_idx = idx;
789 pstr->len -= offset;
790 pstr->stop -= offset;
792 /* Then build the buffers. */
793 #ifdef RE_ENABLE_I18N
794 if (pstr->mb_cur_max > 1)
796 if (pstr->icase)
798 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
799 if (BE (ret != REG_NOERROR, 0))
800 return ret;
802 else
803 build_wcs_buffer (pstr);
805 else
806 #endif /* RE_ENABLE_I18N */
807 if (BE (pstr->mbs_allocated, 0))
809 if (pstr->icase)
810 build_upper_buffer (pstr);
811 else if (pstr->trans != NULL)
812 re_string_translate_buffer (pstr);
814 else
815 pstr->valid_len = pstr->len;
817 pstr->cur_idx = 0;
818 return REG_NOERROR;
821 static unsigned char
822 internal_function __attribute ((pure))
823 re_string_peek_byte_case (const re_string_t *pstr, int idx)
825 int ch, off;
827 /* Handle the common (easiest) cases first. */
828 if (BE (!pstr->mbs_allocated, 1))
829 return re_string_peek_byte (pstr, idx);
831 #ifdef RE_ENABLE_I18N
832 if (pstr->mb_cur_max > 1
833 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
834 return re_string_peek_byte (pstr, idx);
835 #endif
837 off = pstr->cur_idx + idx;
838 #ifdef RE_ENABLE_I18N
839 if (pstr->offsets_needed)
840 off = pstr->offsets[off];
841 #endif
843 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
845 #ifdef RE_ENABLE_I18N
846 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
847 this function returns CAPITAL LETTER I instead of first byte of
848 DOTLESS SMALL LETTER I. The latter would confuse the parser,
849 since peek_byte_case doesn't advance cur_idx in any way. */
850 if (pstr->offsets_needed && !isascii (ch))
851 return re_string_peek_byte (pstr, idx);
852 #endif
854 return ch;
857 static unsigned char
858 internal_function __attribute ((pure))
859 re_string_fetch_byte_case (re_string_t *pstr)
861 if (BE (!pstr->mbs_allocated, 1))
862 return re_string_fetch_byte (pstr);
864 #ifdef RE_ENABLE_I18N
865 if (pstr->offsets_needed)
867 int off, ch;
869 /* For tr_TR.UTF-8 [[:islower:]] there is
870 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
871 in that case the whole multi-byte character and return
872 the original letter. On the other side, with
873 [[: DOTLESS SMALL LETTER I return [[:I, as doing
874 anything else would complicate things too much. */
876 if (!re_string_first_byte (pstr, pstr->cur_idx))
877 return re_string_fetch_byte (pstr);
879 off = pstr->offsets[pstr->cur_idx];
880 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
882 if (! isascii (ch))
883 return re_string_fetch_byte (pstr);
885 re_string_skip_bytes (pstr,
886 re_string_char_size_at (pstr, pstr->cur_idx));
887 return ch;
889 #endif
891 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
894 static void
895 internal_function
896 re_string_destruct (re_string_t *pstr)
898 #ifdef RE_ENABLE_I18N
899 re_free (pstr->wcs);
900 re_free (pstr->offsets);
901 #endif /* RE_ENABLE_I18N */
902 if (pstr->mbs_allocated)
903 re_free (pstr->mbs);
906 /* Return the context at IDX in INPUT. */
908 static unsigned int
909 internal_function
910 re_string_context_at (const re_string_t *input, int idx, int eflags)
912 int c;
913 if (BE (idx < 0, 0))
914 /* In this case, we use the value stored in input->tip_context,
915 since we can't know the character in input->mbs[-1] here. */
916 return input->tip_context;
917 if (BE (idx == input->len, 0))
918 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
919 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
920 #ifdef RE_ENABLE_I18N
921 if (input->mb_cur_max > 1)
923 wint_t wc;
924 int wc_idx = idx;
925 while(input->wcs[wc_idx] == WEOF)
927 #ifdef DEBUG
928 /* It must not happen. */
929 assert (wc_idx >= 0);
930 #endif
931 --wc_idx;
932 if (wc_idx < 0)
933 return input->tip_context;
935 wc = input->wcs[wc_idx];
936 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
937 return CONTEXT_WORD;
938 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
939 ? CONTEXT_NEWLINE : 0);
941 else
942 #endif
944 c = re_string_byte_at (input, idx);
945 if (bitset_contain (input->word_char, c))
946 return CONTEXT_WORD;
947 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
951 /* Functions for set operation. */
953 static reg_errcode_t
954 internal_function
955 re_node_set_alloc (re_node_set *set, int size)
957 set->alloc = size;
958 set->nelem = 0;
959 set->elems = re_malloc (int, size);
960 if (BE (set->elems == NULL, 0))
961 return REG_ESPACE;
962 return REG_NOERROR;
965 static reg_errcode_t
966 internal_function
967 re_node_set_init_1 (re_node_set *set, int elem)
969 set->alloc = 1;
970 set->nelem = 1;
971 set->elems = re_malloc (int, 1);
972 if (BE (set->elems == NULL, 0))
974 set->alloc = set->nelem = 0;
975 return REG_ESPACE;
977 set->elems[0] = elem;
978 return REG_NOERROR;
981 static reg_errcode_t
982 internal_function
983 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
985 set->alloc = 2;
986 set->elems = re_malloc (int, 2);
987 if (BE (set->elems == NULL, 0))
988 return REG_ESPACE;
989 if (elem1 == elem2)
991 set->nelem = 1;
992 set->elems[0] = elem1;
994 else
996 set->nelem = 2;
997 if (elem1 < elem2)
999 set->elems[0] = elem1;
1000 set->elems[1] = elem2;
1002 else
1004 set->elems[0] = elem2;
1005 set->elems[1] = elem1;
1008 return REG_NOERROR;
1011 static reg_errcode_t
1012 internal_function
1013 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1015 dest->nelem = src->nelem;
1016 if (src->nelem > 0)
1018 dest->alloc = dest->nelem;
1019 dest->elems = re_malloc (int, dest->alloc);
1020 if (BE (dest->elems == NULL, 0))
1022 dest->alloc = dest->nelem = 0;
1023 return REG_ESPACE;
1025 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1027 else
1028 re_node_set_init_empty (dest);
1029 return REG_NOERROR;
1032 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1033 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1034 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1036 static reg_errcode_t
1037 internal_function
1038 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1039 const re_node_set *src2)
1041 int i1, i2, is, id, delta, sbase;
1042 if (src1->nelem == 0 || src2->nelem == 0)
1043 return REG_NOERROR;
1045 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1046 conservative estimate. */
1047 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1049 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1050 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1051 if (BE (new_elems == NULL, 0))
1052 return REG_ESPACE;
1053 dest->elems = new_elems;
1054 dest->alloc = new_alloc;
1057 /* Find the items in the intersection of SRC1 and SRC2, and copy
1058 into the top of DEST those that are not already in DEST itself. */
1059 sbase = dest->nelem + src1->nelem + src2->nelem;
1060 i1 = src1->nelem - 1;
1061 i2 = src2->nelem - 1;
1062 id = dest->nelem - 1;
1063 for (;;)
1065 if (src1->elems[i1] == src2->elems[i2])
1067 /* Try to find the item in DEST. Maybe we could binary search? */
1068 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1069 --id;
1071 if (id < 0 || dest->elems[id] != src1->elems[i1])
1072 dest->elems[--sbase] = src1->elems[i1];
1074 if (--i1 < 0 || --i2 < 0)
1075 break;
1078 /* Lower the highest of the two items. */
1079 else if (src1->elems[i1] < src2->elems[i2])
1081 if (--i2 < 0)
1082 break;
1084 else
1086 if (--i1 < 0)
1087 break;
1091 id = dest->nelem - 1;
1092 is = dest->nelem + src1->nelem + src2->nelem - 1;
1093 delta = is - sbase + 1;
1095 /* Now copy. When DELTA becomes zero, the remaining
1096 DEST elements are already in place; this is more or
1097 less the same loop that is in re_node_set_merge. */
1098 dest->nelem += delta;
1099 if (delta > 0 && id >= 0)
1100 for (;;)
1102 if (dest->elems[is] > dest->elems[id])
1104 /* Copy from the top. */
1105 dest->elems[id + delta--] = dest->elems[is--];
1106 if (delta == 0)
1107 break;
1109 else
1111 /* Slide from the bottom. */
1112 dest->elems[id + delta] = dest->elems[id];
1113 if (--id < 0)
1114 break;
1118 /* Copy remaining SRC elements. */
1119 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1121 return REG_NOERROR;
1124 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1125 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1127 static reg_errcode_t
1128 internal_function
1129 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1130 const re_node_set *src2)
1132 int i1, i2, id;
1133 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1135 dest->alloc = src1->nelem + src2->nelem;
1136 dest->elems = re_malloc (int, dest->alloc);
1137 if (BE (dest->elems == NULL, 0))
1138 return REG_ESPACE;
1140 else
1142 if (src1 != NULL && src1->nelem > 0)
1143 return re_node_set_init_copy (dest, src1);
1144 else if (src2 != NULL && src2->nelem > 0)
1145 return re_node_set_init_copy (dest, src2);
1146 else
1147 re_node_set_init_empty (dest);
1148 return REG_NOERROR;
1150 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1152 if (src1->elems[i1] > src2->elems[i2])
1154 dest->elems[id++] = src2->elems[i2++];
1155 continue;
1157 if (src1->elems[i1] == src2->elems[i2])
1158 ++i2;
1159 dest->elems[id++] = src1->elems[i1++];
1161 if (i1 < src1->nelem)
1163 memcpy (dest->elems + id, src1->elems + i1,
1164 (src1->nelem - i1) * sizeof (int));
1165 id += src1->nelem - i1;
1167 else if (i2 < src2->nelem)
1169 memcpy (dest->elems + id, src2->elems + i2,
1170 (src2->nelem - i2) * sizeof (int));
1171 id += src2->nelem - i2;
1173 dest->nelem = id;
1174 return REG_NOERROR;
1177 /* Calculate the union set of the sets DEST and SRC. And store it to
1178 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1180 static reg_errcode_t
1181 internal_function
1182 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1184 int is, id, sbase, delta;
1185 if (src == NULL || src->nelem == 0)
1186 return REG_NOERROR;
1187 if (dest->alloc < 2 * src->nelem + dest->nelem)
1189 int new_alloc = 2 * (src->nelem + dest->alloc);
1190 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1191 if (BE (new_buffer == NULL, 0))
1192 return REG_ESPACE;
1193 dest->elems = new_buffer;
1194 dest->alloc = new_alloc;
1197 if (BE (dest->nelem == 0, 0))
1199 dest->nelem = src->nelem;
1200 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1201 return REG_NOERROR;
1204 /* Copy into the top of DEST the items of SRC that are not
1205 found in DEST. Maybe we could binary search in DEST? */
1206 for (sbase = dest->nelem + 2 * src->nelem,
1207 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1209 if (dest->elems[id] == src->elems[is])
1210 is--, id--;
1211 else if (dest->elems[id] < src->elems[is])
1212 dest->elems[--sbase] = src->elems[is--];
1213 else /* if (dest->elems[id] > src->elems[is]) */
1214 --id;
1217 if (is >= 0)
1219 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1220 sbase -= is + 1;
1221 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1224 id = dest->nelem - 1;
1225 is = dest->nelem + 2 * src->nelem - 1;
1226 delta = is - sbase + 1;
1227 if (delta == 0)
1228 return REG_NOERROR;
1230 /* Now copy. When DELTA becomes zero, the remaining
1231 DEST elements are already in place. */
1232 dest->nelem += delta;
1233 for (;;)
1235 if (dest->elems[is] > dest->elems[id])
1237 /* Copy from the top. */
1238 dest->elems[id + delta--] = dest->elems[is--];
1239 if (delta == 0)
1240 break;
1242 else
1244 /* Slide from the bottom. */
1245 dest->elems[id + delta] = dest->elems[id];
1246 if (--id < 0)
1248 /* Copy remaining SRC elements. */
1249 memcpy (dest->elems, dest->elems + sbase,
1250 delta * sizeof (int));
1251 break;
1256 return REG_NOERROR;
1259 /* Insert the new element ELEM to the re_node_set* SET.
1260 SET should not already have ELEM.
1261 return -1 if an error is occured, return 1 otherwise. */
1263 static int
1264 internal_function
1265 re_node_set_insert (re_node_set *set, int elem)
1267 int idx;
1268 /* In case the set is empty. */
1269 if (set->alloc == 0)
1271 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1272 return 1;
1273 else
1274 return -1;
1277 if (BE (set->nelem, 0) == 0)
1279 /* We already guaranteed above that set->alloc != 0. */
1280 set->elems[0] = elem;
1281 ++set->nelem;
1282 return 1;
1285 /* Realloc if we need. */
1286 if (set->alloc == set->nelem)
1288 int *new_elems;
1289 set->alloc = set->alloc * 2;
1290 new_elems = re_realloc (set->elems, int, set->alloc);
1291 if (BE (new_elems == NULL, 0))
1292 return -1;
1293 set->elems = new_elems;
1296 /* Move the elements which follows the new element. Test the
1297 first element separately to skip a check in the inner loop. */
1298 if (elem < set->elems[0])
1300 idx = 0;
1301 for (idx = set->nelem; idx > 0; idx--)
1302 set->elems[idx] = set->elems[idx - 1];
1304 else
1306 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1307 set->elems[idx] = set->elems[idx - 1];
1310 /* Insert the new element. */
1311 set->elems[idx] = elem;
1312 ++set->nelem;
1313 return 1;
1316 /* Insert the new element ELEM to the re_node_set* SET.
1317 SET should not already have any element greater than or equal to ELEM.
1318 Return -1 if an error is occured, return 1 otherwise. */
1320 static int
1321 internal_function
1322 re_node_set_insert_last (re_node_set *set, int elem)
1324 /* Realloc if we need. */
1325 if (set->alloc == set->nelem)
1327 int *new_elems;
1328 set->alloc = (set->alloc + 1) * 2;
1329 new_elems = re_realloc (set->elems, int, set->alloc);
1330 if (BE (new_elems == NULL, 0))
1331 return -1;
1332 set->elems = new_elems;
1335 /* Insert the new element. */
1336 set->elems[set->nelem++] = elem;
1337 return 1;
1340 /* Compare two node sets SET1 and SET2.
1341 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1343 static int
1344 internal_function __attribute ((pure))
1345 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1347 int i;
1348 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1349 return 0;
1350 for (i = set1->nelem ; --i >= 0 ; )
1351 if (set1->elems[i] != set2->elems[i])
1352 return 0;
1353 return 1;
1356 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1358 static int
1359 internal_function __attribute ((pure))
1360 re_node_set_contains (const re_node_set *set, int elem)
1362 unsigned int idx, right, mid;
1363 if (set->nelem <= 0)
1364 return 0;
1366 /* Binary search the element. */
1367 idx = 0;
1368 right = set->nelem - 1;
1369 while (idx < right)
1371 mid = (idx + right) / 2;
1372 if (set->elems[mid] < elem)
1373 idx = mid + 1;
1374 else
1375 right = mid;
1377 return set->elems[idx] == elem ? idx + 1 : 0;
1380 static void
1381 internal_function
1382 re_node_set_remove_at (re_node_set *set, int idx)
1384 if (idx < 0 || idx >= set->nelem)
1385 return;
1386 --set->nelem;
1387 for (; idx < set->nelem; idx++)
1388 set->elems[idx] = set->elems[idx + 1];
1392 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1393 Or return -1, if an error will be occured. */
1395 static int
1396 internal_function
1397 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1399 int type = token.type;
1400 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1402 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1403 int *new_nexts, *new_indices;
1404 re_node_set *new_edests, *new_eclosures;
1405 re_token_t *new_nodes;
1407 /* Avoid overflows. */
1408 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1409 return -1;
1411 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1412 if (BE (new_nodes == NULL, 0))
1413 return -1;
1414 dfa->nodes = new_nodes;
1415 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1416 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1417 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1418 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1419 if (BE (new_nexts == NULL || new_indices == NULL
1420 || new_edests == NULL || new_eclosures == NULL, 0))
1421 return -1;
1422 dfa->nexts = new_nexts;
1423 dfa->org_indices = new_indices;
1424 dfa->edests = new_edests;
1425 dfa->eclosures = new_eclosures;
1426 dfa->nodes_alloc = new_nodes_alloc;
1428 dfa->nodes[dfa->nodes_len] = token;
1429 dfa->nodes[dfa->nodes_len].constraint = 0;
1430 #ifdef RE_ENABLE_I18N
1431 dfa->nodes[dfa->nodes_len].accept_mb =
1432 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1433 #endif
1434 dfa->nexts[dfa->nodes_len] = -1;
1435 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1436 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1437 return dfa->nodes_len++;
1440 static inline unsigned int
1441 internal_function
1442 calc_state_hash (const re_node_set *nodes, unsigned int context)
1444 unsigned int hash = nodes->nelem + context;
1445 int i;
1446 for (i = 0 ; i < nodes->nelem ; i++)
1447 hash += nodes->elems[i];
1448 return hash;
1451 /* Search for the state whose node_set is equivalent to NODES.
1452 Return the pointer to the state, if we found it in the DFA.
1453 Otherwise create the new one and return it. In case of an error
1454 return NULL and set the error code in ERR.
1455 Note: - We assume NULL as the invalid state, then it is possible that
1456 return value is NULL and ERR is REG_NOERROR.
1457 - We never return non-NULL value in case of any errors, it is for
1458 optimization. */
1460 static re_dfastate_t *
1461 internal_function
1462 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1463 const re_node_set *nodes)
1465 unsigned int hash;
1466 re_dfastate_t *new_state;
1467 struct re_state_table_entry *spot;
1468 int i;
1469 if (BE (nodes->nelem == 0, 0))
1471 *err = REG_NOERROR;
1472 return NULL;
1474 hash = calc_state_hash (nodes, 0);
1475 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1477 for (i = 0 ; i < spot->num ; i++)
1479 re_dfastate_t *state = spot->array[i];
1480 if (hash != state->hash)
1481 continue;
1482 if (re_node_set_compare (&state->nodes, nodes))
1483 return state;
1486 /* There are no appropriate state in the dfa, create the new one. */
1487 new_state = create_ci_newstate (dfa, nodes, hash);
1488 if (BE (new_state == NULL, 0))
1489 *err = REG_ESPACE;
1491 return new_state;
1494 /* Search for the state whose node_set is equivalent to NODES and
1495 whose context is equivalent to CONTEXT.
1496 Return the pointer to the state, if we found it in the DFA.
1497 Otherwise create the new one and return it. In case of an error
1498 return NULL and set the error code in ERR.
1499 Note: - We assume NULL as the invalid state, then it is possible that
1500 return value is NULL and ERR is REG_NOERROR.
1501 - We never return non-NULL value in case of any errors, it is for
1502 optimization. */
1504 static re_dfastate_t *
1505 internal_function
1506 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1507 const re_node_set *nodes, unsigned int context)
1509 unsigned int hash;
1510 re_dfastate_t *new_state;
1511 struct re_state_table_entry *spot;
1512 int i;
1513 if (nodes->nelem == 0)
1515 *err = REG_NOERROR;
1516 return NULL;
1518 hash = calc_state_hash (nodes, context);
1519 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1521 for (i = 0 ; i < spot->num ; i++)
1523 re_dfastate_t *state = spot->array[i];
1524 if (state->hash == hash
1525 && state->context == context
1526 && re_node_set_compare (state->entrance_nodes, nodes))
1527 return state;
1529 /* There are no appropriate state in `dfa', create the new one. */
1530 new_state = create_cd_newstate (dfa, nodes, context, hash);
1531 if (BE (new_state == NULL, 0))
1532 *err = REG_ESPACE;
1534 return new_state;
1537 /* Finish initialization of the new state NEWSTATE, and using its hash value
1538 HASH put in the appropriate bucket of DFA's state table. Return value
1539 indicates the error code if failed. */
1541 static reg_errcode_t
1542 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1543 unsigned int hash)
1545 struct re_state_table_entry *spot;
1546 reg_errcode_t err;
1547 int i;
1549 newstate->hash = hash;
1550 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1551 if (BE (err != REG_NOERROR, 0))
1552 return REG_ESPACE;
1553 for (i = 0; i < newstate->nodes.nelem; i++)
1555 int elem = newstate->nodes.elems[i];
1556 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1557 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1560 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1561 if (BE (spot->alloc <= spot->num, 0))
1563 int new_alloc = 2 * spot->num + 2;
1564 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1565 new_alloc);
1566 if (BE (new_array == NULL, 0))
1567 return REG_ESPACE;
1568 spot->array = new_array;
1569 spot->alloc = new_alloc;
1571 spot->array[spot->num++] = newstate;
1572 return REG_NOERROR;
1575 static void
1576 free_state (re_dfastate_t *state)
1578 re_node_set_free (&state->non_eps_nodes);
1579 re_node_set_free (&state->inveclosure);
1580 if (state->entrance_nodes != &state->nodes)
1582 re_node_set_free (state->entrance_nodes);
1583 re_free (state->entrance_nodes);
1585 re_node_set_free (&state->nodes);
1586 re_free (state->word_trtable);
1587 re_free (state->trtable);
1588 re_free (state);
1591 /* Create the new state which is independ of contexts.
1592 Return the new state if succeeded, otherwise return NULL. */
1594 static re_dfastate_t *
1595 internal_function
1596 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1597 unsigned int hash)
1599 int i;
1600 reg_errcode_t err;
1601 re_dfastate_t *newstate;
1603 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1604 if (BE (newstate == NULL, 0))
1605 return NULL;
1606 err = re_node_set_init_copy (&newstate->nodes, nodes);
1607 if (BE (err != REG_NOERROR, 0))
1609 re_free (newstate);
1610 return NULL;
1613 newstate->entrance_nodes = &newstate->nodes;
1614 for (i = 0 ; i < nodes->nelem ; i++)
1616 re_token_t *node = dfa->nodes + nodes->elems[i];
1617 re_token_type_t type = node->type;
1618 if (type == CHARACTER && !node->constraint)
1619 continue;
1620 #ifdef RE_ENABLE_I18N
1621 newstate->accept_mb |= node->accept_mb;
1622 #endif /* RE_ENABLE_I18N */
1624 /* If the state has the halt node, the state is a halt state. */
1625 if (type == END_OF_RE)
1626 newstate->halt = 1;
1627 else if (type == OP_BACK_REF)
1628 newstate->has_backref = 1;
1629 else if (type == ANCHOR || node->constraint)
1630 newstate->has_constraint = 1;
1632 err = register_state (dfa, newstate, hash);
1633 if (BE (err != REG_NOERROR, 0))
1635 free_state (newstate);
1636 newstate = NULL;
1638 return newstate;
1641 /* Create the new state which is depend on the context CONTEXT.
1642 Return the new state if succeeded, otherwise return NULL. */
1644 static re_dfastate_t *
1645 internal_function
1646 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1647 unsigned int context, unsigned int hash)
1649 int i, nctx_nodes = 0;
1650 reg_errcode_t err;
1651 re_dfastate_t *newstate;
1653 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1654 if (BE (newstate == NULL, 0))
1655 return NULL;
1656 err = re_node_set_init_copy (&newstate->nodes, nodes);
1657 if (BE (err != REG_NOERROR, 0))
1659 re_free (newstate);
1660 return NULL;
1663 newstate->context = context;
1664 newstate->entrance_nodes = &newstate->nodes;
1666 for (i = 0 ; i < nodes->nelem ; i++)
1668 re_token_t *node = dfa->nodes + nodes->elems[i];
1669 re_token_type_t type = node->type;
1670 unsigned int constraint = node->constraint;
1672 if (type == CHARACTER && !constraint)
1673 continue;
1674 #ifdef RE_ENABLE_I18N
1675 newstate->accept_mb |= node->accept_mb;
1676 #endif /* RE_ENABLE_I18N */
1678 /* If the state has the halt node, the state is a halt state. */
1679 if (type == END_OF_RE)
1680 newstate->halt = 1;
1681 else if (type == OP_BACK_REF)
1682 newstate->has_backref = 1;
1684 if (constraint)
1686 if (newstate->entrance_nodes == &newstate->nodes)
1688 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1689 if (BE (newstate->entrance_nodes == NULL, 0))
1691 free_state (newstate);
1692 return NULL;
1694 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1695 nctx_nodes = 0;
1696 newstate->has_constraint = 1;
1699 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1701 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1702 ++nctx_nodes;
1706 err = register_state (dfa, newstate, hash);
1707 if (BE (err != REG_NOERROR, 0))
1709 free_state (newstate);
1710 newstate = NULL;
1712 return newstate;