2004-09-08 Roland McGrath <roland@redhat.com>
[glibc.git] / posix / regex_internal.c
blob96f113745f0ab8d63d07c6cb80feb0ee99a15d0a
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 #ifdef RE_ENABLE_I18N
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int hash) internal_function;
32 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int hash) internal_function;
37 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
38 const re_node_set *nodes,
39 unsigned int context,
40 unsigned int hash) internal_function;
41 static unsigned int inline calc_state_hash (const re_node_set *nodes,
42 unsigned int context) internal_function;
44 /* Functions for string operation. */
46 /* This function allocate the buffers. It is necessary to call
47 re_string_reconstruct before using the object. */
49 static reg_errcode_t
50 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
51 re_string_t *pstr;
52 const char *str;
53 int len, init_len, icase;
54 RE_TRANSLATE_TYPE trans;
55 const re_dfa_t *dfa;
57 reg_errcode_t ret;
58 int init_buf_len;
60 /* Ensure at least one character fits into the buffers. */
61 if (init_len < dfa->mb_cur_max)
62 init_len = dfa->mb_cur_max;
63 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
64 re_string_construct_common (str, len, pstr, trans, icase, dfa);
66 ret = re_string_realloc_buffers (pstr, init_buf_len);
67 if (BE (ret != REG_NOERROR, 0))
68 return ret;
70 pstr->word_char = dfa->word_char;
71 pstr->word_ops_used = dfa->word_ops_used;
72 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
73 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
74 pstr->valid_raw_len = pstr->valid_len;
75 return REG_NOERROR;
78 /* This function allocate the buffers, and initialize them. */
80 static reg_errcode_t
81 re_string_construct (pstr, str, len, trans, icase, dfa)
82 re_string_t *pstr;
83 const char *str;
84 int len, icase;
85 RE_TRANSLATE_TYPE trans;
86 const re_dfa_t *dfa;
88 reg_errcode_t ret;
89 memset (pstr, '\0', sizeof (re_string_t));
90 re_string_construct_common (str, len, pstr, trans, icase, dfa);
92 if (len > 0)
94 ret = re_string_realloc_buffers (pstr, len + 1);
95 if (BE (ret != REG_NOERROR, 0))
96 return ret;
98 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
100 if (icase)
102 #ifdef RE_ENABLE_I18N
103 if (dfa->mb_cur_max > 1)
105 while (1)
107 ret = build_wcs_upper_buffer (pstr);
108 if (BE (ret != REG_NOERROR, 0))
109 return ret;
110 if (pstr->valid_raw_len >= len)
111 break;
112 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
113 break;
114 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
115 if (BE (ret != REG_NOERROR, 0))
116 return ret;
119 else
120 #endif /* RE_ENABLE_I18N */
121 build_upper_buffer (pstr);
123 else
125 #ifdef RE_ENABLE_I18N
126 if (dfa->mb_cur_max > 1)
127 build_wcs_buffer (pstr);
128 else
129 #endif /* RE_ENABLE_I18N */
131 if (trans != NULL)
132 re_string_translate_buffer (pstr);
133 else
135 pstr->valid_len = pstr->bufs_len;
136 pstr->valid_raw_len = pstr->bufs_len;
141 return REG_NOERROR;
144 /* Helper functions for re_string_allocate, and re_string_construct. */
146 static reg_errcode_t
147 re_string_realloc_buffers (pstr, new_buf_len)
148 re_string_t *pstr;
149 int new_buf_len;
151 #ifdef RE_ENABLE_I18N
152 if (pstr->mb_cur_max > 1)
154 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
155 if (BE (new_array == NULL, 0))
156 return REG_ESPACE;
157 pstr->wcs = new_array;
158 if (pstr->offsets != NULL)
160 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
161 if (BE (new_array == NULL, 0))
162 return REG_ESPACE;
163 pstr->offsets = new_array;
166 #endif /* RE_ENABLE_I18N */
167 if (pstr->mbs_allocated)
169 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
170 new_buf_len);
171 if (BE (new_array == NULL, 0))
172 return REG_ESPACE;
173 pstr->mbs = new_array;
175 pstr->bufs_len = new_buf_len;
176 return REG_NOERROR;
180 static void
181 re_string_construct_common (str, len, pstr, trans, icase, dfa)
182 const char *str;
183 int len;
184 re_string_t *pstr;
185 RE_TRANSLATE_TYPE trans;
186 int icase;
187 const re_dfa_t *dfa;
189 pstr->raw_mbs = (const unsigned char *) str;
190 pstr->len = len;
191 pstr->raw_len = len;
192 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
193 pstr->icase = icase ? 1 : 0;
194 pstr->mbs_allocated = (trans != NULL || icase);
195 pstr->mb_cur_max = dfa->mb_cur_max;
196 pstr->is_utf8 = dfa->is_utf8;
197 pstr->map_notascii = dfa->map_notascii;
198 pstr->stop = pstr->len;
199 pstr->raw_stop = pstr->stop;
202 #ifdef RE_ENABLE_I18N
204 /* Build wide character buffer PSTR->WCS.
205 If the byte sequence of the string are:
206 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
207 Then wide character buffer will be:
208 <wc1> , WEOF , <wc2> , WEOF , <wc3>
209 We use WEOF for padding, they indicate that the position isn't
210 a first byte of a multibyte character.
212 Note that this function assumes PSTR->VALID_LEN elements are already
213 built and starts from PSTR->VALID_LEN. */
215 static void
216 build_wcs_buffer (pstr)
217 re_string_t *pstr;
219 #ifdef _LIBC
220 unsigned char buf[pstr->mb_cur_max];
221 #else
222 unsigned char buf[64];
223 #endif
224 mbstate_t prev_st;
225 int byte_idx, end_idx, mbclen, remain_len;
227 /* Build the buffers from pstr->valid_len to either pstr->len or
228 pstr->bufs_len. */
229 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
230 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
232 wchar_t wc;
233 const char *p;
235 remain_len = end_idx - byte_idx;
236 prev_st = pstr->cur_state;
237 /* Apply the translation if we need. */
238 if (BE (pstr->trans != NULL, 0))
240 int i, ch;
242 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
244 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
245 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
247 p = (const char *) buf;
249 else
250 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
251 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
252 if (BE (mbclen == (size_t) -2, 0))
254 /* The buffer doesn't have enough space, finish to build. */
255 pstr->cur_state = prev_st;
256 break;
258 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
260 /* We treat these cases as a singlebyte character. */
261 mbclen = 1;
262 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
263 if (BE (pstr->trans != NULL, 0))
264 wc = pstr->trans[wc];
265 pstr->cur_state = prev_st;
268 /* Write wide character and padding. */
269 pstr->wcs[byte_idx++] = wc;
270 /* Write paddings. */
271 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 pstr->wcs[byte_idx++] = WEOF;
274 pstr->valid_len = byte_idx;
275 pstr->valid_raw_len = byte_idx;
278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279 but for REG_ICASE. */
281 static int
282 build_wcs_upper_buffer (pstr)
283 re_string_t *pstr;
285 mbstate_t prev_st;
286 int src_idx, byte_idx, end_idx, mbclen, remain_len;
287 #ifdef _LIBC
288 unsigned char buf[pstr->mb_cur_max];
289 #else
290 unsigned char buf[64];
291 #endif
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296 #ifdef _LIBC
297 /* The following optimization assumes that the wchar_t encoding is
298 always ISO 10646. */
299 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301 while (byte_idx < end_idx)
303 wchar_t wc;
305 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306 && mbsinit (&pstr->cur_state))
308 /* In case of a singlebyte character. */
309 pstr->mbs[byte_idx]
310 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311 /* The next step uses the assumption that wchar_t is encoded
312 with ISO 10646: all ASCII values can be converted like
313 this. */
314 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
315 ++byte_idx;
316 continue;
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
321 mbclen = mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
324 if (BE (mbclen > 0, 1))
326 wchar_t wcu = wc;
327 if (iswlower (wc))
329 int mbcdlen;
331 wcu = towupper (wc);
332 mbcdlen = wcrtomb (buf, wcu, &prev_st);
333 if (BE (mbclen == mbcdlen, 1))
334 memcpy (pstr->mbs + byte_idx, buf, mbclen);
335 else
337 src_idx = byte_idx;
338 goto offsets_needed;
341 else
342 memcpy (pstr->mbs + byte_idx,
343 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344 pstr->wcs[byte_idx++] = wcu;
345 /* Write paddings. */
346 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347 pstr->wcs[byte_idx++] = WEOF;
349 else if (mbclen == (size_t) -1 || mbclen == 0)
351 /* It is an invalid character or '\0'. Just use the byte. */
352 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
356 if (BE (mbclen == (size_t) -1, 0))
357 pstr->cur_state = prev_st;
359 else
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
363 break;
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
368 return REG_NOERROR;
370 else
371 #endif
372 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
374 wchar_t wc;
375 const char *p;
376 #ifdef _LIBC
377 offsets_needed:
378 #endif
379 remain_len = end_idx - byte_idx;
380 prev_st = pstr->cur_state;
381 if (BE (pstr->trans != NULL, 0))
383 int i, ch;
385 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
387 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
388 buf[i] = pstr->trans[ch];
390 p = (const char *) buf;
392 else
393 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
394 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
395 if (BE (mbclen > 0, 1))
397 wchar_t wcu = wc;
398 if (iswlower (wc))
400 int mbcdlen;
402 wcu = towupper (wc);
403 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
404 if (BE (mbclen == mbcdlen, 1))
405 memcpy (pstr->mbs + byte_idx, buf, mbclen);
406 else
408 int i;
410 if (byte_idx + mbcdlen > pstr->bufs_len)
412 pstr->cur_state = prev_st;
413 break;
416 if (pstr->offsets == NULL)
418 pstr->offsets = re_malloc (int, pstr->bufs_len);
420 if (pstr->offsets == NULL)
421 return REG_ESPACE;
423 if (!pstr->offsets_needed)
425 for (i = 0; i < byte_idx; ++i)
426 pstr->offsets[i] = i;
427 pstr->offsets_needed = 1;
430 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
431 pstr->wcs[byte_idx] = wcu;
432 pstr->offsets[byte_idx] = src_idx;
433 for (i = 1; i < mbcdlen; ++i)
435 pstr->offsets[byte_idx + i]
436 = src_idx + (i < mbclen ? i : mbclen - 1);
437 pstr->wcs[byte_idx + i] = WEOF;
439 pstr->len += mbcdlen - mbclen;
440 if (pstr->raw_stop > src_idx)
441 pstr->stop += mbcdlen - mbclen;
442 end_idx = (pstr->bufs_len > pstr->len)
443 ? pstr->len : pstr->bufs_len;
444 byte_idx += mbcdlen;
445 src_idx += mbclen;
446 continue;
449 else
450 memcpy (pstr->mbs + byte_idx, p, mbclen);
452 if (BE (pstr->offsets_needed != 0, 0))
454 int i;
455 for (i = 0; i < mbclen; ++i)
456 pstr->offsets[byte_idx + i] = src_idx + i;
458 src_idx += mbclen;
460 pstr->wcs[byte_idx++] = wcu;
461 /* Write paddings. */
462 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
463 pstr->wcs[byte_idx++] = WEOF;
465 else if (mbclen == (size_t) -1 || mbclen == 0)
467 /* It is an invalid character or '\0'. Just use the byte. */
468 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
470 if (BE (pstr->trans != NULL, 0))
471 ch = pstr->trans [ch];
472 pstr->mbs[byte_idx] = ch;
474 if (BE (pstr->offsets_needed != 0, 0))
475 pstr->offsets[byte_idx] = src_idx;
476 ++src_idx;
478 /* And also cast it to wide char. */
479 pstr->wcs[byte_idx++] = (wchar_t) ch;
480 if (BE (mbclen == (size_t) -1, 0))
481 pstr->cur_state = prev_st;
483 else
485 /* The buffer doesn't have enough space, finish to build. */
486 pstr->cur_state = prev_st;
487 break;
490 pstr->valid_len = byte_idx;
491 pstr->valid_raw_len = src_idx;
492 return REG_NOERROR;
495 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
496 Return the index. */
498 static int
499 re_string_skip_chars (pstr, new_raw_idx, last_wc)
500 re_string_t *pstr;
501 int new_raw_idx;
502 wint_t *last_wc;
504 mbstate_t prev_st;
505 int rawbuf_idx, mbclen;
506 wchar_t wc = 0;
508 /* Skip the characters which are not necessary to check. */
509 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
510 rawbuf_idx < new_raw_idx;)
512 int remain_len;
513 remain_len = pstr->len - rawbuf_idx;
514 prev_st = pstr->cur_state;
515 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
516 remain_len, &pstr->cur_state);
517 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
519 /* We treat these cases as a singlebyte character. */
520 mbclen = 1;
521 pstr->cur_state = prev_st;
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
526 *last_wc = (wint_t) wc;
527 return rawbuf_idx;
529 #endif /* RE_ENABLE_I18N */
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
534 static void
535 build_upper_buffer (pstr)
536 re_string_t *pstr;
538 int char_idx, end_idx;
539 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
543 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544 if (BE (pstr->trans != NULL, 0))
545 ch = pstr->trans[ch];
546 if (islower (ch))
547 pstr->mbs[char_idx] = toupper (ch);
548 else
549 pstr->mbs[char_idx] = ch;
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
555 /* Apply TRANS to the buffer in PSTR. */
557 static void
558 re_string_translate_buffer (pstr)
559 re_string_t *pstr;
561 int buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
574 /* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
578 static reg_errcode_t
579 re_string_reconstruct (pstr, idx, eflags)
580 re_string_t *pstr;
581 int idx, eflags;
583 int offset = idx - pstr->raw_mbs_idx;
584 if (offset < 0)
586 /* Reset buffer. */
587 #ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590 #endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
604 if (offset != 0)
606 /* Are the characters which are already checked remain? */
607 if (offset < pstr->valid_raw_len
608 #ifdef RE_ENABLE_I18N
609 /* Handling this would enlarge the code too much.
610 Accept a slowdown in that case. */
611 && pstr->offsets_needed == 0
612 #endif
615 /* Yes, move them to the front of the buffer. */
616 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
617 #ifdef RE_ENABLE_I18N
618 if (pstr->mb_cur_max > 1)
619 memmove (pstr->wcs, pstr->wcs + offset,
620 (pstr->valid_len - offset) * sizeof (wint_t));
621 #endif /* RE_ENABLE_I18N */
622 if (pstr->mbs_allocated)
623 memmove (pstr->mbs, pstr->mbs + offset,
624 pstr->valid_len - offset);
625 pstr->valid_len -= offset;
626 pstr->valid_raw_len -= offset;
627 #if DEBUG
628 assert (pstr->valid_len > 0);
629 #endif
631 else
633 /* No, skip all characters until IDX. */
634 #ifdef RE_ENABLE_I18N
635 if (BE (pstr->offsets_needed, 0))
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
641 #endif
642 pstr->valid_len = 0;
643 pstr->valid_raw_len = 0;
644 #ifdef RE_ENABLE_I18N
645 if (pstr->mb_cur_max > 1)
647 int wcs_idx;
648 wint_t wc = WEOF;
650 #ifdef _LIBC
651 if (pstr->is_utf8)
653 const unsigned char *raw, *p, *q, *end;
655 /* Special case UTF-8. Multi-byte chars start with any
656 byte other than 0x80 - 0xbf. */
657 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
658 end = raw + (offset - pstr->mb_cur_max);
659 for (p = raw + offset - 1; p >= end; --p)
660 if ((*p & 0xc0) != 0x80)
662 mbstate_t cur_state;
663 wchar_t wc2;
664 int mlen = raw + pstr->len - p;
665 unsigned char buf[6];
667 q = p;
668 if (BE (pstr->trans != NULL, 0))
670 int i = mlen < 6 ? mlen : 6;
671 while (--i >= 0)
672 buf[i] = pstr->trans[p[i]];
673 q = buf;
675 /* XXX Don't use mbrtowc, we know which conversion
676 to use (UTF-8 -> UCS4). */
677 memset (&cur_state, 0, sizeof (cur_state));
678 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
679 - (raw + offset - p);
680 if (mlen >= 0)
682 memset (&pstr->cur_state, '\0',
683 sizeof (mbstate_t));
684 pstr->valid_len = mlen;
685 wc = wc2;
687 break;
690 #endif
691 if (wc == WEOF)
692 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
693 if (BE (pstr->valid_len, 0))
695 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
696 pstr->wcs[wcs_idx] = WEOF;
697 if (pstr->mbs_allocated)
698 memset (pstr->mbs, 255, pstr->valid_len);
700 pstr->valid_raw_len = pstr->valid_len;
701 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
702 && IS_WIDE_WORD_CHAR (wc))
703 ? CONTEXT_WORD
704 : ((IS_WIDE_NEWLINE (wc)
705 && pstr->newline_anchor)
706 ? CONTEXT_NEWLINE : 0));
708 else
709 #endif /* RE_ENABLE_I18N */
711 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
712 if (pstr->trans)
713 c = pstr->trans[c];
714 pstr->tip_context = (bitset_contain (pstr->word_char, c)
715 ? CONTEXT_WORD
716 : ((IS_NEWLINE (c) && pstr->newline_anchor)
717 ? CONTEXT_NEWLINE : 0));
720 if (!pstr->mbs_allocated)
721 pstr->mbs += offset;
723 pstr->raw_mbs_idx = idx;
724 pstr->len -= offset;
725 pstr->stop -= offset;
727 /* Then build the buffers. */
728 #ifdef RE_ENABLE_I18N
729 if (pstr->mb_cur_max > 1)
731 if (pstr->icase)
733 int ret = build_wcs_upper_buffer (pstr);
734 if (BE (ret != REG_NOERROR, 0))
735 return ret;
737 else
738 build_wcs_buffer (pstr);
740 else
741 #endif /* RE_ENABLE_I18N */
743 if (pstr->icase)
744 build_upper_buffer (pstr);
745 else if (pstr->trans != NULL)
746 re_string_translate_buffer (pstr);
747 else
748 pstr->valid_len = pstr->len;
750 pstr->cur_idx = 0;
752 return REG_NOERROR;
755 static unsigned char
756 re_string_peek_byte_case (pstr, idx)
757 const re_string_t *pstr;
758 int idx;
760 int ch, off;
762 /* Handle the common (easiest) cases first. */
763 if (BE (!pstr->mbs_allocated, 1))
764 return re_string_peek_byte (pstr, idx);
766 #ifdef RE_ENABLE_I18N
767 if (pstr->mb_cur_max > 1
768 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769 return re_string_peek_byte (pstr, idx);
770 #endif
772 off = pstr->cur_idx + idx;
773 #ifdef RE_ENABLE_I18N
774 if (pstr->offsets_needed)
775 off = pstr->offsets[off];
776 #endif
778 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
780 #ifdef RE_ENABLE_I18N
781 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782 this function returns CAPITAL LETTER I instead of first byte of
783 DOTLESS SMALL LETTER I. The latter would confuse the parser,
784 since peek_byte_case doesn't advance cur_idx in any way. */
785 if (pstr->offsets_needed && !isascii (ch))
786 return re_string_peek_byte (pstr, idx);
787 #endif
789 return ch;
792 static unsigned char
793 re_string_fetch_byte_case (pstr)
794 re_string_t *pstr;
796 if (BE (!pstr->mbs_allocated, 1))
797 return re_string_fetch_byte (pstr);
799 #ifdef RE_ENABLE_I18N
800 if (pstr->offsets_needed)
802 int off, ch;
804 /* For tr_TR.UTF-8 [[:islower:]] there is
805 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
806 in that case the whole multi-byte character and return
807 the original letter. On the other side, with
808 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809 anything else would complicate things too much. */
811 if (!re_string_first_byte (pstr, pstr->cur_idx))
812 return re_string_fetch_byte (pstr);
814 off = pstr->offsets[pstr->cur_idx];
815 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
817 if (! isascii (ch))
818 return re_string_fetch_byte (pstr);
820 re_string_skip_bytes (pstr,
821 re_string_char_size_at (pstr, pstr->cur_idx));
822 return ch;
824 #endif
826 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
829 static void
830 re_string_destruct (pstr)
831 re_string_t *pstr;
833 #ifdef RE_ENABLE_I18N
834 re_free (pstr->wcs);
835 re_free (pstr->offsets);
836 #endif /* RE_ENABLE_I18N */
837 if (pstr->mbs_allocated)
838 re_free (pstr->mbs);
841 /* Return the context at IDX in INPUT. */
843 static unsigned int
844 re_string_context_at (input, idx, eflags)
845 const re_string_t *input;
846 int idx, eflags;
848 int c;
849 if (idx < 0 || idx == input->len)
851 if (idx < 0)
852 /* In this case, we use the value stored in input->tip_context,
853 since we can't know the character in input->mbs[-1] here. */
854 return input->tip_context;
855 else /* (idx == input->len) */
856 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
857 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
859 #ifdef RE_ENABLE_I18N
860 if (input->mb_cur_max > 1)
862 wint_t wc;
863 int wc_idx = idx;
864 while(input->wcs[wc_idx] == WEOF)
866 #ifdef DEBUG
867 /* It must not happen. */
868 assert (wc_idx >= 0);
869 #endif
870 --wc_idx;
871 if (wc_idx < 0)
872 return input->tip_context;
874 wc = input->wcs[wc_idx];
875 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
876 return CONTEXT_WORD;
877 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
878 ? CONTEXT_NEWLINE : 0);
880 else
881 #endif
883 c = re_string_byte_at (input, idx);
884 if (bitset_contain (input->word_char, c))
885 return CONTEXT_WORD;
886 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
890 /* Functions for set operation. */
892 static reg_errcode_t
893 re_node_set_alloc (set, size)
894 re_node_set *set;
895 int size;
897 set->alloc = size;
898 set->nelem = 0;
899 set->elems = re_malloc (int, size);
900 if (BE (set->elems == NULL, 0))
901 return REG_ESPACE;
902 return REG_NOERROR;
905 static reg_errcode_t
906 re_node_set_init_1 (set, elem)
907 re_node_set *set;
908 int elem;
910 set->alloc = 1;
911 set->nelem = 1;
912 set->elems = re_malloc (int, 1);
913 if (BE (set->elems == NULL, 0))
915 set->alloc = set->nelem = 0;
916 return REG_ESPACE;
918 set->elems[0] = elem;
919 return REG_NOERROR;
922 static reg_errcode_t
923 re_node_set_init_2 (set, elem1, elem2)
924 re_node_set *set;
925 int elem1, elem2;
927 set->alloc = 2;
928 set->elems = re_malloc (int, 2);
929 if (BE (set->elems == NULL, 0))
930 return REG_ESPACE;
931 if (elem1 == elem2)
933 set->nelem = 1;
934 set->elems[0] = elem1;
936 else
938 set->nelem = 2;
939 if (elem1 < elem2)
941 set->elems[0] = elem1;
942 set->elems[1] = elem2;
944 else
946 set->elems[0] = elem2;
947 set->elems[1] = elem1;
950 return REG_NOERROR;
953 static reg_errcode_t
954 re_node_set_init_copy (dest, src)
955 re_node_set *dest;
956 const re_node_set *src;
958 dest->nelem = src->nelem;
959 if (src->nelem > 0)
961 dest->alloc = dest->nelem;
962 dest->elems = re_malloc (int, dest->alloc);
963 if (BE (dest->elems == NULL, 0))
965 dest->alloc = dest->nelem = 0;
966 return REG_ESPACE;
968 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
970 else
971 re_node_set_init_empty (dest);
972 return REG_NOERROR;
975 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
976 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
977 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
979 static reg_errcode_t
980 re_node_set_add_intersect (dest, src1, src2)
981 re_node_set *dest;
982 const re_node_set *src1, *src2;
984 int i1, i2, is, id, delta, sbase;
985 if (src1->nelem == 0 || src2->nelem == 0)
986 return REG_NOERROR;
988 /* We need dest->nelem + 2 * elems_in_intersection; this is a
989 conservative estimate. */
990 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
992 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
993 int *new_elems = re_realloc (dest->elems, int, new_alloc);
994 if (BE (new_elems == NULL, 0))
995 return REG_ESPACE;
996 dest->elems = new_elems;
997 dest->alloc = new_alloc;
1000 /* Find the items in the intersection of SRC1 and SRC2, and copy
1001 into the top of DEST those that are not already in DEST itself. */
1002 sbase = dest->nelem + src1->nelem + src2->nelem;
1003 i1 = src1->nelem - 1;
1004 i2 = src2->nelem - 1;
1005 id = dest->nelem - 1;
1006 for (;;)
1008 if (src1->elems[i1] == src2->elems[i2])
1010 /* Try to find the item in DEST. Maybe we could binary search? */
1011 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1012 --id;
1014 if (id < 0 || dest->elems[id] != src1->elems[i1])
1015 dest->elems[--sbase] = src1->elems[i1];
1017 if (--i1 < 0 || --i2 < 0)
1018 break;
1021 /* Lower the highest of the two items. */
1022 else if (src1->elems[i1] < src2->elems[i2])
1024 if (--i2 < 0)
1025 break;
1027 else
1029 if (--i1 < 0)
1030 break;
1034 id = dest->nelem - 1;
1035 is = dest->nelem + src1->nelem + src2->nelem - 1;
1036 delta = is - sbase + 1;
1038 /* Now copy. When DELTA becomes zero, the remaining
1039 DEST elements are already in place; this is more or
1040 less the same loop that is in re_node_set_merge. */
1041 dest->nelem += delta;
1042 if (delta > 0 && id >= 0)
1043 for (;;)
1045 if (dest->elems[is] > dest->elems[id])
1047 /* Copy from the top. */
1048 dest->elems[id + delta--] = dest->elems[is--];
1049 if (delta == 0)
1050 break;
1052 else
1054 /* Slide from the bottom. */
1055 dest->elems[id + delta] = dest->elems[id];
1056 if (--id < 0)
1057 break;
1061 /* Copy remaining SRC elements. */
1062 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1064 return REG_NOERROR;
1067 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1068 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1070 static reg_errcode_t
1071 re_node_set_init_union (dest, src1, src2)
1072 re_node_set *dest;
1073 const re_node_set *src1, *src2;
1075 int i1, i2, id;
1076 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1078 dest->alloc = src1->nelem + src2->nelem;
1079 dest->elems = re_malloc (int, dest->alloc);
1080 if (BE (dest->elems == NULL, 0))
1081 return REG_ESPACE;
1083 else
1085 if (src1 != NULL && src1->nelem > 0)
1086 return re_node_set_init_copy (dest, src1);
1087 else if (src2 != NULL && src2->nelem > 0)
1088 return re_node_set_init_copy (dest, src2);
1089 else
1090 re_node_set_init_empty (dest);
1091 return REG_NOERROR;
1093 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1095 if (src1->elems[i1] > src2->elems[i2])
1097 dest->elems[id++] = src2->elems[i2++];
1098 continue;
1100 if (src1->elems[i1] == src2->elems[i2])
1101 ++i2;
1102 dest->elems[id++] = src1->elems[i1++];
1104 if (i1 < src1->nelem)
1106 memcpy (dest->elems + id, src1->elems + i1,
1107 (src1->nelem - i1) * sizeof (int));
1108 id += src1->nelem - i1;
1110 else if (i2 < src2->nelem)
1112 memcpy (dest->elems + id, src2->elems + i2,
1113 (src2->nelem - i2) * sizeof (int));
1114 id += src2->nelem - i2;
1116 dest->nelem = id;
1117 return REG_NOERROR;
1120 /* Calculate the union set of the sets DEST and SRC. And store it to
1121 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1123 static reg_errcode_t
1124 re_node_set_merge (dest, src)
1125 re_node_set *dest;
1126 const re_node_set *src;
1128 int is, id, sbase, delta;
1129 if (src == NULL || src->nelem == 0)
1130 return REG_NOERROR;
1131 if (dest->alloc < 2 * src->nelem + dest->nelem)
1133 int new_alloc = 2 * (src->nelem + dest->alloc);
1134 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1135 if (BE (new_buffer == NULL, 0))
1136 return REG_ESPACE;
1137 dest->elems = new_buffer;
1138 dest->alloc = new_alloc;
1141 if (BE (dest->nelem == 0, 0))
1143 dest->nelem = src->nelem;
1144 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1145 return REG_NOERROR;
1148 /* Copy into the top of DEST the items of SRC that are not
1149 found in DEST. Maybe we could binary search in DEST? */
1150 for (sbase = dest->nelem + 2 * src->nelem,
1151 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1153 if (dest->elems[id] == src->elems[is])
1154 is--, id--;
1155 else if (dest->elems[id] < src->elems[is])
1156 dest->elems[--sbase] = src->elems[is--];
1157 else /* if (dest->elems[id] > src->elems[is]) */
1158 --id;
1161 if (is >= 0)
1163 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1164 sbase -= is + 1;
1165 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1168 id = dest->nelem - 1;
1169 is = dest->nelem + 2 * src->nelem - 1;
1170 delta = is - sbase + 1;
1171 if (delta == 0)
1172 return REG_NOERROR;
1174 /* Now copy. When DELTA becomes zero, the remaining
1175 DEST elements are already in place. */
1176 dest->nelem += delta;
1177 for (;;)
1179 if (dest->elems[is] > dest->elems[id])
1181 /* Copy from the top. */
1182 dest->elems[id + delta--] = dest->elems[is--];
1183 if (delta == 0)
1184 break;
1186 else
1188 /* Slide from the bottom. */
1189 dest->elems[id + delta] = dest->elems[id];
1190 if (--id < 0)
1192 /* Copy remaining SRC elements. */
1193 memcpy (dest->elems, dest->elems + sbase,
1194 delta * sizeof (int));
1195 break;
1200 return REG_NOERROR;
1203 /* Insert the new element ELEM to the re_node_set* SET.
1204 SET should not already have ELEM.
1205 return -1 if an error is occured, return 1 otherwise. */
1207 static int
1208 re_node_set_insert (set, elem)
1209 re_node_set *set;
1210 int elem;
1212 int idx;
1213 /* In case the set is empty. */
1214 if (set->alloc == 0)
1216 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1217 return 1;
1218 else
1219 return -1;
1222 if (BE (set->nelem, 0) == 0)
1224 /* We already guaranteed above that set->alloc != 0. */
1225 set->elems[0] = elem;
1226 ++set->nelem;
1227 return 1;
1230 /* Realloc if we need. */
1231 if (set->alloc == set->nelem)
1233 int *new_array;
1234 set->alloc = set->alloc * 2;
1235 new_array = re_realloc (set->elems, int, set->alloc);
1236 if (BE (new_array == NULL, 0))
1237 return -1;
1238 set->elems = new_array;
1241 /* Move the elements which follows the new element. Test the
1242 first element separately to skip a check in the inner loop. */
1243 if (elem < set->elems[0])
1245 idx = 0;
1246 for (idx = set->nelem; idx > 0; idx--)
1247 set->elems[idx] = set->elems[idx - 1];
1249 else
1251 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1252 set->elems[idx] = set->elems[idx - 1];
1255 /* Insert the new element. */
1256 set->elems[idx] = elem;
1257 ++set->nelem;
1258 return 1;
1261 /* Compare two node sets SET1 and SET2.
1262 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1264 static int
1265 re_node_set_compare (set1, set2)
1266 const re_node_set *set1, *set2;
1268 int i;
1269 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1270 return 0;
1271 for (i = set1->nelem ; --i >= 0 ; )
1272 if (set1->elems[i] != set2->elems[i])
1273 return 0;
1274 return 1;
1277 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1279 static int
1280 re_node_set_contains (set, elem)
1281 const re_node_set *set;
1282 int elem;
1284 int idx, right, mid;
1285 if (set->nelem <= 0)
1286 return 0;
1288 /* Binary search the element. */
1289 idx = 0;
1290 right = set->nelem - 1;
1291 while (idx < right)
1293 mid = (idx + right) / 2;
1294 if (set->elems[mid] < elem)
1295 idx = mid + 1;
1296 else
1297 right = mid;
1299 return set->elems[idx] == elem ? idx + 1 : 0;
1302 static void
1303 re_node_set_remove_at (set, idx)
1304 re_node_set *set;
1305 int idx;
1307 if (idx < 0 || idx >= set->nelem)
1308 return;
1309 --set->nelem;
1310 for (; idx < set->nelem; idx++)
1311 set->elems[idx] = set->elems[idx + 1];
1315 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1316 Or return -1, if an error will be occured. */
1318 static int
1319 re_dfa_add_node (dfa, token, mode)
1320 re_dfa_t *dfa;
1321 re_token_t token;
1322 int mode;
1324 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1326 int new_nodes_alloc = dfa->nodes_alloc * 2;
1327 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1328 new_nodes_alloc);
1329 if (BE (new_array == NULL, 0))
1330 return -1;
1331 dfa->nodes = new_array;
1332 if (mode)
1334 int *new_nexts, *new_indices;
1335 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1337 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1338 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1339 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1340 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1341 new_nodes_alloc);
1342 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1343 new_nodes_alloc);
1344 if (BE (new_nexts == NULL || new_indices == NULL
1345 || new_edests == NULL || new_eclosures == NULL
1346 || new_inveclosures == NULL, 0))
1347 return -1;
1348 dfa->nexts = new_nexts;
1349 dfa->org_indices = new_indices;
1350 dfa->edests = new_edests;
1351 dfa->eclosures = new_eclosures;
1352 dfa->inveclosures = new_inveclosures;
1354 dfa->nodes_alloc = new_nodes_alloc;
1356 dfa->nodes[dfa->nodes_len] = token;
1357 dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1358 dfa->nodes[dfa->nodes_len].duplicated = 0;
1359 dfa->nodes[dfa->nodes_len].constraint = 0;
1360 return dfa->nodes_len++;
1363 static unsigned int inline
1364 calc_state_hash (nodes, context)
1365 const re_node_set *nodes;
1366 unsigned int context;
1368 unsigned int hash = nodes->nelem + context;
1369 int i;
1370 for (i = 0 ; i < nodes->nelem ; i++)
1371 hash += nodes->elems[i];
1372 return hash;
1375 /* Search for the state whose node_set is equivalent to NODES.
1376 Return the pointer to the state, if we found it in the DFA.
1377 Otherwise create the new one and return it. In case of an error
1378 return NULL and set the error code in ERR.
1379 Note: - We assume NULL as the invalid state, then it is possible that
1380 return value is NULL and ERR is REG_NOERROR.
1381 - We never return non-NULL value in case of any errors, it is for
1382 optimization. */
1384 static re_dfastate_t*
1385 re_acquire_state (err, dfa, nodes)
1386 reg_errcode_t *err;
1387 re_dfa_t *dfa;
1388 const re_node_set *nodes;
1390 unsigned int hash;
1391 re_dfastate_t *new_state;
1392 struct re_state_table_entry *spot;
1393 int i;
1394 if (BE (nodes->nelem == 0, 0))
1396 *err = REG_NOERROR;
1397 return NULL;
1399 hash = calc_state_hash (nodes, 0);
1400 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1402 for (i = 0 ; i < spot->num ; i++)
1404 re_dfastate_t *state = spot->array[i];
1405 if (hash != state->hash)
1406 continue;
1407 if (re_node_set_compare (&state->nodes, nodes))
1408 return state;
1411 /* There are no appropriate state in the dfa, create the new one. */
1412 new_state = create_ci_newstate (dfa, nodes, hash);
1413 if (BE (new_state != NULL, 1))
1414 return new_state;
1415 else
1417 *err = REG_ESPACE;
1418 return NULL;
1422 /* Search for the state whose node_set is equivalent to NODES and
1423 whose context is equivalent to CONTEXT.
1424 Return the pointer to the state, if we found it in the DFA.
1425 Otherwise create the new one and return it. In case of an error
1426 return NULL and set the error code in ERR.
1427 Note: - We assume NULL as the invalid state, then it is possible that
1428 return value is NULL and ERR is REG_NOERROR.
1429 - We never return non-NULL value in case of any errors, it is for
1430 optimization. */
1432 static re_dfastate_t*
1433 re_acquire_state_context (err, dfa, nodes, context)
1434 reg_errcode_t *err;
1435 re_dfa_t *dfa;
1436 const re_node_set *nodes;
1437 unsigned int context;
1439 unsigned int hash;
1440 re_dfastate_t *new_state;
1441 struct re_state_table_entry *spot;
1442 int i;
1443 if (nodes->nelem == 0)
1445 *err = REG_NOERROR;
1446 return NULL;
1448 hash = calc_state_hash (nodes, context);
1449 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1451 for (i = 0 ; i < spot->num ; i++)
1453 re_dfastate_t *state = spot->array[i];
1454 if (state->hash == hash
1455 && state->context == context
1456 && re_node_set_compare (state->entrance_nodes, nodes))
1457 return state;
1459 /* There are no appropriate state in `dfa', create the new one. */
1460 new_state = create_cd_newstate (dfa, nodes, context, hash);
1461 if (BE (new_state != NULL, 1))
1462 return new_state;
1463 else
1465 *err = REG_ESPACE;
1466 return NULL;
1470 /* Allocate memory for DFA state and initialize common properties.
1471 Return the new state if succeeded, otherwise return NULL. */
1473 static re_dfastate_t *
1474 create_newstate_common (dfa, nodes, hash)
1475 re_dfa_t *dfa;
1476 const re_node_set *nodes;
1477 unsigned int hash;
1479 re_dfastate_t *newstate;
1480 reg_errcode_t err;
1481 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1482 if (BE (newstate == NULL, 0))
1483 return NULL;
1484 err = re_node_set_init_copy (&newstate->nodes, nodes);
1485 if (BE (err != REG_NOERROR, 0))
1487 re_free (newstate);
1488 return NULL;
1490 newstate->trtable = NULL;
1491 newstate->hash = hash;
1492 return newstate;
1495 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1496 position. Return value indicate the error code if failed. */
1498 static reg_errcode_t
1499 register_state (dfa, newstate, hash)
1500 re_dfa_t *dfa;
1501 re_dfastate_t *newstate;
1502 unsigned int hash;
1504 struct re_state_table_entry *spot;
1505 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507 if (BE (spot->alloc <= spot->num, 0))
1509 int new_alloc = 2 * spot->num + 2;
1510 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1511 new_alloc);
1512 if (BE (new_array == NULL, 0))
1513 return REG_ESPACE;
1514 spot->array = new_array;
1515 spot->alloc = new_alloc;
1517 spot->array[spot->num++] = newstate;
1518 return REG_NOERROR;
1521 /* Create the new state which is independ of contexts.
1522 Return the new state if succeeded, otherwise return NULL. */
1524 static re_dfastate_t *
1525 create_ci_newstate (dfa, nodes, hash)
1526 re_dfa_t *dfa;
1527 const re_node_set *nodes;
1528 unsigned int hash;
1530 int i;
1531 reg_errcode_t err;
1532 re_dfastate_t *newstate;
1533 newstate = create_newstate_common (dfa, nodes, hash);
1534 if (BE (newstate == NULL, 0))
1535 return NULL;
1536 newstate->entrance_nodes = &newstate->nodes;
1538 for (i = 0 ; i < nodes->nelem ; i++)
1540 re_token_t *node = dfa->nodes + nodes->elems[i];
1541 re_token_type_t type = node->type;
1542 if (type == CHARACTER && !node->constraint)
1543 continue;
1545 /* If the state has the halt node, the state is a halt state. */
1546 else if (type == END_OF_RE)
1547 newstate->halt = 1;
1548 #ifdef RE_ENABLE_I18N
1549 else if (type == COMPLEX_BRACKET
1550 || type == OP_UTF8_PERIOD
1551 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1552 newstate->accept_mb = 1;
1553 #endif /* RE_ENABLE_I18N */
1554 else if (type == OP_BACK_REF)
1555 newstate->has_backref = 1;
1556 else if (type == ANCHOR || node->constraint)
1557 newstate->has_constraint = 1;
1559 err = register_state (dfa, newstate, hash);
1560 if (BE (err != REG_NOERROR, 0))
1562 free_state (newstate);
1563 newstate = NULL;
1565 return newstate;
1568 /* Create the new state which is depend on the context CONTEXT.
1569 Return the new state if succeeded, otherwise return NULL. */
1571 static re_dfastate_t *
1572 create_cd_newstate (dfa, nodes, context, hash)
1573 re_dfa_t *dfa;
1574 const re_node_set *nodes;
1575 unsigned int context, hash;
1577 int i, nctx_nodes = 0;
1578 reg_errcode_t err;
1579 re_dfastate_t *newstate;
1581 newstate = create_newstate_common (dfa, nodes, hash);
1582 if (BE (newstate == NULL, 0))
1583 return NULL;
1584 newstate->context = context;
1585 newstate->entrance_nodes = &newstate->nodes;
1587 for (i = 0 ; i < nodes->nelem ; i++)
1589 unsigned int constraint = 0;
1590 re_token_t *node = dfa->nodes + nodes->elems[i];
1591 re_token_type_t type = node->type;
1592 if (node->constraint)
1593 constraint = node->constraint;
1595 if (type == CHARACTER && !constraint)
1596 continue;
1597 /* If the state has the halt node, the state is a halt state. */
1598 else if (type == END_OF_RE)
1599 newstate->halt = 1;
1600 #ifdef RE_ENABLE_I18N
1601 else if (type == COMPLEX_BRACKET
1602 || type == OP_UTF8_PERIOD
1603 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1604 newstate->accept_mb = 1;
1605 #endif /* RE_ENABLE_I18N */
1606 else if (type == OP_BACK_REF)
1607 newstate->has_backref = 1;
1608 else if (type == ANCHOR)
1609 constraint = node->opr.ctx_type;
1611 if (constraint)
1613 if (newstate->entrance_nodes == &newstate->nodes)
1615 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1616 if (BE (newstate->entrance_nodes == NULL, 0))
1618 free_state (newstate);
1619 return NULL;
1621 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1622 nctx_nodes = 0;
1623 newstate->has_constraint = 1;
1626 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1628 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1629 ++nctx_nodes;
1633 err = register_state (dfa, newstate, hash);
1634 if (BE (err != REG_NOERROR, 0))
1636 free_state (newstate);
1637 newstate = NULL;
1639 return newstate;
1642 static void
1643 free_state (state)
1644 re_dfastate_t *state;
1646 if (state->entrance_nodes != &state->nodes)
1648 re_node_set_free (state->entrance_nodes);
1649 re_free (state->entrance_nodes);
1651 re_node_set_free (&state->nodes);
1652 re_free (state->trtable);
1653 re_free (state);