(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[glibc.git] / posix / regex_internal.c
blob001b50b13482a8f0e173a5548fb48fc5ed2e414d
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 #ifdef RE_ENABLE_I18N
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 const re_node_set *nodes,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int context,
37 unsigned int hash) internal_function;
38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 unsigned int context) internal_function;
41 /* Functions for string operation. */
43 /* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
46 static reg_errcode_t
47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
48 re_string_t *pstr;
49 const char *str;
50 int len, init_len, icase;
51 RE_TRANSLATE_TYPE trans;
52 const re_dfa_t *dfa;
54 reg_errcode_t ret;
55 int init_buf_len;
57 /* Ensure at least one character fits into the buffers. */
58 if (init_len < dfa->mb_cur_max)
59 init_len = dfa->mb_cur_max;
60 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61 re_string_construct_common (str, len, pstr, trans, icase, dfa);
63 ret = re_string_realloc_buffers (pstr, init_buf_len);
64 if (BE (ret != REG_NOERROR, 0))
65 return ret;
67 pstr->word_char = dfa->word_char;
68 pstr->word_ops_used = dfa->word_ops_used;
69 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71 pstr->valid_raw_len = pstr->valid_len;
72 return REG_NOERROR;
75 /* This function allocate the buffers, and initialize them. */
77 static reg_errcode_t
78 re_string_construct (pstr, str, len, trans, icase, dfa)
79 re_string_t *pstr;
80 const char *str;
81 int len, icase;
82 RE_TRANSLATE_TYPE trans;
83 const re_dfa_t *dfa;
85 reg_errcode_t ret;
86 memset (pstr, '\0', sizeof (re_string_t));
87 re_string_construct_common (str, len, pstr, trans, icase, dfa);
89 if (len > 0)
91 ret = re_string_realloc_buffers (pstr, len + 1);
92 if (BE (ret != REG_NOERROR, 0))
93 return ret;
95 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97 if (icase)
99 #ifdef RE_ENABLE_I18N
100 if (dfa->mb_cur_max > 1)
102 while (1)
104 ret = build_wcs_upper_buffer (pstr);
105 if (BE (ret != REG_NOERROR, 0))
106 return ret;
107 if (pstr->valid_raw_len >= len)
108 break;
109 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110 break;
111 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112 if (BE (ret != REG_NOERROR, 0))
113 return ret;
116 else
117 #endif /* RE_ENABLE_I18N */
118 build_upper_buffer (pstr);
120 else
122 #ifdef RE_ENABLE_I18N
123 if (dfa->mb_cur_max > 1)
124 build_wcs_buffer (pstr);
125 else
126 #endif /* RE_ENABLE_I18N */
128 if (trans != NULL)
129 re_string_translate_buffer (pstr);
130 else
132 pstr->valid_len = pstr->bufs_len;
133 pstr->valid_raw_len = pstr->bufs_len;
138 return REG_NOERROR;
141 /* Helper functions for re_string_allocate, and re_string_construct. */
143 static reg_errcode_t
144 re_string_realloc_buffers (pstr, new_buf_len)
145 re_string_t *pstr;
146 int new_buf_len;
148 #ifdef RE_ENABLE_I18N
149 if (pstr->mb_cur_max > 1)
151 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
152 if (BE (new_array == NULL, 0))
153 return REG_ESPACE;
154 pstr->wcs = new_array;
155 if (pstr->offsets != NULL)
157 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
158 if (BE (new_array == NULL, 0))
159 return REG_ESPACE;
160 pstr->offsets = new_array;
163 #endif /* RE_ENABLE_I18N */
164 if (pstr->mbs_allocated)
166 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
167 new_buf_len);
168 if (BE (new_array == NULL, 0))
169 return REG_ESPACE;
170 pstr->mbs = new_array;
172 pstr->bufs_len = new_buf_len;
173 return REG_NOERROR;
177 static void
178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
179 const char *str;
180 int len;
181 re_string_t *pstr;
182 RE_TRANSLATE_TYPE trans;
183 int icase;
184 const re_dfa_t *dfa;
186 pstr->raw_mbs = (const unsigned char *) str;
187 pstr->len = len;
188 pstr->raw_len = len;
189 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
190 pstr->icase = icase ? 1 : 0;
191 pstr->mbs_allocated = (trans != NULL || icase);
192 pstr->mb_cur_max = dfa->mb_cur_max;
193 pstr->is_utf8 = dfa->is_utf8;
194 pstr->map_notascii = dfa->map_notascii;
195 pstr->stop = pstr->len;
196 pstr->raw_stop = pstr->stop;
199 #ifdef RE_ENABLE_I18N
201 /* Build wide character buffer PSTR->WCS.
202 If the byte sequence of the string are:
203 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204 Then wide character buffer will be:
205 <wc1> , WEOF , <wc2> , WEOF , <wc3>
206 We use WEOF for padding, they indicate that the position isn't
207 a first byte of a multibyte character.
209 Note that this function assumes PSTR->VALID_LEN elements are already
210 built and starts from PSTR->VALID_LEN. */
212 static void
213 build_wcs_buffer (pstr)
214 re_string_t *pstr;
216 #ifdef _LIBC
217 unsigned char buf[pstr->mb_cur_max];
218 #else
219 unsigned char buf[64];
220 #endif
221 mbstate_t prev_st;
222 int byte_idx, end_idx, mbclen, remain_len;
224 /* Build the buffers from pstr->valid_len to either pstr->len or
225 pstr->bufs_len. */
226 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
227 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229 wchar_t wc;
230 const char *p;
232 remain_len = end_idx - byte_idx;
233 prev_st = pstr->cur_state;
234 /* Apply the translation if we need. */
235 if (BE (pstr->trans != NULL, 0))
237 int i, ch;
239 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
242 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244 p = (const char *) buf;
246 else
247 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
248 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
249 if (BE (mbclen == (size_t) -2, 0))
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr->cur_state = prev_st;
253 break;
255 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
257 /* We treat these cases as a singlebyte character. */
258 mbclen = 1;
259 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
260 if (BE (pstr->trans != NULL, 0))
261 wc = pstr->trans[wc];
262 pstr->cur_state = prev_st;
265 /* Write wide character and padding. */
266 pstr->wcs[byte_idx++] = wc;
267 /* Write paddings. */
268 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
269 pstr->wcs[byte_idx++] = WEOF;
271 pstr->valid_len = byte_idx;
272 pstr->valid_raw_len = byte_idx;
275 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
276 but for REG_ICASE. */
278 static int
279 build_wcs_upper_buffer (pstr)
280 re_string_t *pstr;
282 mbstate_t prev_st;
283 int src_idx, byte_idx, end_idx, mbclen, remain_len;
284 #ifdef _LIBC
285 unsigned char buf[pstr->mb_cur_max];
286 #else
287 unsigned char buf[64];
288 #endif
290 byte_idx = pstr->valid_len;
291 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293 /* The following optimization assumes that ASCII characters can be
294 mapped to wide characters with a simple cast. */
295 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297 while (byte_idx < end_idx)
299 wchar_t wc;
301 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
302 && mbsinit (&pstr->cur_state))
304 /* In case of a singlebyte character. */
305 pstr->mbs[byte_idx]
306 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
307 /* The next step uses the assumption that wchar_t is encoded
308 ASCII-safe: all ASCII values can be converted like this. */
309 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
310 ++byte_idx;
311 continue;
314 remain_len = end_idx - byte_idx;
315 prev_st = pstr->cur_state;
316 mbclen = mbrtowc (&wc,
317 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
318 + byte_idx), remain_len, &pstr->cur_state);
319 if (BE (mbclen > 0, 1))
321 wchar_t wcu = wc;
322 if (iswlower (wc))
324 int mbcdlen;
326 wcu = towupper (wc);
327 mbcdlen = wcrtomb (buf, wcu, &prev_st);
328 if (BE (mbclen == mbcdlen, 1))
329 memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 else
332 src_idx = byte_idx;
333 goto offsets_needed;
336 else
337 memcpy (pstr->mbs + byte_idx,
338 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
339 pstr->wcs[byte_idx++] = wcu;
340 /* Write paddings. */
341 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
342 pstr->wcs[byte_idx++] = WEOF;
344 else if (mbclen == (size_t) -1 || mbclen == 0)
346 /* It is an invalid character or '\0'. Just use the byte. */
347 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
348 pstr->mbs[byte_idx] = ch;
349 /* And also cast it to wide char. */
350 pstr->wcs[byte_idx++] = (wchar_t) ch;
351 if (BE (mbclen == (size_t) -1, 0))
352 pstr->cur_state = prev_st;
354 else
356 /* The buffer doesn't have enough space, finish to build. */
357 pstr->cur_state = prev_st;
358 break;
361 pstr->valid_len = byte_idx;
362 pstr->valid_raw_len = byte_idx;
363 return REG_NOERROR;
365 else
366 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
368 wchar_t wc;
369 const char *p;
370 offsets_needed:
371 remain_len = end_idx - byte_idx;
372 prev_st = pstr->cur_state;
373 if (BE (pstr->trans != NULL, 0))
375 int i, ch;
377 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
379 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
380 buf[i] = pstr->trans[ch];
382 p = (const char *) buf;
384 else
385 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
386 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
387 if (BE (mbclen > 0, 1))
389 wchar_t wcu = wc;
390 if (iswlower (wc))
392 int mbcdlen;
394 wcu = towupper (wc);
395 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
396 if (BE (mbclen == mbcdlen, 1))
397 memcpy (pstr->mbs + byte_idx, buf, mbclen);
398 else
400 int i;
402 if (byte_idx + mbcdlen > pstr->bufs_len)
404 pstr->cur_state = prev_st;
405 break;
408 if (pstr->offsets == NULL)
410 pstr->offsets = re_malloc (int, pstr->bufs_len);
412 if (pstr->offsets == NULL)
413 return REG_ESPACE;
415 if (!pstr->offsets_needed)
417 for (i = 0; i < byte_idx; ++i)
418 pstr->offsets[i] = i;
419 pstr->offsets_needed = 1;
422 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
423 pstr->wcs[byte_idx] = wcu;
424 pstr->offsets[byte_idx] = src_idx;
425 for (i = 1; i < mbcdlen; ++i)
427 pstr->offsets[byte_idx + i]
428 = src_idx + (i < mbclen ? i : mbclen - 1);
429 pstr->wcs[byte_idx + i] = WEOF;
431 pstr->len += mbcdlen - mbclen;
432 if (pstr->raw_stop > src_idx)
433 pstr->stop += mbcdlen - mbclen;
434 end_idx = (pstr->bufs_len > pstr->len)
435 ? pstr->len : pstr->bufs_len;
436 byte_idx += mbcdlen;
437 src_idx += mbclen;
438 continue;
441 else
442 memcpy (pstr->mbs + byte_idx, p, mbclen);
444 if (BE (pstr->offsets_needed != 0, 0))
446 int i;
447 for (i = 0; i < mbclen; ++i)
448 pstr->offsets[byte_idx + i] = src_idx + i;
450 src_idx += mbclen;
452 pstr->wcs[byte_idx++] = wcu;
453 /* Write paddings. */
454 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
455 pstr->wcs[byte_idx++] = WEOF;
457 else if (mbclen == (size_t) -1 || mbclen == 0)
459 /* It is an invalid character or '\0'. Just use the byte. */
460 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
462 if (BE (pstr->trans != NULL, 0))
463 ch = pstr->trans [ch];
464 pstr->mbs[byte_idx] = ch;
466 if (BE (pstr->offsets_needed != 0, 0))
467 pstr->offsets[byte_idx] = src_idx;
468 ++src_idx;
470 /* And also cast it to wide char. */
471 pstr->wcs[byte_idx++] = (wchar_t) ch;
472 if (BE (mbclen == (size_t) -1, 0))
473 pstr->cur_state = prev_st;
475 else
477 /* The buffer doesn't have enough space, finish to build. */
478 pstr->cur_state = prev_st;
479 break;
482 pstr->valid_len = byte_idx;
483 pstr->valid_raw_len = src_idx;
484 return REG_NOERROR;
487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
488 Return the index. */
490 static int
491 re_string_skip_chars (pstr, new_raw_idx, last_wc)
492 re_string_t *pstr;
493 int new_raw_idx;
494 wint_t *last_wc;
496 mbstate_t prev_st;
497 int rawbuf_idx, mbclen;
498 wchar_t wc = 0;
500 /* Skip the characters which are not necessary to check. */
501 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
502 rawbuf_idx < new_raw_idx;)
504 int remain_len;
505 remain_len = pstr->len - rawbuf_idx;
506 prev_st = pstr->cur_state;
507 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
508 remain_len, &pstr->cur_state);
509 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
511 /* We treat these cases as a singlebyte character. */
512 mbclen = 1;
513 pstr->cur_state = prev_st;
515 /* Then proceed the next character. */
516 rawbuf_idx += mbclen;
518 *last_wc = (wint_t) wc;
519 return rawbuf_idx;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
526 static void
527 build_upper_buffer (pstr)
528 re_string_t *pstr;
530 int char_idx, end_idx;
531 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
536 if (BE (pstr->trans != NULL, 0))
537 ch = pstr->trans[ch];
538 if (islower (ch))
539 pstr->mbs[char_idx] = toupper (ch);
540 else
541 pstr->mbs[char_idx] = ch;
543 pstr->valid_len = char_idx;
544 pstr->valid_raw_len = char_idx;
547 /* Apply TRANS to the buffer in PSTR. */
549 static void
550 re_string_translate_buffer (pstr)
551 re_string_t *pstr;
553 int buf_idx, end_idx;
554 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
559 pstr->mbs[buf_idx] = pstr->trans[ch];
562 pstr->valid_len = buf_idx;
563 pstr->valid_raw_len = buf_idx;
566 /* This function re-construct the buffers.
567 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
568 convert to upper case in case of REG_ICASE, apply translation. */
570 static reg_errcode_t
571 re_string_reconstruct (pstr, idx, eflags)
572 re_string_t *pstr;
573 int idx, eflags;
575 int offset = idx - pstr->raw_mbs_idx;
576 if (BE (offset < 0, 0))
578 /* Reset buffer. */
579 #ifdef RE_ENABLE_I18N
580 if (pstr->mb_cur_max > 1)
581 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
582 #endif /* RE_ENABLE_I18N */
583 pstr->len = pstr->raw_len;
584 pstr->stop = pstr->raw_stop;
585 pstr->valid_len = 0;
586 pstr->raw_mbs_idx = 0;
587 pstr->valid_raw_len = 0;
588 pstr->offsets_needed = 0;
589 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
590 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
591 if (!pstr->mbs_allocated)
592 pstr->mbs = (unsigned char *) pstr->raw_mbs;
593 offset = idx;
596 if (BE (offset != 0, 1))
598 /* Are the characters which are already checked remain? */
599 if (BE (offset < pstr->valid_raw_len, 1)
600 #ifdef RE_ENABLE_I18N
601 /* Handling this would enlarge the code too much.
602 Accept a slowdown in that case. */
603 && pstr->offsets_needed == 0
604 #endif
607 /* Yes, move them to the front of the buffer. */
608 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
609 #ifdef RE_ENABLE_I18N
610 if (pstr->mb_cur_max > 1)
611 memmove (pstr->wcs, pstr->wcs + offset,
612 (pstr->valid_len - offset) * sizeof (wint_t));
613 #endif /* RE_ENABLE_I18N */
614 if (BE (pstr->mbs_allocated, 0))
615 memmove (pstr->mbs, pstr->mbs + offset,
616 pstr->valid_len - offset);
617 pstr->valid_len -= offset;
618 pstr->valid_raw_len -= offset;
619 #if DEBUG
620 assert (pstr->valid_len > 0);
621 #endif
623 else
625 /* No, skip all characters until IDX. */
626 #ifdef RE_ENABLE_I18N
627 if (BE (pstr->offsets_needed, 0))
629 pstr->len = pstr->raw_len - idx + offset;
630 pstr->stop = pstr->raw_stop - idx + offset;
631 pstr->offsets_needed = 0;
633 #endif
634 pstr->valid_len = 0;
635 pstr->valid_raw_len = 0;
636 #ifdef RE_ENABLE_I18N
637 if (pstr->mb_cur_max > 1)
639 int wcs_idx;
640 wint_t wc = WEOF;
642 if (pstr->is_utf8)
644 const unsigned char *raw, *p, *q, *end;
646 /* Special case UTF-8. Multi-byte chars start with any
647 byte other than 0x80 - 0xbf. */
648 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
649 end = raw + (offset - pstr->mb_cur_max);
650 for (p = raw + offset - 1; p >= end; --p)
651 if ((*p & 0xc0) != 0x80)
653 mbstate_t cur_state;
654 wchar_t wc2;
655 int mlen = raw + pstr->len - p;
656 unsigned char buf[6];
658 q = p;
659 if (BE (pstr->trans != NULL, 0))
661 int i = mlen < 6 ? mlen : 6;
662 while (--i >= 0)
663 buf[i] = pstr->trans[p[i]];
664 q = buf;
666 /* XXX Don't use mbrtowc, we know which conversion
667 to use (UTF-8 -> UCS4). */
668 memset (&cur_state, 0, sizeof (cur_state));
669 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
670 - (raw + offset - p);
671 if (mlen >= 0)
673 memset (&pstr->cur_state, '\0',
674 sizeof (mbstate_t));
675 pstr->valid_len = mlen;
676 wc = wc2;
678 break;
682 if (wc == WEOF)
683 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
684 if (BE (pstr->valid_len, 0))
686 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
687 pstr->wcs[wcs_idx] = WEOF;
688 if (pstr->mbs_allocated)
689 memset (pstr->mbs, 255, pstr->valid_len);
691 pstr->valid_raw_len = pstr->valid_len;
692 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
693 && IS_WIDE_WORD_CHAR (wc))
694 ? CONTEXT_WORD
695 : ((IS_WIDE_NEWLINE (wc)
696 && pstr->newline_anchor)
697 ? CONTEXT_NEWLINE : 0));
699 else
700 #endif /* RE_ENABLE_I18N */
702 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
703 if (pstr->trans)
704 c = pstr->trans[c];
705 pstr->tip_context = (bitset_contain (pstr->word_char, c)
706 ? CONTEXT_WORD
707 : ((IS_NEWLINE (c) && pstr->newline_anchor)
708 ? CONTEXT_NEWLINE : 0));
711 if (!BE (pstr->mbs_allocated, 0))
712 pstr->mbs += offset;
714 pstr->raw_mbs_idx = idx;
715 pstr->len -= offset;
716 pstr->stop -= offset;
718 /* Then build the buffers. */
719 #ifdef RE_ENABLE_I18N
720 if (pstr->mb_cur_max > 1)
722 if (pstr->icase)
724 int ret = build_wcs_upper_buffer (pstr);
725 if (BE (ret != REG_NOERROR, 0))
726 return ret;
728 else
729 build_wcs_buffer (pstr);
731 else
732 #endif /* RE_ENABLE_I18N */
733 if (BE (pstr->mbs_allocated, 0))
735 if (pstr->icase)
736 build_upper_buffer (pstr);
737 else if (pstr->trans != NULL)
738 re_string_translate_buffer (pstr);
740 else
741 pstr->valid_len = pstr->len;
743 pstr->cur_idx = 0;
744 return REG_NOERROR;
747 static unsigned char
748 re_string_peek_byte_case (pstr, idx)
749 const re_string_t *pstr;
750 int idx;
752 int ch, off;
754 /* Handle the common (easiest) cases first. */
755 if (BE (!pstr->mbs_allocated, 1))
756 return re_string_peek_byte (pstr, idx);
758 #ifdef RE_ENABLE_I18N
759 if (pstr->mb_cur_max > 1
760 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
761 return re_string_peek_byte (pstr, idx);
762 #endif
764 off = pstr->cur_idx + idx;
765 #ifdef RE_ENABLE_I18N
766 if (pstr->offsets_needed)
767 off = pstr->offsets[off];
768 #endif
770 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
772 #ifdef RE_ENABLE_I18N
773 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
774 this function returns CAPITAL LETTER I instead of first byte of
775 DOTLESS SMALL LETTER I. The latter would confuse the parser,
776 since peek_byte_case doesn't advance cur_idx in any way. */
777 if (pstr->offsets_needed && !isascii (ch))
778 return re_string_peek_byte (pstr, idx);
779 #endif
781 return ch;
784 static unsigned char
785 re_string_fetch_byte_case (pstr)
786 re_string_t *pstr;
788 if (BE (!pstr->mbs_allocated, 1))
789 return re_string_fetch_byte (pstr);
791 #ifdef RE_ENABLE_I18N
792 if (pstr->offsets_needed)
794 int off, ch;
796 /* For tr_TR.UTF-8 [[:islower:]] there is
797 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
798 in that case the whole multi-byte character and return
799 the original letter. On the other side, with
800 [[: DOTLESS SMALL LETTER I return [[:I, as doing
801 anything else would complicate things too much. */
803 if (!re_string_first_byte (pstr, pstr->cur_idx))
804 return re_string_fetch_byte (pstr);
806 off = pstr->offsets[pstr->cur_idx];
807 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
809 if (! isascii (ch))
810 return re_string_fetch_byte (pstr);
812 re_string_skip_bytes (pstr,
813 re_string_char_size_at (pstr, pstr->cur_idx));
814 return ch;
816 #endif
818 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
821 static void
822 re_string_destruct (pstr)
823 re_string_t *pstr;
825 #ifdef RE_ENABLE_I18N
826 re_free (pstr->wcs);
827 re_free (pstr->offsets);
828 #endif /* RE_ENABLE_I18N */
829 if (pstr->mbs_allocated)
830 re_free (pstr->mbs);
833 /* Return the context at IDX in INPUT. */
835 static unsigned int
836 re_string_context_at (input, idx, eflags)
837 const re_string_t *input;
838 int idx, eflags;
840 int c;
841 if (BE (idx < 0, 0))
842 /* In this case, we use the value stored in input->tip_context,
843 since we can't know the character in input->mbs[-1] here. */
844 return input->tip_context;
845 if (BE (idx == input->len, 0))
846 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
847 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
848 #ifdef RE_ENABLE_I18N
849 if (input->mb_cur_max > 1)
851 wint_t wc;
852 int wc_idx = idx;
853 while(input->wcs[wc_idx] == WEOF)
855 #ifdef DEBUG
856 /* It must not happen. */
857 assert (wc_idx >= 0);
858 #endif
859 --wc_idx;
860 if (wc_idx < 0)
861 return input->tip_context;
863 wc = input->wcs[wc_idx];
864 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
865 return CONTEXT_WORD;
866 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
867 ? CONTEXT_NEWLINE : 0);
869 else
870 #endif
872 c = re_string_byte_at (input, idx);
873 if (bitset_contain (input->word_char, c))
874 return CONTEXT_WORD;
875 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
879 /* Functions for set operation. */
881 static reg_errcode_t
882 re_node_set_alloc (set, size)
883 re_node_set *set;
884 int size;
886 set->alloc = size;
887 set->nelem = 0;
888 set->elems = re_malloc (int, size);
889 if (BE (set->elems == NULL, 0))
890 return REG_ESPACE;
891 return REG_NOERROR;
894 static reg_errcode_t
895 re_node_set_init_1 (set, elem)
896 re_node_set *set;
897 int elem;
899 set->alloc = 1;
900 set->nelem = 1;
901 set->elems = re_malloc (int, 1);
902 if (BE (set->elems == NULL, 0))
904 set->alloc = set->nelem = 0;
905 return REG_ESPACE;
907 set->elems[0] = elem;
908 return REG_NOERROR;
911 static reg_errcode_t
912 re_node_set_init_2 (set, elem1, elem2)
913 re_node_set *set;
914 int elem1, elem2;
916 set->alloc = 2;
917 set->elems = re_malloc (int, 2);
918 if (BE (set->elems == NULL, 0))
919 return REG_ESPACE;
920 if (elem1 == elem2)
922 set->nelem = 1;
923 set->elems[0] = elem1;
925 else
927 set->nelem = 2;
928 if (elem1 < elem2)
930 set->elems[0] = elem1;
931 set->elems[1] = elem2;
933 else
935 set->elems[0] = elem2;
936 set->elems[1] = elem1;
939 return REG_NOERROR;
942 static reg_errcode_t
943 re_node_set_init_copy (dest, src)
944 re_node_set *dest;
945 const re_node_set *src;
947 dest->nelem = src->nelem;
948 if (src->nelem > 0)
950 dest->alloc = dest->nelem;
951 dest->elems = re_malloc (int, dest->alloc);
952 if (BE (dest->elems == NULL, 0))
954 dest->alloc = dest->nelem = 0;
955 return REG_ESPACE;
957 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
959 else
960 re_node_set_init_empty (dest);
961 return REG_NOERROR;
964 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
965 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
966 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
968 static reg_errcode_t
969 re_node_set_add_intersect (dest, src1, src2)
970 re_node_set *dest;
971 const re_node_set *src1, *src2;
973 int i1, i2, is, id, delta, sbase;
974 if (src1->nelem == 0 || src2->nelem == 0)
975 return REG_NOERROR;
977 /* We need dest->nelem + 2 * elems_in_intersection; this is a
978 conservative estimate. */
979 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
981 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
982 int *new_elems = re_realloc (dest->elems, int, new_alloc);
983 if (BE (new_elems == NULL, 0))
984 return REG_ESPACE;
985 dest->elems = new_elems;
986 dest->alloc = new_alloc;
989 /* Find the items in the intersection of SRC1 and SRC2, and copy
990 into the top of DEST those that are not already in DEST itself. */
991 sbase = dest->nelem + src1->nelem + src2->nelem;
992 i1 = src1->nelem - 1;
993 i2 = src2->nelem - 1;
994 id = dest->nelem - 1;
995 for (;;)
997 if (src1->elems[i1] == src2->elems[i2])
999 /* Try to find the item in DEST. Maybe we could binary search? */
1000 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1001 --id;
1003 if (id < 0 || dest->elems[id] != src1->elems[i1])
1004 dest->elems[--sbase] = src1->elems[i1];
1006 if (--i1 < 0 || --i2 < 0)
1007 break;
1010 /* Lower the highest of the two items. */
1011 else if (src1->elems[i1] < src2->elems[i2])
1013 if (--i2 < 0)
1014 break;
1016 else
1018 if (--i1 < 0)
1019 break;
1023 id = dest->nelem - 1;
1024 is = dest->nelem + src1->nelem + src2->nelem - 1;
1025 delta = is - sbase + 1;
1027 /* Now copy. When DELTA becomes zero, the remaining
1028 DEST elements are already in place; this is more or
1029 less the same loop that is in re_node_set_merge. */
1030 dest->nelem += delta;
1031 if (delta > 0 && id >= 0)
1032 for (;;)
1034 if (dest->elems[is] > dest->elems[id])
1036 /* Copy from the top. */
1037 dest->elems[id + delta--] = dest->elems[is--];
1038 if (delta == 0)
1039 break;
1041 else
1043 /* Slide from the bottom. */
1044 dest->elems[id + delta] = dest->elems[id];
1045 if (--id < 0)
1046 break;
1050 /* Copy remaining SRC elements. */
1051 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1053 return REG_NOERROR;
1056 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1057 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1059 static reg_errcode_t
1060 re_node_set_init_union (dest, src1, src2)
1061 re_node_set *dest;
1062 const re_node_set *src1, *src2;
1064 int i1, i2, id;
1065 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1067 dest->alloc = src1->nelem + src2->nelem;
1068 dest->elems = re_malloc (int, dest->alloc);
1069 if (BE (dest->elems == NULL, 0))
1070 return REG_ESPACE;
1072 else
1074 if (src1 != NULL && src1->nelem > 0)
1075 return re_node_set_init_copy (dest, src1);
1076 else if (src2 != NULL && src2->nelem > 0)
1077 return re_node_set_init_copy (dest, src2);
1078 else
1079 re_node_set_init_empty (dest);
1080 return REG_NOERROR;
1082 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1084 if (src1->elems[i1] > src2->elems[i2])
1086 dest->elems[id++] = src2->elems[i2++];
1087 continue;
1089 if (src1->elems[i1] == src2->elems[i2])
1090 ++i2;
1091 dest->elems[id++] = src1->elems[i1++];
1093 if (i1 < src1->nelem)
1095 memcpy (dest->elems + id, src1->elems + i1,
1096 (src1->nelem - i1) * sizeof (int));
1097 id += src1->nelem - i1;
1099 else if (i2 < src2->nelem)
1101 memcpy (dest->elems + id, src2->elems + i2,
1102 (src2->nelem - i2) * sizeof (int));
1103 id += src2->nelem - i2;
1105 dest->nelem = id;
1106 return REG_NOERROR;
1109 /* Calculate the union set of the sets DEST and SRC. And store it to
1110 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1112 static reg_errcode_t
1113 re_node_set_merge (dest, src)
1114 re_node_set *dest;
1115 const re_node_set *src;
1117 int is, id, sbase, delta;
1118 if (src == NULL || src->nelem == 0)
1119 return REG_NOERROR;
1120 if (dest->alloc < 2 * src->nelem + dest->nelem)
1122 int new_alloc = 2 * (src->nelem + dest->alloc);
1123 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1124 if (BE (new_buffer == NULL, 0))
1125 return REG_ESPACE;
1126 dest->elems = new_buffer;
1127 dest->alloc = new_alloc;
1130 if (BE (dest->nelem == 0, 0))
1132 dest->nelem = src->nelem;
1133 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1134 return REG_NOERROR;
1137 /* Copy into the top of DEST the items of SRC that are not
1138 found in DEST. Maybe we could binary search in DEST? */
1139 for (sbase = dest->nelem + 2 * src->nelem,
1140 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1142 if (dest->elems[id] == src->elems[is])
1143 is--, id--;
1144 else if (dest->elems[id] < src->elems[is])
1145 dest->elems[--sbase] = src->elems[is--];
1146 else /* if (dest->elems[id] > src->elems[is]) */
1147 --id;
1150 if (is >= 0)
1152 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1153 sbase -= is + 1;
1154 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1157 id = dest->nelem - 1;
1158 is = dest->nelem + 2 * src->nelem - 1;
1159 delta = is - sbase + 1;
1160 if (delta == 0)
1161 return REG_NOERROR;
1163 /* Now copy. When DELTA becomes zero, the remaining
1164 DEST elements are already in place. */
1165 dest->nelem += delta;
1166 for (;;)
1168 if (dest->elems[is] > dest->elems[id])
1170 /* Copy from the top. */
1171 dest->elems[id + delta--] = dest->elems[is--];
1172 if (delta == 0)
1173 break;
1175 else
1177 /* Slide from the bottom. */
1178 dest->elems[id + delta] = dest->elems[id];
1179 if (--id < 0)
1181 /* Copy remaining SRC elements. */
1182 memcpy (dest->elems, dest->elems + sbase,
1183 delta * sizeof (int));
1184 break;
1189 return REG_NOERROR;
1192 /* Insert the new element ELEM to the re_node_set* SET.
1193 SET should not already have ELEM.
1194 return -1 if an error is occured, return 1 otherwise. */
1196 static int
1197 re_node_set_insert (set, elem)
1198 re_node_set *set;
1199 int elem;
1201 int idx;
1202 /* In case the set is empty. */
1203 if (set->alloc == 0)
1205 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1206 return 1;
1207 else
1208 return -1;
1211 if (BE (set->nelem, 0) == 0)
1213 /* We already guaranteed above that set->alloc != 0. */
1214 set->elems[0] = elem;
1215 ++set->nelem;
1216 return 1;
1219 /* Realloc if we need. */
1220 if (set->alloc == set->nelem)
1222 int *new_array;
1223 set->alloc = set->alloc * 2;
1224 new_array = re_realloc (set->elems, int, set->alloc);
1225 if (BE (new_array == NULL, 0))
1226 return -1;
1227 set->elems = new_array;
1230 /* Move the elements which follows the new element. Test the
1231 first element separately to skip a check in the inner loop. */
1232 if (elem < set->elems[0])
1234 idx = 0;
1235 for (idx = set->nelem; idx > 0; idx--)
1236 set->elems[idx] = set->elems[idx - 1];
1238 else
1240 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1241 set->elems[idx] = set->elems[idx - 1];
1244 /* Insert the new element. */
1245 set->elems[idx] = elem;
1246 ++set->nelem;
1247 return 1;
1250 /* Insert the new element ELEM to the re_node_set* SET.
1251 SET should not already have any element greater than or equal to ELEM.
1252 Return -1 if an error is occured, return 1 otherwise. */
1254 static int
1255 re_node_set_insert_last (set, elem)
1256 re_node_set *set;
1257 int elem;
1259 /* Realloc if we need. */
1260 if (set->alloc == set->nelem)
1262 int *new_array;
1263 set->alloc = (set->alloc + 1) * 2;
1264 new_array = re_realloc (set->elems, int, set->alloc);
1265 if (BE (new_array == NULL, 0))
1266 return -1;
1267 set->elems = new_array;
1270 /* Insert the new element. */
1271 set->elems[set->nelem++] = elem;
1272 return 1;
1275 /* Compare two node sets SET1 and SET2.
1276 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1278 static int
1279 re_node_set_compare (set1, set2)
1280 const re_node_set *set1, *set2;
1282 int i;
1283 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1284 return 0;
1285 for (i = set1->nelem ; --i >= 0 ; )
1286 if (set1->elems[i] != set2->elems[i])
1287 return 0;
1288 return 1;
1291 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1293 static int
1294 re_node_set_contains (set, elem)
1295 const re_node_set *set;
1296 int elem;
1298 unsigned int idx, right, mid;
1299 if (set->nelem <= 0)
1300 return 0;
1302 /* Binary search the element. */
1303 idx = 0;
1304 right = set->nelem - 1;
1305 while (idx < right)
1307 mid = (idx + right) / 2;
1308 if (set->elems[mid] < elem)
1309 idx = mid + 1;
1310 else
1311 right = mid;
1313 return set->elems[idx] == elem ? idx + 1 : 0;
1316 static void
1317 re_node_set_remove_at (set, idx)
1318 re_node_set *set;
1319 int idx;
1321 if (idx < 0 || idx >= set->nelem)
1322 return;
1323 --set->nelem;
1324 for (; idx < set->nelem; idx++)
1325 set->elems[idx] = set->elems[idx + 1];
1329 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1330 Or return -1, if an error will be occured. */
1332 static int
1333 re_dfa_add_node (dfa, token, mode)
1334 re_dfa_t *dfa;
1335 re_token_t token;
1336 int mode;
1338 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1340 int new_nodes_alloc = dfa->nodes_alloc * 2;
1341 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1342 new_nodes_alloc);
1343 if (BE (new_array == NULL, 0))
1344 return -1;
1345 dfa->nodes = new_array;
1346 if (mode)
1348 int *new_nexts, *new_indices;
1349 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1351 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1352 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1353 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1354 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1355 new_nodes_alloc);
1356 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1357 new_nodes_alloc);
1358 if (BE (new_nexts == NULL || new_indices == NULL
1359 || new_edests == NULL || new_eclosures == NULL
1360 || new_inveclosures == NULL, 0))
1361 return -1;
1362 dfa->nexts = new_nexts;
1363 dfa->org_indices = new_indices;
1364 dfa->edests = new_edests;
1365 dfa->eclosures = new_eclosures;
1366 dfa->inveclosures = new_inveclosures;
1368 dfa->nodes_alloc = new_nodes_alloc;
1370 dfa->nodes[dfa->nodes_len] = token;
1371 dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1372 dfa->nodes[dfa->nodes_len].duplicated = 0;
1373 dfa->nodes[dfa->nodes_len].constraint = 0;
1374 return dfa->nodes_len++;
1377 static unsigned int inline
1378 calc_state_hash (nodes, context)
1379 const re_node_set *nodes;
1380 unsigned int context;
1382 unsigned int hash = nodes->nelem + context;
1383 int i;
1384 for (i = 0 ; i < nodes->nelem ; i++)
1385 hash += nodes->elems[i];
1386 return hash;
1389 /* Search for the state whose node_set is equivalent to NODES.
1390 Return the pointer to the state, if we found it in the DFA.
1391 Otherwise create the new one and return it. In case of an error
1392 return NULL and set the error code in ERR.
1393 Note: - We assume NULL as the invalid state, then it is possible that
1394 return value is NULL and ERR is REG_NOERROR.
1395 - We never return non-NULL value in case of any errors, it is for
1396 optimization. */
1398 static re_dfastate_t*
1399 re_acquire_state (err, dfa, nodes)
1400 reg_errcode_t *err;
1401 re_dfa_t *dfa;
1402 const re_node_set *nodes;
1404 unsigned int hash;
1405 re_dfastate_t *new_state;
1406 struct re_state_table_entry *spot;
1407 int i;
1408 if (BE (nodes->nelem == 0, 0))
1410 *err = REG_NOERROR;
1411 return NULL;
1413 hash = calc_state_hash (nodes, 0);
1414 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1416 for (i = 0 ; i < spot->num ; i++)
1418 re_dfastate_t *state = spot->array[i];
1419 if (hash != state->hash)
1420 continue;
1421 if (re_node_set_compare (&state->nodes, nodes))
1422 return state;
1425 /* There are no appropriate state in the dfa, create the new one. */
1426 new_state = create_ci_newstate (dfa, nodes, hash);
1427 if (BE (new_state != NULL, 1))
1428 return new_state;
1429 else
1431 *err = REG_ESPACE;
1432 return NULL;
1436 /* Search for the state whose node_set is equivalent to NODES and
1437 whose context is equivalent to CONTEXT.
1438 Return the pointer to the state, if we found it in the DFA.
1439 Otherwise create the new one and return it. In case of an error
1440 return NULL and set the error code in ERR.
1441 Note: - We assume NULL as the invalid state, then it is possible that
1442 return value is NULL and ERR is REG_NOERROR.
1443 - We never return non-NULL value in case of any errors, it is for
1444 optimization. */
1446 static re_dfastate_t*
1447 re_acquire_state_context (err, dfa, nodes, context)
1448 reg_errcode_t *err;
1449 re_dfa_t *dfa;
1450 const re_node_set *nodes;
1451 unsigned int context;
1453 unsigned int hash;
1454 re_dfastate_t *new_state;
1455 struct re_state_table_entry *spot;
1456 int i;
1457 if (nodes->nelem == 0)
1459 *err = REG_NOERROR;
1460 return NULL;
1462 hash = calc_state_hash (nodes, context);
1463 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1465 for (i = 0 ; i < spot->num ; i++)
1467 re_dfastate_t *state = spot->array[i];
1468 if (state->hash == hash
1469 && state->context == context
1470 && re_node_set_compare (state->entrance_nodes, nodes))
1471 return state;
1473 /* There are no appropriate state in `dfa', create the new one. */
1474 new_state = create_cd_newstate (dfa, nodes, context, hash);
1475 if (BE (new_state != NULL, 1))
1476 return new_state;
1477 else
1479 *err = REG_ESPACE;
1480 return NULL;
1484 /* Finish initialization of the new state NEWSTATE, and using its hash value
1485 HASH put in the appropriate bucket of DFA's state table. Return value
1486 indicates the error code if failed. */
1488 static reg_errcode_t
1489 register_state (dfa, newstate, hash)
1490 re_dfa_t *dfa;
1491 re_dfastate_t *newstate;
1492 unsigned int hash;
1494 struct re_state_table_entry *spot;
1495 reg_errcode_t err;
1496 int i;
1498 newstate->hash = hash;
1499 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1500 if (BE (err != REG_NOERROR, 0))
1501 return REG_ESPACE;
1502 for (i = 0; i < newstate->nodes.nelem; i++)
1504 int elem = newstate->nodes.elems[i];
1505 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1506 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1509 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1510 if (BE (spot->alloc <= spot->num, 0))
1512 int new_alloc = 2 * spot->num + 2;
1513 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1514 new_alloc);
1515 if (BE (new_array == NULL, 0))
1516 return REG_ESPACE;
1517 spot->array = new_array;
1518 spot->alloc = new_alloc;
1520 spot->array[spot->num++] = newstate;
1521 return REG_NOERROR;
1524 /* Create the new state which is independ of contexts.
1525 Return the new state if succeeded, otherwise return NULL. */
1527 static re_dfastate_t *
1528 create_ci_newstate (dfa, nodes, hash)
1529 re_dfa_t *dfa;
1530 const re_node_set *nodes;
1531 unsigned int hash;
1533 int i;
1534 reg_errcode_t err;
1535 re_dfastate_t *newstate;
1537 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1538 if (BE (newstate == NULL, 0))
1539 return NULL;
1540 err = re_node_set_init_copy (&newstate->nodes, nodes);
1541 if (BE (err != REG_NOERROR, 0))
1543 re_free (newstate);
1544 return NULL;
1547 newstate->entrance_nodes = &newstate->nodes;
1548 for (i = 0 ; i < nodes->nelem ; i++)
1550 re_token_t *node = dfa->nodes + nodes->elems[i];
1551 re_token_type_t type = node->type;
1552 if (type == CHARACTER && !node->constraint)
1553 continue;
1555 /* If the state has the halt node, the state is a halt state. */
1556 else if (type == END_OF_RE)
1557 newstate->halt = 1;
1558 #ifdef RE_ENABLE_I18N
1559 else if (type == COMPLEX_BRACKET
1560 || type == OP_UTF8_PERIOD
1561 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1562 newstate->accept_mb = 1;
1563 #endif /* RE_ENABLE_I18N */
1564 else if (type == OP_BACK_REF)
1565 newstate->has_backref = 1;
1566 else if (type == ANCHOR || node->constraint)
1567 newstate->has_constraint = 1;
1569 err = register_state (dfa, newstate, hash);
1570 if (BE (err != REG_NOERROR, 0))
1572 free_state (newstate);
1573 newstate = NULL;
1575 return newstate;
1578 /* Create the new state which is depend on the context CONTEXT.
1579 Return the new state if succeeded, otherwise return NULL. */
1581 static re_dfastate_t *
1582 create_cd_newstate (dfa, nodes, context, hash)
1583 re_dfa_t *dfa;
1584 const re_node_set *nodes;
1585 unsigned int context, hash;
1587 int i, nctx_nodes = 0;
1588 reg_errcode_t err;
1589 re_dfastate_t *newstate;
1591 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1592 if (BE (newstate == NULL, 0))
1593 return NULL;
1594 err = re_node_set_init_copy (&newstate->nodes, nodes);
1595 if (BE (err != REG_NOERROR, 0))
1597 re_free (newstate);
1598 return NULL;
1601 newstate->context = context;
1602 newstate->entrance_nodes = &newstate->nodes;
1604 for (i = 0 ; i < nodes->nelem ; i++)
1606 unsigned int constraint = 0;
1607 re_token_t *node = dfa->nodes + nodes->elems[i];
1608 re_token_type_t type = node->type;
1609 if (node->constraint)
1610 constraint = node->constraint;
1612 if (type == CHARACTER && !constraint)
1613 continue;
1614 /* If the state has the halt node, the state is a halt state. */
1615 else if (type == END_OF_RE)
1616 newstate->halt = 1;
1617 #ifdef RE_ENABLE_I18N
1618 else if (type == COMPLEX_BRACKET
1619 || type == OP_UTF8_PERIOD
1620 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1621 newstate->accept_mb = 1;
1622 #endif /* RE_ENABLE_I18N */
1623 else if (type == OP_BACK_REF)
1624 newstate->has_backref = 1;
1625 else if (type == ANCHOR)
1626 constraint = node->opr.ctx_type;
1628 if (constraint)
1630 if (newstate->entrance_nodes == &newstate->nodes)
1632 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633 if (BE (newstate->entrance_nodes == NULL, 0))
1635 free_state (newstate);
1636 return NULL;
1638 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639 nctx_nodes = 0;
1640 newstate->has_constraint = 1;
1643 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1645 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646 ++nctx_nodes;
1650 err = register_state (dfa, newstate, hash);
1651 if (BE (err != REG_NOERROR, 0))
1653 free_state (newstate);
1654 newstate = NULL;
1656 return newstate;
1659 static void
1660 free_state (state)
1661 re_dfastate_t *state;
1663 re_node_set_free (&state->non_eps_nodes);
1664 re_node_set_free (&state->inveclosure);
1665 if (state->entrance_nodes != &state->nodes)
1667 re_node_set_free (state->entrance_nodes);
1668 re_free (state->entrance_nodes);
1670 re_node_set_free (&state->nodes);
1671 re_free (state->trtable);
1672 re_free (state);