(VERSION): Update from 2.3 release.
[glibc.git] / posix / regex_internal.c
blob25e7b7e0790f3d24271fda8d333259a89913ed51
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <wchar.h>
28 #include <wctype.h>
30 #ifdef _LIBC
31 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
32 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
33 # include <locale/localeinfo.h>
34 # include <locale/elem-hash.h>
35 # include <locale/coll-lookup.h>
36 # endif
37 #endif
39 /* This is for other GNU distributions with internationalized messages. */
40 #if HAVE_LIBINTL_H || defined _LIBC
41 # include <libintl.h>
42 # ifdef _LIBC
43 # undef gettext
44 # define gettext(msgid) \
45 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
46 # endif
47 #else
48 # define gettext(msgid) (msgid)
49 #endif
51 #ifndef gettext_noop
52 /* This define is so xgettext can find the internationalizable
53 strings. */
54 # define gettext_noop(String) String
55 #endif
57 #include "regex.h"
58 #include "regex_internal.h"
60 static void re_string_construct_common (const unsigned char *str,
61 int len, re_string_t *pstr,
62 RE_TRANSLATE_TYPE trans, int icase);
63 #ifdef RE_ENABLE_I18N
64 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx);
65 #endif /* RE_ENABLE_I18N */
66 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
67 const re_node_set *nodes,
68 unsigned int hash);
69 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
70 unsigned int hash);
71 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
72 const re_node_set *nodes,
73 unsigned int hash);
74 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
75 const re_node_set *nodes,
76 unsigned int context,
77 unsigned int hash);
78 static unsigned int inline calc_state_hash (const re_node_set *nodes,
79 unsigned int context);
81 /* Functions for string operation. */
83 /* This function allocate the buffers. It is necessary to call
84 re_string_reconstruct before using the object. */
86 static reg_errcode_t
87 re_string_allocate (pstr, str, len, init_len, trans, icase)
88 re_string_t *pstr;
89 const unsigned char *str;
90 int len, init_len, icase;
91 RE_TRANSLATE_TYPE trans;
93 reg_errcode_t ret;
94 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
95 re_string_construct_common (str, len, pstr, trans, icase);
96 pstr->stop = pstr->len;
98 ret = re_string_realloc_buffers (pstr, init_buf_len);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
102 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
103 : (unsigned char *)str);
104 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
105 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
106 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
107 return REG_NOERROR;
110 /* This function allocate the buffers, and initialize them. */
112 static reg_errcode_t
113 re_string_construct (pstr, str, len, trans, icase)
114 re_string_t *pstr;
115 const unsigned char *str;
116 int len, icase;
117 RE_TRANSLATE_TYPE trans;
119 reg_errcode_t ret;
120 re_string_construct_common (str, len, pstr, trans, icase);
121 pstr->stop = pstr->len;
122 /* Set 0 so that this function can initialize whole buffers. */
123 pstr->valid_len = 0;
125 if (len > 0)
127 ret = re_string_realloc_buffers (pstr, len + 1);
128 if (BE (ret != REG_NOERROR, 0))
129 return ret;
131 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
132 : (unsigned char *)str);
133 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
135 if (icase)
137 #ifdef RE_ENABLE_I18N
138 if (MB_CUR_MAX > 1)
139 build_wcs_upper_buffer (pstr);
140 else
141 #endif /* RE_ENABLE_I18N */
142 build_upper_buffer (pstr);
144 else
146 #ifdef RE_ENABLE_I18N
147 if (MB_CUR_MAX > 1)
148 build_wcs_buffer (pstr);
149 else
150 #endif /* RE_ENABLE_I18N */
152 if (trans != NULL)
153 re_string_translate_buffer (pstr);
154 else
155 pstr->valid_len = len;
159 /* Initialized whole buffers, then valid_len == bufs_len. */
160 pstr->valid_len = pstr->bufs_len;
161 return REG_NOERROR;
164 /* Helper functions for re_string_allocate, and re_string_construct. */
166 static reg_errcode_t
167 re_string_realloc_buffers (pstr, new_buf_len)
168 re_string_t *pstr;
169 int new_buf_len;
171 #ifdef RE_ENABLE_I18N
172 if (MB_CUR_MAX > 1)
174 pstr->wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
175 if (BE (pstr->wcs == NULL, 0))
176 return REG_ESPACE;
178 #endif /* RE_ENABLE_I18N */
179 if (MBS_ALLOCATED (pstr))
181 pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len);
182 if (BE (pstr->mbs == NULL, 0))
183 return REG_ESPACE;
185 if (MBS_CASE_ALLOCATED (pstr))
187 pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len);
188 if (BE (pstr->mbs_case == NULL, 0))
189 return REG_ESPACE;
190 if (!MBS_ALLOCATED (pstr))
191 pstr->mbs = pstr->mbs_case;
193 pstr->bufs_len = new_buf_len;
194 return REG_NOERROR;
198 static void
199 re_string_construct_common (str, len, pstr, trans, icase)
200 const unsigned char *str;
201 int len;
202 re_string_t *pstr;
203 RE_TRANSLATE_TYPE trans;
204 int icase;
206 memset (pstr, '\0', sizeof (re_string_t));
207 pstr->raw_mbs = str;
208 pstr->len = len;
209 pstr->trans = trans;
210 pstr->icase = icase ? 1 : 0;
213 #ifdef RE_ENABLE_I18N
215 /* Build wide character buffer PSTR->WCS.
216 If the byte sequence of the string are:
217 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
218 Then wide character buffer will be:
219 <wc1> , WEOF , <wc2> , WEOF , <wc3>
220 We use WEOF for padding, they indicate that the position isn't
221 a first byte of a multibyte character.
223 Note that this function assumes PSTR->VALID_LEN elements are already
224 built and starts from PSTR->VALID_LEN. */
226 static void
227 build_wcs_buffer (pstr)
228 re_string_t *pstr;
230 mbstate_t prev_st;
231 int byte_idx, end_idx, mbclen, remain_len;
232 /* Build the buffers from pstr->valid_len to either pstr->len or
233 pstr->bufs_len. */
234 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
235 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
237 wchar_t wc;
238 remain_len = end_idx - byte_idx;
239 prev_st = pstr->cur_state;
240 mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx,
241 remain_len, &pstr->cur_state);
242 if (BE (mbclen == (size_t) -2, 0))
244 /* The buffer doesn't have enough space, finish to build. */
245 pstr->cur_state = prev_st;
246 break;
248 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
250 /* We treat these cases as a singlebyte character. */
251 mbclen = 1;
252 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253 pstr->cur_state = prev_st;
256 /* Apply the translateion if we need. */
257 if (pstr->trans != NULL && mbclen == 1)
259 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
260 pstr->mbs_case[byte_idx] = ch;
262 /* Write wide character and padding. */
263 pstr->wcs[byte_idx++] = wc;
264 /* Write paddings. */
265 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
266 pstr->wcs[byte_idx++] = WEOF;
268 pstr->valid_len = byte_idx;
271 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
272 but for REG_ICASE. */
274 static void
275 build_wcs_upper_buffer (pstr)
276 re_string_t *pstr;
278 mbstate_t prev_st;
279 int byte_idx, end_idx, mbclen, remain_len;
280 /* Build the buffers from pstr->valid_len to either pstr->len or
281 pstr->bufs_len. */
282 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
283 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
285 wchar_t wc;
286 remain_len = end_idx - byte_idx;
287 prev_st = pstr->cur_state;
288 mbclen = mbrtowc (&wc, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx,
289 remain_len, &pstr->cur_state);
290 if (BE (mbclen == (size_t) -2, 0))
292 /* The buffer doesn't have enough space, finish to build. */
293 pstr->cur_state = prev_st;
294 break;
296 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
298 /* In case of a singlebyte character. */
299 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
300 /* Apply the translateion if we need. */
301 if (pstr->trans != NULL && mbclen == 1)
303 ch = pstr->trans[ch];
304 pstr->mbs_case[byte_idx] = ch;
306 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
307 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
308 if (BE (mbclen == (size_t) -1, 0))
309 pstr->cur_state = prev_st;
311 else /* mbclen > 1 */
313 if (iswlower (wc))
314 wcrtomb (pstr->mbs + byte_idx, towupper (wc), &prev_st);
315 else
316 memcpy (pstr->mbs + byte_idx,
317 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
318 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
319 /* Write paddings. */
320 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
321 pstr->wcs[byte_idx++] = WEOF;
324 pstr->valid_len = byte_idx;
327 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
328 Return the index. */
330 static int
331 re_string_skip_chars (pstr, new_raw_idx)
332 re_string_t *pstr;
333 int new_raw_idx;
335 mbstate_t prev_st;
336 int rawbuf_idx, mbclen;
338 /* Skip the characters which are not necessary to check. */
339 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
340 rawbuf_idx < new_raw_idx;)
342 int remain_len = pstr->len - rawbuf_idx;
343 prev_st = pstr->cur_state;
344 mbclen = mbrlen (pstr->raw_mbs + rawbuf_idx, remain_len,
345 &pstr->cur_state);
346 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
348 /* We treat these cases as a singlebyte character. */
349 mbclen = 1;
350 pstr->cur_state = prev_st;
352 /* Then proceed the next character. */
353 rawbuf_idx += mbclen;
355 return rawbuf_idx;
357 #endif /* RE_ENABLE_I18N */
359 /* Build the buffer PSTR->MBS, and apply the translation if we need.
360 This function is used in case of REG_ICASE. */
362 static void
363 build_upper_buffer (pstr)
364 re_string_t *pstr;
366 int char_idx, end_idx;
367 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
369 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
371 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
372 if (pstr->trans != NULL)
374 ch = pstr->trans[ch];
375 pstr->mbs_case[char_idx] = ch;
377 if (islower (ch))
378 pstr->mbs[char_idx] = toupper (ch);
379 else
380 pstr->mbs[char_idx] = ch;
382 pstr->valid_len = char_idx;
385 /* Apply TRANS to the buffer in PSTR. */
387 static void
388 re_string_translate_buffer (pstr)
389 re_string_t *pstr;
391 int buf_idx, end_idx;
392 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
394 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
396 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
397 pstr->mbs_case[buf_idx] = pstr->trans[ch];
400 pstr->valid_len = buf_idx;
403 /* This function re-construct the buffers.
404 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
405 convert to upper case in case of REG_ICASE, apply translation. */
407 static reg_errcode_t
408 re_string_reconstruct (pstr, idx, eflags, newline)
409 re_string_t *pstr;
410 int idx, eflags, newline;
412 int offset = idx - pstr->raw_mbs_idx;
413 if (offset < 0)
415 /* Reset buffer. */
416 #ifdef RE_ENABLE_I18N
417 if (MB_CUR_MAX > 1)
418 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
419 #endif /* RE_ENABLE_I18N */
420 pstr->valid_len = pstr->raw_mbs_idx = 0;
421 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
422 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
423 if (!MBS_CASE_ALLOCATED (pstr))
424 pstr->mbs_case = (unsigned char *)pstr->raw_mbs;
425 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
426 pstr->mbs = (unsigned char *)pstr->raw_mbs;
427 offset = idx;
430 if (offset != 0)
432 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
433 newline);
434 /* Are the characters which are already checked remain? */
435 if (offset < pstr->valid_len)
437 /* Yes, move them to the front of the buffer. */
438 #ifdef RE_ENABLE_I18N
439 if (MB_CUR_MAX > 1)
440 memmove (pstr->wcs, pstr->wcs + offset,
441 (pstr->valid_len - offset) * sizeof (wint_t));
442 #endif /* RE_ENABLE_I18N */
443 if (MBS_ALLOCATED (pstr))
444 memmove (pstr->mbs, pstr->mbs + offset,
445 pstr->valid_len - offset);
446 if (MBS_CASE_ALLOCATED (pstr))
447 memmove (pstr->mbs_case, pstr->mbs_case + offset,
448 pstr->valid_len - offset);
449 pstr->valid_len -= offset;
450 #if DEBUG
451 assert (pstr->valid_len > 0);
452 #endif
454 else
456 /* No, skip all characters until IDX. */
457 pstr->valid_len = 0;
458 #ifdef RE_ENABLE_I18N
459 if (MB_CUR_MAX > 1)
461 int wcs_idx;
462 pstr->valid_len = re_string_skip_chars (pstr, idx) - idx;
463 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
464 pstr->wcs[wcs_idx] = WEOF;
466 #endif /* RE_ENABLE_I18N */
468 if (!MBS_CASE_ALLOCATED (pstr))
470 pstr->mbs_case += offset;
471 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
472 if (!MBS_ALLOCATED (pstr))
473 pstr->mbs += offset;
476 pstr->raw_mbs_idx = idx;
477 pstr->len -= offset;
478 pstr->stop -= offset;
480 /* Then build the buffers. */
481 #ifdef RE_ENABLE_I18N
482 if (MB_CUR_MAX > 1)
484 if (pstr->icase)
485 build_wcs_upper_buffer (pstr);
486 else
487 build_wcs_buffer (pstr);
489 else
490 #endif /* RE_ENABLE_I18N */
492 if (pstr->icase)
493 build_upper_buffer (pstr);
494 else if (pstr->trans != NULL)
495 re_string_translate_buffer (pstr);
497 pstr->cur_idx = 0;
499 return REG_NOERROR;
502 static void
503 re_string_destruct (pstr)
504 re_string_t *pstr;
506 #ifdef RE_ENABLE_I18N
507 re_free (pstr->wcs);
508 #endif /* RE_ENABLE_I18N */
509 if (MBS_ALLOCATED (pstr))
510 re_free (pstr->mbs);
511 if (MBS_CASE_ALLOCATED (pstr))
512 re_free (pstr->mbs_case);
515 /* Return the context at IDX in INPUT. */
517 static unsigned int
518 re_string_context_at (input, idx, eflags, newline_anchor)
519 const re_string_t *input;
520 int idx, eflags, newline_anchor;
522 int c;
523 if (idx < 0 || idx == input->len)
525 if (idx < 0)
526 /* In this case, we use the value stored in input->tip_context,
527 since we can't know the character in input->mbs[-1] here. */
528 return input->tip_context;
529 else /* (idx == input->len) */
530 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
531 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
533 c = re_string_byte_at (input, idx);
534 if (IS_WORD_CHAR (c))
535 return CONTEXT_WORD;
536 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
539 /* Functions for set operation. */
541 static reg_errcode_t
542 re_node_set_alloc (set, size)
543 re_node_set *set;
544 int size;
546 set->alloc = size;
547 set->nelem = 0;
548 set->elems = re_malloc (int, size);
549 if (BE (set->elems == NULL, 0))
550 return REG_ESPACE;
551 return REG_NOERROR;
554 static reg_errcode_t
555 re_node_set_init_1 (set, elem)
556 re_node_set *set;
557 int elem;
559 set->alloc = 1;
560 set->nelem = 1;
561 set->elems = re_malloc (int, 1);
562 if (BE (set->elems == NULL, 0))
563 return REG_ESPACE;
564 set->elems[0] = elem;
565 return REG_NOERROR;
568 static reg_errcode_t
569 re_node_set_init_2 (set, elem1, elem2)
570 re_node_set *set;
571 int elem1, elem2;
573 set->alloc = 2;
574 set->elems = re_malloc (int, 2);
575 if (BE (set->elems == NULL, 0))
576 return REG_ESPACE;
577 if (elem1 == elem2)
579 set->nelem = 1;
580 set->elems[0] = elem1;
582 else
584 set->nelem = 2;
585 if (elem1 < elem2)
587 set->elems[0] = elem1;
588 set->elems[1] = elem2;
590 else
592 set->elems[0] = elem2;
593 set->elems[1] = elem1;
596 return REG_NOERROR;
599 static reg_errcode_t
600 re_node_set_init_copy (dest, src)
601 re_node_set *dest;
602 const re_node_set *src;
604 dest->nelem = src->nelem;
605 if (src->nelem > 0)
607 dest->alloc = dest->nelem;
608 dest->elems = re_malloc (int, dest->alloc);
609 if (BE (dest->elems == NULL, 0))
610 return REG_ESPACE;
611 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
613 else
614 re_node_set_init_empty (dest);
615 return REG_NOERROR;
618 /* Calculate the intersection of the sets SRC1 and SRC2. And store it in
619 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
620 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
622 static reg_errcode_t
623 re_node_set_intersect (dest, src1, src2)
624 re_node_set *dest;
625 const re_node_set *src1, *src2;
627 int i1, i2, id;
628 if (src1->nelem > 0 && src2->nelem > 0)
630 if (src1->nelem + src2->nelem > dest->alloc)
632 dest->alloc = src1->nelem + src2->nelem;
633 dest->elems = re_realloc (dest->elems, int, dest->alloc);
634 if (BE (dest->elems == NULL, 0))
635 return REG_ESPACE;
638 else
640 /* The intersection of empty sets is also empty set. */
641 dest->nelem = 0;
642 return REG_NOERROR;
645 for (i1 = i2 = id = 0; i1 < src1->nelem && i2 < src2->nelem; )
647 if (src1->elems[i1] > src2->elems[i2])
649 ++i2;
650 continue;
652 /* The intersection must have the elements which are in both of
653 SRC1 and SRC2. */
654 if (src1->elems[i1] == src2->elems[i2])
655 dest->elems[id++] = src2->elems[i2++];
656 ++i1;
658 dest->nelem = id;
659 return REG_NOERROR;
662 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
663 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
664 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
666 static reg_errcode_t
667 re_node_set_add_intersect (dest, src1, src2)
668 re_node_set *dest;
669 const re_node_set *src1, *src2;
671 int i1, i2, id;
672 if (src1->nelem > 0 && src2->nelem > 0)
674 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
676 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
677 dest->elems = re_realloc (dest->elems, int, dest->alloc);
678 if (BE (dest->elems == NULL, 0))
679 return REG_ESPACE;
682 else
683 return REG_NOERROR;
685 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
687 if (src1->elems[i1] > src2->elems[i2])
689 ++i2;
690 continue;
692 if (src1->elems[i1] == src2->elems[i2])
694 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
695 ++id;
696 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
697 ++id;
698 else
700 memmove (dest->elems + id + 1, dest->elems + id,
701 sizeof (int) * (dest->nelem - id));
702 dest->elems[id++] = src2->elems[i2++];
703 ++dest->nelem;
706 ++i1;
708 return REG_NOERROR;
711 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
712 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
714 static reg_errcode_t
715 re_node_set_init_union (dest, src1, src2)
716 re_node_set *dest;
717 const re_node_set *src1, *src2;
719 int i1, i2, id;
720 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
722 dest->alloc = src1->nelem + src2->nelem;
723 dest->elems = re_malloc (int, dest->alloc);
724 if (BE (dest->elems == NULL, 0))
725 return REG_ESPACE;
727 else
729 if (src1 != NULL && src1->nelem > 0)
730 return re_node_set_init_copy (dest, src1);
731 else if (src2 != NULL && src2->nelem > 0)
732 return re_node_set_init_copy (dest, src2);
733 else
734 re_node_set_init_empty (dest);
735 return REG_NOERROR;
737 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
739 if (src1->elems[i1] > src2->elems[i2])
741 dest->elems[id++] = src2->elems[i2++];
742 continue;
744 if (src1->elems[i1] == src2->elems[i2])
745 ++i2;
746 dest->elems[id++] = src1->elems[i1++];
748 if (i1 < src1->nelem)
750 memcpy (dest->elems + id, src1->elems + i1,
751 (src1->nelem - i1) * sizeof (int));
752 id += src1->nelem - i1;
754 else if (i2 < src2->nelem)
756 memcpy (dest->elems + id, src2->elems + i2,
757 (src2->nelem - i2) * sizeof (int));
758 id += src2->nelem - i2;
760 dest->nelem = id;
761 return REG_NOERROR;
764 /* Calculate the union set of the sets DEST and SRC. And store it to
765 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
767 static reg_errcode_t
768 re_node_set_merge (dest, src)
769 re_node_set *dest;
770 const re_node_set *src;
772 int si, di;
773 if (src == NULL || src->nelem == 0)
774 return REG_NOERROR;
775 if (dest->alloc < src->nelem + dest->nelem)
777 dest->alloc = 2 * (src->nelem + dest->alloc);
778 dest->elems = re_realloc (dest->elems, int, dest->alloc);
779 if (BE (dest->elems == NULL, 0))
780 return REG_ESPACE;
783 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
785 int cp_from, ncp, mid, right, src_elem = src->elems[si];
786 /* Binary search the spot we will add the new element. */
787 right = dest->nelem;
788 while (di < right)
790 mid = (di + right) / 2;
791 if (dest->elems[mid] < src_elem)
792 di = mid + 1;
793 else
794 right = mid;
796 if (di >= dest->nelem)
797 break;
799 if (dest->elems[di] == src_elem)
801 /* Skip since, DEST already has the element. */
802 ++di;
803 ++si;
804 continue;
807 /* Skip the src elements which are less than dest->elems[di]. */
808 cp_from = si;
809 while (si < src->nelem && src->elems[si] < dest->elems[di])
810 ++si;
811 /* Copy these src elements. */
812 ncp = si - cp_from;
813 memmove (dest->elems + di + ncp, dest->elems + di,
814 sizeof (int) * (dest->nelem - di));
815 memcpy (dest->elems + di, src->elems + cp_from,
816 sizeof (int) * ncp);
817 /* Update counters. */
818 di += ncp;
819 dest->nelem += ncp;
822 /* Copy remaining src elements. */
823 if (si < src->nelem)
825 memcpy (dest->elems + di, src->elems + si,
826 sizeof (int) * (src->nelem - si));
827 dest->nelem += src->nelem - si;
829 return REG_NOERROR;
832 /* Insert the new element ELEM to the re_node_set* SET.
833 return 0 if SET already has ELEM,
834 return -1 if an error is occured, return 1 otherwise. */
836 static int
837 re_node_set_insert (set, elem)
838 re_node_set *set;
839 int elem;
841 int idx, right, mid;
842 /* In case of the set is empty. */
843 if (set->elems == NULL || set->alloc == 0)
845 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
846 return 1;
847 else
848 return -1;
851 /* Binary search the spot we will add the new element. */
852 idx = 0;
853 right = set->nelem;
854 while (idx < right)
856 mid = (idx + right) / 2;
857 if (set->elems[mid] < elem)
858 idx = mid + 1;
859 else
860 right = mid;
863 /* Realloc if we need. */
864 if (set->alloc < set->nelem + 1)
866 int *new_array;
867 set->alloc = set->alloc * 2;
868 new_array = re_malloc (int, set->alloc);
869 if (BE (new_array == NULL, 0))
870 return -1;
871 /* Copy the elements they are followed by the new element. */
872 if (idx > 0)
873 memcpy (new_array, set->elems, sizeof (int) * (idx));
874 /* Copy the elements which follows the new element. */
875 if (set->nelem - idx > 0)
876 memcpy (new_array + idx + 1, set->elems + idx,
877 sizeof (int) * (set->nelem - idx));
878 re_free (set->elems);
879 set->elems = new_array;
881 else
883 /* Move the elements which follows the new element. */
884 if (set->nelem - idx > 0)
885 memmove (set->elems + idx + 1, set->elems + idx,
886 sizeof (int) * (set->nelem - idx));
888 /* Insert the new element. */
889 set->elems[idx] = elem;
890 ++set->nelem;
891 return 1;
894 /* Compare two node sets SET1 and SET2.
895 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
897 static int
898 re_node_set_compare (set1, set2)
899 const re_node_set *set1, *set2;
901 int i;
902 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
903 return 0;
904 for (i = 0 ; i < set1->nelem ; i++)
905 if (set1->elems[i] != set2->elems[i])
906 return 0;
907 return 1;
910 /* Return 1 if SET contains the element ELEM, return 0 otherwise. */
912 static int
913 re_node_set_contains (set, elem)
914 const re_node_set *set;
915 int elem;
917 int idx, right, mid;
918 if (set->nelem <= 0)
919 return 0;
921 /* Binary search the element. */
922 idx = 0;
923 right = set->nelem - 1;
924 while (idx < right)
926 mid = (idx + right) / 2;
927 if (set->elems[mid] < elem)
928 idx = mid + 1;
929 else
930 right = mid;
932 return set->elems[idx] == elem;
935 static void
936 re_node_set_remove_at (set, idx)
937 re_node_set *set;
938 int idx;
940 if (idx < 0 || idx >= set->nelem)
941 return;
942 if (idx < set->nelem - 1)
943 memmove (set->elems + idx, set->elems + idx + 1,
944 sizeof (int) * (set->nelem - idx - 1));
945 --set->nelem;
949 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
950 Or return -1, if an error will be occured. */
952 static int
953 re_dfa_add_node (dfa, token, mode)
954 re_dfa_t *dfa;
955 re_token_t token;
956 int mode;
958 if (dfa->nodes_len >= dfa->nodes_alloc)
960 re_token_t *new_array;
961 dfa->nodes_alloc *= 2;
962 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
963 if (BE (new_array == NULL, 0))
964 return -1;
965 else
966 dfa->nodes = new_array;
967 if (mode)
969 int *new_firsts, *new_nexts;
970 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
972 new_firsts = re_realloc (dfa->firsts, int, dfa->nodes_alloc);
973 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
974 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
975 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
976 dfa->nodes_alloc);
977 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
978 dfa->nodes_alloc);
979 if (BE (new_firsts == NULL || new_nexts == NULL || new_edests == NULL
980 || new_eclosures == NULL || new_inveclosures == NULL, 0))
981 return -1;
982 dfa->firsts = new_firsts;
983 dfa->nexts = new_nexts;
984 dfa->edests = new_edests;
985 dfa->eclosures = new_eclosures;
986 dfa->inveclosures = new_inveclosures;
989 dfa->nodes[dfa->nodes_len] = token;
990 dfa->nodes[dfa->nodes_len].duplicated = 0;
991 return dfa->nodes_len++;
994 static unsigned int inline
995 calc_state_hash (nodes, context)
996 const re_node_set *nodes;
997 unsigned int context;
999 unsigned int hash = nodes->nelem + context;
1000 int i;
1001 for (i = 0 ; i < nodes->nelem ; i++)
1002 hash += nodes->elems[i];
1003 return hash;
1006 /* Search for the state whose node_set is equivalent to NODES.
1007 Return the pointer to the state, if we found it in the DFA.
1008 Otherwise create the new one and return it. In case of an error
1009 return NULL and set the error code in ERR.
1010 Note: - We assume NULL as the invalid state, then it is possible that
1011 return value is NULL and ERR is REG_NOERROR.
1012 - We never return non-NULL value in case of any errors, it is for
1013 optimization. */
1015 static re_dfastate_t*
1016 re_acquire_state (err, dfa, nodes)
1017 reg_errcode_t *err;
1018 re_dfa_t *dfa;
1019 const re_node_set *nodes;
1021 unsigned int hash;
1022 re_dfastate_t *new_state;
1023 struct re_state_table_entry *spot;
1024 int i;
1025 if (BE (nodes->nelem == 0, 0))
1027 *err = REG_NOERROR;
1028 return NULL;
1030 hash = calc_state_hash (nodes, 0);
1031 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1033 for (i = 0 ; i < spot->num ; i++)
1035 re_dfastate_t *state = spot->array[i];
1036 if (hash != state->hash)
1037 continue;
1038 if (re_node_set_compare (&state->nodes, nodes))
1039 return state;
1042 /* There are no appropriate state in the dfa, create the new one. */
1043 new_state = create_ci_newstate (dfa, nodes, hash);
1044 if (BE (new_state != NULL, 1))
1045 return new_state;
1046 else
1048 *err = REG_ESPACE;
1049 return NULL;
1053 /* Search for the state whose node_set is equivalent to NODES and
1054 whose context is equivalent to CONTEXT.
1055 Return the pointer to the state, if we found it in the DFA.
1056 Otherwise create the new one and return it. In case of an error
1057 return NULL and set the error code in ERR.
1058 Note: - We assume NULL as the invalid state, then it is possible that
1059 return value is NULL and ERR is REG_NOERROR.
1060 - We never return non-NULL value in case of any errors, it is for
1061 optimization. */
1063 static re_dfastate_t*
1064 re_acquire_state_context (err, dfa, nodes, context)
1065 reg_errcode_t *err;
1066 re_dfa_t *dfa;
1067 const re_node_set *nodes;
1068 unsigned int context;
1070 unsigned int hash;
1071 re_dfastate_t *new_state;
1072 struct re_state_table_entry *spot;
1073 int i;
1074 if (nodes->nelem == 0)
1076 *err = REG_NOERROR;
1077 return NULL;
1079 hash = calc_state_hash (nodes, context);
1080 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1082 for (i = 0 ; i < spot->num ; i++)
1084 re_dfastate_t *state = spot->array[i];
1085 if (hash != state->hash)
1086 continue;
1087 if (re_node_set_compare (state->entrance_nodes, nodes)
1088 && state->context == context)
1089 return state;
1091 /* There are no appropriate state in `dfa', create the new one. */
1092 new_state = create_cd_newstate (dfa, nodes, context, hash);
1093 if (BE (new_state != NULL, 1))
1094 return new_state;
1095 else
1097 *err = REG_ESPACE;
1098 return NULL;
1102 /* Allocate memory for DFA state and initialize common properties.
1103 Return the new state if succeeded, otherwise return NULL. */
1105 static re_dfastate_t *
1106 create_newstate_common (dfa, nodes, hash)
1107 re_dfa_t *dfa;
1108 const re_node_set *nodes;
1109 unsigned int hash;
1111 re_dfastate_t *newstate;
1112 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1113 if (BE (newstate == NULL, 0))
1114 return NULL;
1115 re_node_set_init_copy (&newstate->nodes, nodes);
1116 newstate->trtable = NULL;
1117 newstate->trtable_search = NULL;
1118 newstate->hash = hash;
1119 return newstate;
1122 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1123 position. Return value indicate the error code if failed. */
1125 static reg_errcode_t
1126 register_state (dfa, newstate, hash)
1127 re_dfa_t *dfa;
1128 re_dfastate_t *newstate;
1129 unsigned int hash;
1131 struct re_state_table_entry *spot;
1132 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1134 if (spot->alloc <= spot->num)
1136 spot->alloc = 2 * spot->num + 2;
1137 spot->array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1138 if (BE (spot->array == NULL, 0))
1139 return REG_ESPACE;
1141 spot->array[spot->num++] = newstate;
1142 return REG_NOERROR;
1145 /* Create the new state which is independ of contexts.
1146 Return the new state if succeeded, otherwise return NULL. */
1148 static re_dfastate_t *
1149 create_ci_newstate (dfa, nodes, hash)
1150 re_dfa_t *dfa;
1151 const re_node_set *nodes;
1152 unsigned int hash;
1154 int i;
1155 reg_errcode_t err;
1156 re_dfastate_t *newstate;
1157 newstate = create_newstate_common (dfa, nodes, hash);
1158 if (BE (newstate == NULL, 0))
1159 return NULL;
1160 newstate->entrance_nodes = &newstate->nodes;
1162 for (i = 0 ; i < nodes->nelem ; i++)
1164 re_token_t *node = dfa->nodes + nodes->elems[i];
1165 re_token_type_t type = node->type;
1166 if (type == CHARACTER)
1167 continue;
1169 /* If the state has the halt node, the state is a halt state. */
1170 else if (type == END_OF_RE)
1171 newstate->halt = 1;
1172 #ifdef RE_ENABLE_I18N
1173 else if (type == COMPLEX_BRACKET
1174 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1175 newstate->accept_mb = 1;
1176 #endif /* RE_ENABLE_I18N */
1177 else if (type == OP_BACK_REF)
1178 newstate->has_backref = 1;
1179 else if (type == ANCHOR || OP_CONTEXT_NODE)
1181 newstate->has_constraint = 1;
1182 if (type == OP_CONTEXT_NODE
1183 && dfa->nodes[node->opr.ctx_info->entity].type == END_OF_RE)
1184 newstate->halt = 1;
1187 err = register_state (dfa, newstate, hash);
1188 return (err != REG_NOERROR) ? NULL : newstate;
1191 /* Create the new state which is depend on the context CONTEXT.
1192 Return the new state if succeeded, otherwise return NULL. */
1194 static re_dfastate_t *
1195 create_cd_newstate (dfa, nodes, context, hash)
1196 re_dfa_t *dfa;
1197 const re_node_set *nodes;
1198 unsigned int context, hash;
1200 int i, nctx_nodes = 0;
1201 reg_errcode_t err;
1202 re_dfastate_t *newstate;
1204 newstate = create_newstate_common (dfa, nodes, hash);
1205 if (BE (newstate == NULL, 0))
1206 return NULL;
1207 newstate->context = context;
1208 newstate->entrance_nodes = &newstate->nodes;
1210 for (i = 0 ; i < nodes->nelem ; i++)
1212 unsigned int constraint = 0;
1213 re_token_t *node = dfa->nodes + nodes->elems[i];
1214 re_token_type_t type = node->type;
1215 if (type == CHARACTER)
1216 continue;
1218 /* If the state has the halt node, the state is a halt state. */
1219 else if (type == END_OF_RE)
1220 newstate->halt = 1;
1221 #ifdef RE_ENABLE_I18N
1222 else if (type == COMPLEX_BRACKET
1223 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1224 newstate->accept_mb = 1;
1225 #endif /* RE_ENABLE_I18N */
1226 else if (type == OP_BACK_REF)
1227 newstate->has_backref = 1;
1228 else if (type == ANCHOR)
1229 constraint = node->opr.ctx_type;
1230 else if (type == OP_CONTEXT_NODE)
1232 re_token_type_t ctype = dfa->nodes[node->opr.ctx_info->entity].type;
1233 constraint = node->constraint;
1234 if (ctype == END_OF_RE)
1235 newstate->halt = 1;
1236 else if (ctype == OP_BACK_REF)
1237 newstate->has_backref = 1;
1238 #ifdef RE_ENABLE_I18N
1239 else if (ctype == COMPLEX_BRACKET
1240 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1241 newstate->accept_mb = 1;
1242 #endif /* RE_ENABLE_I18N */
1245 if (constraint)
1247 if (newstate->entrance_nodes == &newstate->nodes)
1249 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1250 if (BE (newstate->entrance_nodes == NULL, 0))
1251 return NULL;
1252 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1253 nctx_nodes = 0;
1254 newstate->has_constraint = 1;
1257 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1259 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1260 ++nctx_nodes;
1264 err = register_state (dfa, newstate, hash);
1265 return (err != REG_NOERROR) ? NULL : newstate;