* sysdeps/generic/allocrtsig.c: Include <testrtsig.h>, not
[glibc.git] / posix / regex_internal.c
blob69fb9a65f5df79f990a4272a59ac21a9b66f884c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #if defined HAVE_WCHAR_H || defined _LIBC
29 # include <wchar.h>
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
32 # include <wctype.h>
33 #endif /* HAVE_WCTYPE_H || _LIBC */
35 #ifdef _LIBC
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
41 # endif
42 #endif
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
46 # include <libintl.h>
47 # ifdef _LIBC
48 # undef gettext
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
51 # endif
52 #else
53 # define gettext(msgid) (msgid)
54 #endif
56 #ifndef gettext_noop
57 /* This define is so xgettext can find the internationalizable
58 strings. */
59 # define gettext_noop(String) String
60 #endif
62 #include "regex.h"
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str, int len,
66 re_string_t *pstr,
67 RE_TRANSLATE_TYPE trans, int icase);
68 #ifdef RE_ENABLE_I18N
69 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx);
70 #endif /* RE_ENABLE_I18N */
71 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
72 const re_node_set *nodes,
73 unsigned int hash);
74 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
75 unsigned int hash);
76 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
77 const re_node_set *nodes,
78 unsigned int hash);
79 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
80 const re_node_set *nodes,
81 unsigned int context,
82 unsigned int hash);
83 static unsigned int inline calc_state_hash (const re_node_set *nodes,
84 unsigned int context);
86 /* Functions for string operation. */
88 /* This function allocate the buffers. It is necessary to call
89 re_string_reconstruct before using the object. */
91 static reg_errcode_t
92 re_string_allocate (pstr, str, len, init_len, trans, icase)
93 re_string_t *pstr;
94 const char *str;
95 int len, init_len, icase;
96 RE_TRANSLATE_TYPE trans;
98 reg_errcode_t ret;
99 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
100 re_string_construct_common (str, len, pstr, trans, icase);
101 pstr->stop = pstr->len;
103 ret = re_string_realloc_buffers (pstr, init_buf_len);
104 if (BE (ret != REG_NOERROR, 0))
105 return ret;
107 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
108 : (unsigned char *) str);
109 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
110 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
111 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
112 return REG_NOERROR;
115 /* This function allocate the buffers, and initialize them. */
117 static reg_errcode_t
118 re_string_construct (pstr, str, len, trans, icase)
119 re_string_t *pstr;
120 const char *str;
121 int len, icase;
122 RE_TRANSLATE_TYPE trans;
124 reg_errcode_t ret;
125 re_string_construct_common (str, len, pstr, trans, icase);
126 pstr->stop = pstr->len;
127 /* Set 0 so that this function can initialize whole buffers. */
128 pstr->valid_len = 0;
130 if (len > 0)
132 ret = re_string_realloc_buffers (pstr, len + 1);
133 if (BE (ret != REG_NOERROR, 0))
134 return ret;
136 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
137 : (unsigned char *) str);
138 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
140 if (icase)
142 #ifdef RE_ENABLE_I18N
143 if (MB_CUR_MAX > 1)
144 build_wcs_upper_buffer (pstr);
145 else
146 #endif /* RE_ENABLE_I18N */
147 build_upper_buffer (pstr);
149 else
151 #ifdef RE_ENABLE_I18N
152 if (MB_CUR_MAX > 1)
153 build_wcs_buffer (pstr);
154 else
155 #endif /* RE_ENABLE_I18N */
157 if (trans != NULL)
158 re_string_translate_buffer (pstr);
159 else
160 pstr->valid_len = len;
164 /* Initialized whole buffers, then valid_len == bufs_len. */
165 pstr->valid_len = pstr->bufs_len;
166 return REG_NOERROR;
169 /* Helper functions for re_string_allocate, and re_string_construct. */
171 static reg_errcode_t
172 re_string_realloc_buffers (pstr, new_buf_len)
173 re_string_t *pstr;
174 int new_buf_len;
176 #ifdef RE_ENABLE_I18N
177 if (MB_CUR_MAX > 1)
179 pstr->wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
180 if (BE (pstr->wcs == NULL, 0))
181 return REG_ESPACE;
183 #endif /* RE_ENABLE_I18N */
184 if (MBS_ALLOCATED (pstr))
186 pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len);
187 if (BE (pstr->mbs == NULL, 0))
188 return REG_ESPACE;
190 if (MBS_CASE_ALLOCATED (pstr))
192 pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len);
193 if (BE (pstr->mbs_case == NULL, 0))
194 return REG_ESPACE;
195 if (!MBS_ALLOCATED (pstr))
196 pstr->mbs = pstr->mbs_case;
198 pstr->bufs_len = new_buf_len;
199 return REG_NOERROR;
203 static void
204 re_string_construct_common (str, len, pstr, trans, icase)
205 const char *str;
206 int len;
207 re_string_t *pstr;
208 RE_TRANSLATE_TYPE trans;
209 int icase;
211 memset (pstr, '\0', sizeof (re_string_t));
212 pstr->raw_mbs = (const unsigned char *) str;
213 pstr->len = len;
214 pstr->trans = trans;
215 pstr->icase = icase ? 1 : 0;
218 #ifdef RE_ENABLE_I18N
220 /* Build wide character buffer PSTR->WCS.
221 If the byte sequence of the string are:
222 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
223 Then wide character buffer will be:
224 <wc1> , WEOF , <wc2> , WEOF , <wc3>
225 We use WEOF for padding, they indicate that the position isn't
226 a first byte of a multibyte character.
228 Note that this function assumes PSTR->VALID_LEN elements are already
229 built and starts from PSTR->VALID_LEN. */
231 static void
232 build_wcs_buffer (pstr)
233 re_string_t *pstr;
235 mbstate_t prev_st;
236 int byte_idx, end_idx, mbclen, remain_len;
237 /* Build the buffers from pstr->valid_len to either pstr->len or
238 pstr->bufs_len. */
239 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
240 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
242 wchar_t wc;
243 remain_len = end_idx - byte_idx;
244 prev_st = pstr->cur_state;
245 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
246 + byte_idx), remain_len, &pstr->cur_state);
247 if (BE (mbclen == (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr->cur_state = prev_st;
251 break;
253 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
255 /* We treat these cases as a singlebyte character. */
256 mbclen = 1;
257 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
258 pstr->cur_state = prev_st;
261 /* Apply the translateion if we need. */
262 if (pstr->trans != NULL && mbclen == 1)
264 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
265 pstr->mbs_case[byte_idx] = ch;
267 /* Write wide character and padding. */
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 pstr->wcs[byte_idx++] = WEOF;
273 pstr->valid_len = byte_idx;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
279 static void
280 build_wcs_upper_buffer (pstr)
281 re_string_t *pstr;
283 mbstate_t prev_st;
284 int byte_idx, end_idx, mbclen, remain_len;
285 /* Build the buffers from pstr->valid_len to either pstr->len or
286 pstr->bufs_len. */
287 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
288 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
290 wchar_t wc;
291 remain_len = end_idx - byte_idx;
292 prev_st = pstr->cur_state;
293 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
294 + byte_idx), remain_len, &pstr->cur_state);
295 if (BE (mbclen == (size_t) -2, 0))
297 /* The buffer doesn't have enough space, finish to build. */
298 pstr->cur_state = prev_st;
299 break;
301 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
303 /* In case of a singlebyte character. */
304 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
305 /* Apply the translateion if we need. */
306 if (pstr->trans != NULL && mbclen == 1)
308 ch = pstr->trans[ch];
309 pstr->mbs_case[byte_idx] = ch;
311 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
312 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
313 if (BE (mbclen == (size_t) -1, 0))
314 pstr->cur_state = prev_st;
316 else /* mbclen > 1 */
318 if (iswlower (wc))
319 wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
320 else
321 memcpy (pstr->mbs + byte_idx,
322 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
323 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
324 /* Write paddings. */
325 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
326 pstr->wcs[byte_idx++] = WEOF;
329 pstr->valid_len = byte_idx;
332 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
333 Return the index. */
335 static int
336 re_string_skip_chars (pstr, new_raw_idx)
337 re_string_t *pstr;
338 int new_raw_idx;
340 mbstate_t prev_st;
341 int rawbuf_idx, mbclen;
343 /* Skip the characters which are not necessary to check. */
344 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
345 rawbuf_idx < new_raw_idx;)
347 int remain_len = pstr->len - rawbuf_idx;
348 prev_st = pstr->cur_state;
349 mbclen = mbrlen ((const char *) pstr->raw_mbs + rawbuf_idx, remain_len,
350 &pstr->cur_state);
351 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
353 /* We treat these cases as a singlebyte character. */
354 mbclen = 1;
355 pstr->cur_state = prev_st;
357 /* Then proceed the next character. */
358 rawbuf_idx += mbclen;
360 return rawbuf_idx;
362 #endif /* RE_ENABLE_I18N */
364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
365 This function is used in case of REG_ICASE. */
367 static void
368 build_upper_buffer (pstr)
369 re_string_t *pstr;
371 int char_idx, end_idx;
372 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
374 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
376 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
377 if (pstr->trans != NULL)
379 ch = pstr->trans[ch];
380 pstr->mbs_case[char_idx] = ch;
382 if (islower (ch))
383 pstr->mbs[char_idx] = toupper (ch);
384 else
385 pstr->mbs[char_idx] = ch;
387 pstr->valid_len = char_idx;
390 /* Apply TRANS to the buffer in PSTR. */
392 static void
393 re_string_translate_buffer (pstr)
394 re_string_t *pstr;
396 int buf_idx, end_idx;
397 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
399 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
401 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
402 pstr->mbs_case[buf_idx] = pstr->trans[ch];
405 pstr->valid_len = buf_idx;
408 /* This function re-construct the buffers.
409 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
410 convert to upper case in case of REG_ICASE, apply translation. */
412 static reg_errcode_t
413 re_string_reconstruct (pstr, idx, eflags, newline)
414 re_string_t *pstr;
415 int idx, eflags, newline;
417 int offset = idx - pstr->raw_mbs_idx;
418 if (offset < 0)
420 /* Reset buffer. */
421 #ifdef RE_ENABLE_I18N
422 if (MB_CUR_MAX > 1)
423 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
424 #endif /* RE_ENABLE_I18N */
425 pstr->len += pstr->raw_mbs_idx;
426 pstr->stop += pstr->raw_mbs_idx;
427 pstr->valid_len = pstr->raw_mbs_idx = 0;
428 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
429 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
430 if (!MBS_CASE_ALLOCATED (pstr))
431 pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
432 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
433 pstr->mbs = (unsigned char *) pstr->raw_mbs;
434 offset = idx;
437 if (offset != 0)
439 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
440 newline);
441 /* Are the characters which are already checked remain? */
442 if (offset < pstr->valid_len)
444 /* Yes, move them to the front of the buffer. */
445 #ifdef RE_ENABLE_I18N
446 if (MB_CUR_MAX > 1)
447 memmove (pstr->wcs, pstr->wcs + offset,
448 (pstr->valid_len - offset) * sizeof (wint_t));
449 #endif /* RE_ENABLE_I18N */
450 if (MBS_ALLOCATED (pstr))
451 memmove (pstr->mbs, pstr->mbs + offset,
452 pstr->valid_len - offset);
453 if (MBS_CASE_ALLOCATED (pstr))
454 memmove (pstr->mbs_case, pstr->mbs_case + offset,
455 pstr->valid_len - offset);
456 pstr->valid_len -= offset;
457 #if DEBUG
458 assert (pstr->valid_len > 0);
459 #endif
461 else
463 /* No, skip all characters until IDX. */
464 pstr->valid_len = 0;
465 #ifdef RE_ENABLE_I18N
466 if (MB_CUR_MAX > 1)
468 int wcs_idx;
469 pstr->valid_len = re_string_skip_chars (pstr, idx) - idx;
470 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
471 pstr->wcs[wcs_idx] = WEOF;
473 #endif /* RE_ENABLE_I18N */
475 if (!MBS_CASE_ALLOCATED (pstr))
477 pstr->mbs_case += offset;
478 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
479 if (!MBS_ALLOCATED (pstr))
480 pstr->mbs += offset;
483 pstr->raw_mbs_idx = idx;
484 pstr->len -= offset;
485 pstr->stop -= offset;
487 /* Then build the buffers. */
488 #ifdef RE_ENABLE_I18N
489 if (MB_CUR_MAX > 1)
491 if (pstr->icase)
492 build_wcs_upper_buffer (pstr);
493 else
494 build_wcs_buffer (pstr);
496 else
497 #endif /* RE_ENABLE_I18N */
499 if (pstr->icase)
500 build_upper_buffer (pstr);
501 else if (pstr->trans != NULL)
502 re_string_translate_buffer (pstr);
504 pstr->cur_idx = 0;
506 return REG_NOERROR;
509 static void
510 re_string_destruct (pstr)
511 re_string_t *pstr;
513 #ifdef RE_ENABLE_I18N
514 re_free (pstr->wcs);
515 #endif /* RE_ENABLE_I18N */
516 if (MBS_ALLOCATED (pstr))
517 re_free (pstr->mbs);
518 if (MBS_CASE_ALLOCATED (pstr))
519 re_free (pstr->mbs_case);
522 /* Return the context at IDX in INPUT. */
524 static unsigned int
525 re_string_context_at (input, idx, eflags, newline_anchor)
526 const re_string_t *input;
527 int idx, eflags, newline_anchor;
529 int c;
530 if (idx < 0 || idx == input->len)
532 if (idx < 0)
533 /* In this case, we use the value stored in input->tip_context,
534 since we can't know the character in input->mbs[-1] here. */
535 return input->tip_context;
536 else /* (idx == input->len) */
537 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
538 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
540 c = re_string_byte_at (input, idx);
541 if (IS_WORD_CHAR (c))
542 return CONTEXT_WORD;
543 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
546 /* Functions for set operation. */
548 static reg_errcode_t
549 re_node_set_alloc (set, size)
550 re_node_set *set;
551 int size;
553 set->alloc = size;
554 set->nelem = 0;
555 set->elems = re_malloc (int, size);
556 if (BE (set->elems == NULL, 0))
557 return REG_ESPACE;
558 return REG_NOERROR;
561 static reg_errcode_t
562 re_node_set_init_1 (set, elem)
563 re_node_set *set;
564 int elem;
566 set->alloc = 1;
567 set->nelem = 1;
568 set->elems = re_malloc (int, 1);
569 if (BE (set->elems == NULL, 0))
570 return REG_ESPACE;
571 set->elems[0] = elem;
572 return REG_NOERROR;
575 static reg_errcode_t
576 re_node_set_init_2 (set, elem1, elem2)
577 re_node_set *set;
578 int elem1, elem2;
580 set->alloc = 2;
581 set->elems = re_malloc (int, 2);
582 if (BE (set->elems == NULL, 0))
583 return REG_ESPACE;
584 if (elem1 == elem2)
586 set->nelem = 1;
587 set->elems[0] = elem1;
589 else
591 set->nelem = 2;
592 if (elem1 < elem2)
594 set->elems[0] = elem1;
595 set->elems[1] = elem2;
597 else
599 set->elems[0] = elem2;
600 set->elems[1] = elem1;
603 return REG_NOERROR;
606 static reg_errcode_t
607 re_node_set_init_copy (dest, src)
608 re_node_set *dest;
609 const re_node_set *src;
611 dest->nelem = src->nelem;
612 if (src->nelem > 0)
614 dest->alloc = dest->nelem;
615 dest->elems = re_malloc (int, dest->alloc);
616 if (BE (dest->elems == NULL, 0))
617 return REG_ESPACE;
618 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
620 else
621 re_node_set_init_empty (dest);
622 return REG_NOERROR;
625 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
626 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
627 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
629 static reg_errcode_t
630 re_node_set_add_intersect (dest, src1, src2)
631 re_node_set *dest;
632 const re_node_set *src1, *src2;
634 int i1, i2, id;
635 if (src1->nelem > 0 && src2->nelem > 0)
637 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
639 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
640 dest->elems = re_realloc (dest->elems, int, dest->alloc);
641 if (BE (dest->elems == NULL, 0))
642 return REG_ESPACE;
645 else
646 return REG_NOERROR;
648 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
650 if (src1->elems[i1] > src2->elems[i2])
652 ++i2;
653 continue;
655 if (src1->elems[i1] == src2->elems[i2])
657 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
658 ++id;
659 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
660 ++id;
661 else
663 memmove (dest->elems + id + 1, dest->elems + id,
664 sizeof (int) * (dest->nelem - id));
665 dest->elems[id++] = src2->elems[i2++];
666 ++dest->nelem;
669 ++i1;
671 return REG_NOERROR;
674 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
675 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
677 static reg_errcode_t
678 re_node_set_init_union (dest, src1, src2)
679 re_node_set *dest;
680 const re_node_set *src1, *src2;
682 int i1, i2, id;
683 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
685 dest->alloc = src1->nelem + src2->nelem;
686 dest->elems = re_malloc (int, dest->alloc);
687 if (BE (dest->elems == NULL, 0))
688 return REG_ESPACE;
690 else
692 if (src1 != NULL && src1->nelem > 0)
693 return re_node_set_init_copy (dest, src1);
694 else if (src2 != NULL && src2->nelem > 0)
695 return re_node_set_init_copy (dest, src2);
696 else
697 re_node_set_init_empty (dest);
698 return REG_NOERROR;
700 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
702 if (src1->elems[i1] > src2->elems[i2])
704 dest->elems[id++] = src2->elems[i2++];
705 continue;
707 if (src1->elems[i1] == src2->elems[i2])
708 ++i2;
709 dest->elems[id++] = src1->elems[i1++];
711 if (i1 < src1->nelem)
713 memcpy (dest->elems + id, src1->elems + i1,
714 (src1->nelem - i1) * sizeof (int));
715 id += src1->nelem - i1;
717 else if (i2 < src2->nelem)
719 memcpy (dest->elems + id, src2->elems + i2,
720 (src2->nelem - i2) * sizeof (int));
721 id += src2->nelem - i2;
723 dest->nelem = id;
724 return REG_NOERROR;
727 /* Calculate the union set of the sets DEST and SRC. And store it to
728 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
730 static reg_errcode_t
731 re_node_set_merge (dest, src)
732 re_node_set *dest;
733 const re_node_set *src;
735 int si, di;
736 if (src == NULL || src->nelem == 0)
737 return REG_NOERROR;
738 if (dest->alloc < src->nelem + dest->nelem)
740 dest->alloc = 2 * (src->nelem + dest->alloc);
741 dest->elems = re_realloc (dest->elems, int, dest->alloc);
742 if (BE (dest->elems == NULL, 0))
743 return REG_ESPACE;
746 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
748 int cp_from, ncp, mid, right, src_elem = src->elems[si];
749 /* Binary search the spot we will add the new element. */
750 right = dest->nelem;
751 while (di < right)
753 mid = (di + right) / 2;
754 if (dest->elems[mid] < src_elem)
755 di = mid + 1;
756 else
757 right = mid;
759 if (di >= dest->nelem)
760 break;
762 if (dest->elems[di] == src_elem)
764 /* Skip since, DEST already has the element. */
765 ++di;
766 ++si;
767 continue;
770 /* Skip the src elements which are less than dest->elems[di]. */
771 cp_from = si;
772 while (si < src->nelem && src->elems[si] < dest->elems[di])
773 ++si;
774 /* Copy these src elements. */
775 ncp = si - cp_from;
776 memmove (dest->elems + di + ncp, dest->elems + di,
777 sizeof (int) * (dest->nelem - di));
778 memcpy (dest->elems + di, src->elems + cp_from,
779 sizeof (int) * ncp);
780 /* Update counters. */
781 di += ncp;
782 dest->nelem += ncp;
785 /* Copy remaining src elements. */
786 if (si < src->nelem)
788 memcpy (dest->elems + di, src->elems + si,
789 sizeof (int) * (src->nelem - si));
790 dest->nelem += src->nelem - si;
792 return REG_NOERROR;
795 /* Insert the new element ELEM to the re_node_set* SET.
796 return 0 if SET already has ELEM,
797 return -1 if an error is occured, return 1 otherwise. */
799 static int
800 re_node_set_insert (set, elem)
801 re_node_set *set;
802 int elem;
804 int idx, right, mid;
805 /* In case of the set is empty. */
806 if (set->elems == NULL || set->alloc == 0)
808 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
809 return 1;
810 else
811 return -1;
814 /* Binary search the spot we will add the new element. */
815 idx = 0;
816 right = set->nelem;
817 while (idx < right)
819 mid = (idx + right) / 2;
820 if (set->elems[mid] < elem)
821 idx = mid + 1;
822 else
823 right = mid;
826 /* Realloc if we need. */
827 if (set->alloc < set->nelem + 1)
829 int *new_array;
830 set->alloc = set->alloc * 2;
831 new_array = re_malloc (int, set->alloc);
832 if (BE (new_array == NULL, 0))
833 return -1;
834 /* Copy the elements they are followed by the new element. */
835 if (idx > 0)
836 memcpy (new_array, set->elems, sizeof (int) * (idx));
837 /* Copy the elements which follows the new element. */
838 if (set->nelem - idx > 0)
839 memcpy (new_array + idx + 1, set->elems + idx,
840 sizeof (int) * (set->nelem - idx));
841 re_free (set->elems);
842 set->elems = new_array;
844 else
846 /* Move the elements which follows the new element. */
847 if (set->nelem - idx > 0)
848 memmove (set->elems + idx + 1, set->elems + idx,
849 sizeof (int) * (set->nelem - idx));
851 /* Insert the new element. */
852 set->elems[idx] = elem;
853 ++set->nelem;
854 return 1;
857 /* Compare two node sets SET1 and SET2.
858 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
860 static int
861 re_node_set_compare (set1, set2)
862 const re_node_set *set1, *set2;
864 int i;
865 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
866 return 0;
867 for (i = 0 ; i < set1->nelem ; i++)
868 if (set1->elems[i] != set2->elems[i])
869 return 0;
870 return 1;
873 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
875 static int
876 re_node_set_contains (set, elem)
877 const re_node_set *set;
878 int elem;
880 int idx, right, mid;
881 if (set->nelem <= 0)
882 return 0;
884 /* Binary search the element. */
885 idx = 0;
886 right = set->nelem - 1;
887 while (idx < right)
889 mid = (idx + right) / 2;
890 if (set->elems[mid] < elem)
891 idx = mid + 1;
892 else
893 right = mid;
895 return set->elems[idx] == elem ? idx + 1 : 0;
898 static void
899 re_node_set_remove_at (set, idx)
900 re_node_set *set;
901 int idx;
903 if (idx < 0 || idx >= set->nelem)
904 return;
905 if (idx < set->nelem - 1)
906 memmove (set->elems + idx, set->elems + idx + 1,
907 sizeof (int) * (set->nelem - idx - 1));
908 --set->nelem;
912 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
913 Or return -1, if an error will be occured. */
915 static int
916 re_dfa_add_node (dfa, token, mode)
917 re_dfa_t *dfa;
918 re_token_t token;
919 int mode;
921 if (dfa->nodes_len >= dfa->nodes_alloc)
923 re_token_t *new_array;
924 dfa->nodes_alloc *= 2;
925 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
926 if (BE (new_array == NULL, 0))
927 return -1;
928 else
929 dfa->nodes = new_array;
930 if (mode)
932 int *new_nexts;
933 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
935 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
936 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
937 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
938 dfa->nodes_alloc);
939 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
940 dfa->nodes_alloc);
941 if (BE (new_nexts == NULL || new_edests == NULL
942 || new_eclosures == NULL || new_inveclosures == NULL, 0))
943 return -1;
944 dfa->nexts = new_nexts;
945 dfa->edests = new_edests;
946 dfa->eclosures = new_eclosures;
947 dfa->inveclosures = new_inveclosures;
950 dfa->nodes[dfa->nodes_len] = token;
951 dfa->nodes[dfa->nodes_len].duplicated = 0;
952 dfa->nodes[dfa->nodes_len].constraint = 0;
953 return dfa->nodes_len++;
956 static unsigned int inline
957 calc_state_hash (nodes, context)
958 const re_node_set *nodes;
959 unsigned int context;
961 unsigned int hash = nodes->nelem + context;
962 int i;
963 for (i = 0 ; i < nodes->nelem ; i++)
964 hash += nodes->elems[i];
965 return hash;
968 /* Search for the state whose node_set is equivalent to NODES.
969 Return the pointer to the state, if we found it in the DFA.
970 Otherwise create the new one and return it. In case of an error
971 return NULL and set the error code in ERR.
972 Note: - We assume NULL as the invalid state, then it is possible that
973 return value is NULL and ERR is REG_NOERROR.
974 - We never return non-NULL value in case of any errors, it is for
975 optimization. */
977 static re_dfastate_t*
978 re_acquire_state (err, dfa, nodes)
979 reg_errcode_t *err;
980 re_dfa_t *dfa;
981 const re_node_set *nodes;
983 unsigned int hash;
984 re_dfastate_t *new_state;
985 struct re_state_table_entry *spot;
986 int i;
987 if (BE (nodes->nelem == 0, 0))
989 *err = REG_NOERROR;
990 return NULL;
992 hash = calc_state_hash (nodes, 0);
993 spot = dfa->state_table + (hash & dfa->state_hash_mask);
995 for (i = 0 ; i < spot->num ; i++)
997 re_dfastate_t *state = spot->array[i];
998 if (hash != state->hash)
999 continue;
1000 if (re_node_set_compare (&state->nodes, nodes))
1001 return state;
1004 /* There are no appropriate state in the dfa, create the new one. */
1005 new_state = create_ci_newstate (dfa, nodes, hash);
1006 if (BE (new_state != NULL, 1))
1007 return new_state;
1008 else
1010 *err = REG_ESPACE;
1011 return NULL;
1015 /* Search for the state whose node_set is equivalent to NODES and
1016 whose context is equivalent to CONTEXT.
1017 Return the pointer to the state, if we found it in the DFA.
1018 Otherwise create the new one and return it. In case of an error
1019 return NULL and set the error code in ERR.
1020 Note: - We assume NULL as the invalid state, then it is possible that
1021 return value is NULL and ERR is REG_NOERROR.
1022 - We never return non-NULL value in case of any errors, it is for
1023 optimization. */
1025 static re_dfastate_t*
1026 re_acquire_state_context (err, dfa, nodes, context)
1027 reg_errcode_t *err;
1028 re_dfa_t *dfa;
1029 const re_node_set *nodes;
1030 unsigned int context;
1032 unsigned int hash;
1033 re_dfastate_t *new_state;
1034 struct re_state_table_entry *spot;
1035 int i;
1036 if (nodes->nelem == 0)
1038 *err = REG_NOERROR;
1039 return NULL;
1041 hash = calc_state_hash (nodes, context);
1042 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1044 for (i = 0 ; i < spot->num ; i++)
1046 re_dfastate_t *state = spot->array[i];
1047 if (hash != state->hash)
1048 continue;
1049 if (re_node_set_compare (state->entrance_nodes, nodes)
1050 && state->context == context)
1051 return state;
1053 /* There are no appropriate state in `dfa', create the new one. */
1054 new_state = create_cd_newstate (dfa, nodes, context, hash);
1055 if (BE (new_state != NULL, 1))
1056 return new_state;
1057 else
1059 *err = REG_ESPACE;
1060 return NULL;
1064 /* Allocate memory for DFA state and initialize common properties.
1065 Return the new state if succeeded, otherwise return NULL. */
1067 static re_dfastate_t *
1068 create_newstate_common (dfa, nodes, hash)
1069 re_dfa_t *dfa;
1070 const re_node_set *nodes;
1071 unsigned int hash;
1073 re_dfastate_t *newstate;
1074 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1075 if (BE (newstate == NULL, 0))
1076 return NULL;
1077 re_node_set_init_copy (&newstate->nodes, nodes);
1078 newstate->trtable = NULL;
1079 newstate->trtable_search = NULL;
1080 newstate->hash = hash;
1081 return newstate;
1084 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1085 position. Return value indicate the error code if failed. */
1087 static reg_errcode_t
1088 register_state (dfa, newstate, hash)
1089 re_dfa_t *dfa;
1090 re_dfastate_t *newstate;
1091 unsigned int hash;
1093 struct re_state_table_entry *spot;
1094 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1096 if (spot->alloc <= spot->num)
1098 spot->alloc = 2 * spot->num + 2;
1099 spot->array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1100 if (BE (spot->array == NULL, 0))
1101 return REG_ESPACE;
1103 spot->array[spot->num++] = newstate;
1104 return REG_NOERROR;
1107 /* Create the new state which is independ of contexts.
1108 Return the new state if succeeded, otherwise return NULL. */
1110 static re_dfastate_t *
1111 create_ci_newstate (dfa, nodes, hash)
1112 re_dfa_t *dfa;
1113 const re_node_set *nodes;
1114 unsigned int hash;
1116 int i;
1117 reg_errcode_t err;
1118 re_dfastate_t *newstate;
1119 newstate = create_newstate_common (dfa, nodes, hash);
1120 if (BE (newstate == NULL, 0))
1121 return NULL;
1122 newstate->entrance_nodes = &newstate->nodes;
1124 for (i = 0 ; i < nodes->nelem ; i++)
1126 re_token_t *node = dfa->nodes + nodes->elems[i];
1127 re_token_type_t type = node->type;
1128 if (type == CHARACTER && !node->constraint)
1129 continue;
1131 /* If the state has the halt node, the state is a halt state. */
1132 else if (type == END_OF_RE)
1133 newstate->halt = 1;
1134 #ifdef RE_ENABLE_I18N
1135 else if (type == COMPLEX_BRACKET
1136 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1137 newstate->accept_mb = 1;
1138 #endif /* RE_ENABLE_I18N */
1139 else if (type == OP_BACK_REF)
1140 newstate->has_backref = 1;
1141 else if (type == ANCHOR || node->constraint)
1142 newstate->has_constraint = 1;
1144 err = register_state (dfa, newstate, hash);
1145 return (err != REG_NOERROR) ? NULL : newstate;
1148 /* Create the new state which is depend on the context CONTEXT.
1149 Return the new state if succeeded, otherwise return NULL. */
1151 static re_dfastate_t *
1152 create_cd_newstate (dfa, nodes, context, hash)
1153 re_dfa_t *dfa;
1154 const re_node_set *nodes;
1155 unsigned int context, hash;
1157 int i, nctx_nodes = 0;
1158 reg_errcode_t err;
1159 re_dfastate_t *newstate;
1161 newstate = create_newstate_common (dfa, nodes, hash);
1162 if (BE (newstate == NULL, 0))
1163 return NULL;
1164 newstate->context = context;
1165 newstate->entrance_nodes = &newstate->nodes;
1167 for (i = 0 ; i < nodes->nelem ; i++)
1169 unsigned int constraint = 0;
1170 re_token_t *node = dfa->nodes + nodes->elems[i];
1171 re_token_type_t type = node->type;
1172 if (node->constraint)
1173 constraint = node->constraint;
1175 if (type == CHARACTER && !constraint)
1176 continue;
1177 /* If the state has the halt node, the state is a halt state. */
1178 else if (type == END_OF_RE)
1179 newstate->halt = 1;
1180 #ifdef RE_ENABLE_I18N
1181 else if (type == COMPLEX_BRACKET
1182 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1183 newstate->accept_mb = 1;
1184 #endif /* RE_ENABLE_I18N */
1185 else if (type == OP_BACK_REF)
1186 newstate->has_backref = 1;
1187 else if (type == ANCHOR)
1188 constraint = node->opr.ctx_type;
1190 if (constraint)
1192 if (newstate->entrance_nodes == &newstate->nodes)
1194 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1195 if (BE (newstate->entrance_nodes == NULL, 0))
1196 return NULL;
1197 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1198 nctx_nodes = 0;
1199 newstate->has_constraint = 1;
1202 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1204 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1205 ++nctx_nodes;
1209 err = register_state (dfa, newstate, hash);
1210 return (err != REG_NOERROR) ? NULL : newstate;