* sysdeps/unix/sysv/linux/m68k/clone.S: Make inline syscall to
[glibc.git] / posix / regex_internal.c
bloba9559e27683ea13c01a949145cdce9f1ccda4e83
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #if defined HAVE_WCHAR_H || defined _LIBC
29 # include <wchar.h>
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
32 # include <wctype.h>
33 #endif /* HAVE_WCTYPE_H || _LIBC */
35 #ifdef _LIBC
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
41 # endif
42 #endif
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
46 # include <libintl.h>
47 # ifdef _LIBC
48 # undef gettext
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
51 # endif
52 #else
53 # define gettext(msgid) (msgid)
54 #endif
56 #ifndef gettext_noop
57 /* This define is so xgettext can find the internationalizable
58 strings. */
59 # define gettext_noop(String) String
60 #endif
62 #include "regex.h"
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str, int len,
66 re_string_t *pstr,
67 RE_TRANSLATE_TYPE trans, int icase);
68 #ifdef RE_ENABLE_I18N
69 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
70 wint_t *last_wc);
71 #endif /* RE_ENABLE_I18N */
72 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
73 const re_node_set *nodes,
74 unsigned int hash);
75 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
76 unsigned int hash);
77 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
78 const re_node_set *nodes,
79 unsigned int hash);
80 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
81 const re_node_set *nodes,
82 unsigned int context,
83 unsigned int hash);
84 static unsigned int inline calc_state_hash (const re_node_set *nodes,
85 unsigned int context);
87 /* Functions for string operation. */
89 /* This function allocate the buffers. It is necessary to call
90 re_string_reconstruct before using the object. */
92 static reg_errcode_t
93 re_string_allocate (pstr, str, len, init_len, trans, icase)
94 re_string_t *pstr;
95 const char *str;
96 int len, init_len, icase;
97 RE_TRANSLATE_TYPE trans;
99 reg_errcode_t ret;
100 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
101 re_string_construct_common (str, len, pstr, trans, icase);
102 pstr->stop = pstr->len;
104 ret = re_string_realloc_buffers (pstr, init_buf_len);
105 if (BE (ret != REG_NOERROR, 0))
106 return ret;
108 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
109 : (unsigned char *) str);
110 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
111 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
112 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
113 return REG_NOERROR;
116 /* This function allocate the buffers, and initialize them. */
118 static reg_errcode_t
119 re_string_construct (pstr, str, len, trans, icase)
120 re_string_t *pstr;
121 const char *str;
122 int len, icase;
123 RE_TRANSLATE_TYPE trans;
125 reg_errcode_t ret;
126 re_string_construct_common (str, len, pstr, trans, icase);
127 pstr->stop = pstr->len;
128 /* Set 0 so that this function can initialize whole buffers. */
129 pstr->valid_len = 0;
131 if (len > 0)
133 ret = re_string_realloc_buffers (pstr, len + 1);
134 if (BE (ret != REG_NOERROR, 0))
135 return ret;
137 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
138 : (unsigned char *) str);
139 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
141 if (icase)
143 #ifdef RE_ENABLE_I18N
144 if (MB_CUR_MAX > 1)
145 build_wcs_upper_buffer (pstr);
146 else
147 #endif /* RE_ENABLE_I18N */
148 build_upper_buffer (pstr);
150 else
152 #ifdef RE_ENABLE_I18N
153 if (MB_CUR_MAX > 1)
154 build_wcs_buffer (pstr);
155 else
156 #endif /* RE_ENABLE_I18N */
158 if (trans != NULL)
159 re_string_translate_buffer (pstr);
160 else
161 pstr->valid_len = len;
165 /* Initialized whole buffers, then valid_len == bufs_len. */
166 pstr->valid_len = pstr->bufs_len;
167 return REG_NOERROR;
170 /* Helper functions for re_string_allocate, and re_string_construct. */
172 static reg_errcode_t
173 re_string_realloc_buffers (pstr, new_buf_len)
174 re_string_t *pstr;
175 int new_buf_len;
177 #ifdef RE_ENABLE_I18N
178 if (MB_CUR_MAX > 1)
180 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
181 if (BE (new_array == NULL, 0))
182 return REG_ESPACE;
183 pstr->wcs = new_array;
185 #endif /* RE_ENABLE_I18N */
186 if (MBS_ALLOCATED (pstr))
188 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
189 new_buf_len);
190 if (BE (new_array == NULL, 0))
191 return REG_ESPACE;
192 pstr->mbs = new_array;
194 if (MBS_CASE_ALLOCATED (pstr))
196 unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char,
197 new_buf_len);
198 if (BE (new_array == NULL, 0))
199 return REG_ESPACE;
200 pstr->mbs_case = new_array;
201 if (!MBS_ALLOCATED (pstr))
202 pstr->mbs = pstr->mbs_case;
204 pstr->bufs_len = new_buf_len;
205 return REG_NOERROR;
209 static void
210 re_string_construct_common (str, len, pstr, trans, icase)
211 const char *str;
212 int len;
213 re_string_t *pstr;
214 RE_TRANSLATE_TYPE trans;
215 int icase;
217 memset (pstr, '\0', sizeof (re_string_t));
218 pstr->raw_mbs = (const unsigned char *) str;
219 pstr->len = len;
220 pstr->trans = trans;
221 pstr->icase = icase ? 1 : 0;
224 #ifdef RE_ENABLE_I18N
226 /* Build wide character buffer PSTR->WCS.
227 If the byte sequence of the string are:
228 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
229 Then wide character buffer will be:
230 <wc1> , WEOF , <wc2> , WEOF , <wc3>
231 We use WEOF for padding, they indicate that the position isn't
232 a first byte of a multibyte character.
234 Note that this function assumes PSTR->VALID_LEN elements are already
235 built and starts from PSTR->VALID_LEN. */
237 static void
238 build_wcs_buffer (pstr)
239 re_string_t *pstr;
241 mbstate_t prev_st;
242 int byte_idx, end_idx, mbclen, remain_len;
243 /* Build the buffers from pstr->valid_len to either pstr->len or
244 pstr->bufs_len. */
245 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
246 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
248 wchar_t wc;
249 remain_len = end_idx - byte_idx;
250 prev_st = pstr->cur_state;
251 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
252 + byte_idx), remain_len, &pstr->cur_state);
253 if (BE (mbclen == (size_t) -2, 0))
255 /* The buffer doesn't have enough space, finish to build. */
256 pstr->cur_state = prev_st;
257 break;
259 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
261 /* We treat these cases as a singlebyte character. */
262 mbclen = 1;
263 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
264 pstr->cur_state = prev_st;
267 /* Apply the translateion if we need. */
268 if (pstr->trans != NULL && mbclen == 1)
270 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
271 pstr->mbs_case[byte_idx] = ch;
273 /* Write wide character and padding. */
274 pstr->wcs[byte_idx++] = wc;
275 /* Write paddings. */
276 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
277 pstr->wcs[byte_idx++] = WEOF;
279 pstr->valid_len = byte_idx;
282 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
283 but for REG_ICASE. */
285 static void
286 build_wcs_upper_buffer (pstr)
287 re_string_t *pstr;
289 mbstate_t prev_st;
290 int byte_idx, end_idx, mbclen, remain_len;
291 /* Build the buffers from pstr->valid_len to either pstr->len or
292 pstr->bufs_len. */
293 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
294 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
296 wchar_t wc;
297 remain_len = end_idx - byte_idx;
298 prev_st = pstr->cur_state;
299 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
300 + byte_idx), remain_len, &pstr->cur_state);
301 if (BE (mbclen == (size_t) -2, 0))
303 /* The buffer doesn't have enough space, finish to build. */
304 pstr->cur_state = prev_st;
305 break;
307 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
309 /* In case of a singlebyte character. */
310 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
311 /* Apply the translateion if we need. */
312 if (pstr->trans != NULL && mbclen == 1)
314 ch = pstr->trans[ch];
315 pstr->mbs_case[byte_idx] = ch;
317 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
318 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
319 if (BE (mbclen == (size_t) -1, 0))
320 pstr->cur_state = prev_st;
322 else /* mbclen > 1 */
324 if (iswlower (wc))
325 wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
326 else
327 memcpy (pstr->mbs + byte_idx,
328 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
329 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
330 /* Write paddings. */
331 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
332 pstr->wcs[byte_idx++] = WEOF;
335 pstr->valid_len = byte_idx;
338 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
339 Return the index. */
341 static int
342 re_string_skip_chars (pstr, new_raw_idx, last_wc)
343 re_string_t *pstr;
344 int new_raw_idx;
345 wint_t *last_wc;
347 mbstate_t prev_st;
348 int rawbuf_idx, mbclen;
349 wchar_t wc = 0;
351 /* Skip the characters which are not necessary to check. */
352 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
353 rawbuf_idx < new_raw_idx;)
355 int remain_len;
356 remain_len = pstr->len - rawbuf_idx;
357 prev_st = pstr->cur_state;
358 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
359 remain_len, &pstr->cur_state);
360 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
362 /* We treat these cases as a singlebyte character. */
363 mbclen = 1;
364 pstr->cur_state = prev_st;
366 /* Then proceed the next character. */
367 rawbuf_idx += mbclen;
369 *last_wc = (wint_t) wc;
370 return rawbuf_idx;
372 #endif /* RE_ENABLE_I18N */
374 /* Build the buffer PSTR->MBS, and apply the translation if we need.
375 This function is used in case of REG_ICASE. */
377 static void
378 build_upper_buffer (pstr)
379 re_string_t *pstr;
381 int char_idx, end_idx;
382 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
384 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
386 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
387 if (pstr->trans != NULL)
389 ch = pstr->trans[ch];
390 pstr->mbs_case[char_idx] = ch;
392 if (islower (ch))
393 pstr->mbs[char_idx] = toupper (ch);
394 else
395 pstr->mbs[char_idx] = ch;
397 pstr->valid_len = char_idx;
400 /* Apply TRANS to the buffer in PSTR. */
402 static void
403 re_string_translate_buffer (pstr)
404 re_string_t *pstr;
406 int buf_idx, end_idx;
407 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
409 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
411 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
412 pstr->mbs_case[buf_idx] = pstr->trans[ch];
415 pstr->valid_len = buf_idx;
418 /* This function re-construct the buffers.
419 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
420 convert to upper case in case of REG_ICASE, apply translation. */
422 static reg_errcode_t
423 re_string_reconstruct (pstr, idx, eflags, newline)
424 re_string_t *pstr;
425 int idx, eflags, newline;
427 int offset = idx - pstr->raw_mbs_idx;
428 if (offset < 0)
430 /* Reset buffer. */
431 #ifdef RE_ENABLE_I18N
432 if (MB_CUR_MAX > 1)
433 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
434 #endif /* RE_ENABLE_I18N */
435 pstr->len += pstr->raw_mbs_idx;
436 pstr->stop += pstr->raw_mbs_idx;
437 pstr->valid_len = pstr->raw_mbs_idx = 0;
438 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
439 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
440 if (!MBS_CASE_ALLOCATED (pstr))
441 pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
442 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
443 pstr->mbs = (unsigned char *) pstr->raw_mbs;
444 offset = idx;
447 if (offset != 0)
449 /* Are the characters which are already checked remain? */
450 if (offset < pstr->valid_len)
452 /* Yes, move them to the front of the buffer. */
453 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
454 newline);
455 #ifdef RE_ENABLE_I18N
456 if (MB_CUR_MAX > 1)
457 memmove (pstr->wcs, pstr->wcs + offset,
458 (pstr->valid_len - offset) * sizeof (wint_t));
459 #endif /* RE_ENABLE_I18N */
460 if (MBS_ALLOCATED (pstr))
461 memmove (pstr->mbs, pstr->mbs + offset,
462 pstr->valid_len - offset);
463 if (MBS_CASE_ALLOCATED (pstr))
464 memmove (pstr->mbs_case, pstr->mbs_case + offset,
465 pstr->valid_len - offset);
466 pstr->valid_len -= offset;
467 #if DEBUG
468 assert (pstr->valid_len > 0);
469 #endif
471 else
473 /* No, skip all characters until IDX. */
474 pstr->valid_len = 0;
475 #ifdef RE_ENABLE_I18N
476 if (MB_CUR_MAX > 1)
478 int wcs_idx;
479 wint_t wc;
480 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
481 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
482 pstr->wcs[wcs_idx] = WEOF;
483 if (pstr->trans && wc <= 0xff)
484 wc = pstr->trans[wc];
485 pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
486 : ((newline && IS_WIDE_NEWLINE (wc))
487 ? CONTEXT_NEWLINE : 0));
489 else
490 #endif /* RE_ENABLE_I18N */
492 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
493 if (pstr->trans)
494 c = pstr->trans[c];
495 pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
496 : ((newline && IS_NEWLINE (c))
497 ? CONTEXT_NEWLINE : 0));
500 if (!MBS_CASE_ALLOCATED (pstr))
502 pstr->mbs_case += offset;
503 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
504 if (!MBS_ALLOCATED (pstr))
505 pstr->mbs += offset;
508 pstr->raw_mbs_idx = idx;
509 pstr->len -= offset;
510 pstr->stop -= offset;
512 /* Then build the buffers. */
513 #ifdef RE_ENABLE_I18N
514 if (MB_CUR_MAX > 1)
516 if (pstr->icase)
517 build_wcs_upper_buffer (pstr);
518 else
519 build_wcs_buffer (pstr);
521 else
522 #endif /* RE_ENABLE_I18N */
524 if (pstr->icase)
525 build_upper_buffer (pstr);
526 else if (pstr->trans != NULL)
527 re_string_translate_buffer (pstr);
529 pstr->cur_idx = 0;
531 return REG_NOERROR;
534 static void
535 re_string_destruct (pstr)
536 re_string_t *pstr;
538 #ifdef RE_ENABLE_I18N
539 re_free (pstr->wcs);
540 #endif /* RE_ENABLE_I18N */
541 if (MBS_ALLOCATED (pstr))
542 re_free (pstr->mbs);
543 if (MBS_CASE_ALLOCATED (pstr))
544 re_free (pstr->mbs_case);
547 /* Return the context at IDX in INPUT. */
549 static unsigned int
550 re_string_context_at (input, idx, eflags, newline_anchor)
551 const re_string_t *input;
552 int idx, eflags, newline_anchor;
554 int c;
555 if (idx < 0 || idx == input->len)
557 if (idx < 0)
558 /* In this case, we use the value stored in input->tip_context,
559 since we can't know the character in input->mbs[-1] here. */
560 return input->tip_context;
561 else /* (idx == input->len) */
562 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
563 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
565 #ifdef RE_ENABLE_I18N
566 if (MB_CUR_MAX > 1)
568 wint_t wc;
569 int wc_idx = idx;
570 while(input->wcs[wc_idx] == WEOF)
572 #ifdef DEBUG
573 /* It must not happen. */
574 assert (wc_idx >= 0);
575 #endif
576 --wc_idx;
577 if (wc_idx < 0)
578 return input->tip_context;
580 wc = input->wcs[wc_idx];
581 if (IS_WIDE_WORD_CHAR (wc))
582 return CONTEXT_WORD;
583 return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
585 else
586 #endif
588 c = re_string_byte_at (input, idx);
589 if (IS_WORD_CHAR (c))
590 return CONTEXT_WORD;
591 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
595 /* Functions for set operation. */
597 static reg_errcode_t
598 re_node_set_alloc (set, size)
599 re_node_set *set;
600 int size;
602 set->alloc = size;
603 set->nelem = 0;
604 set->elems = re_malloc (int, size);
605 if (BE (set->elems == NULL, 0))
606 return REG_ESPACE;
607 return REG_NOERROR;
610 static reg_errcode_t
611 re_node_set_init_1 (set, elem)
612 re_node_set *set;
613 int elem;
615 set->alloc = 1;
616 set->nelem = 1;
617 set->elems = re_malloc (int, 1);
618 if (BE (set->elems == NULL, 0))
620 set->alloc = set->nelem = 0;
621 return REG_ESPACE;
623 set->elems[0] = elem;
624 return REG_NOERROR;
627 static reg_errcode_t
628 re_node_set_init_2 (set, elem1, elem2)
629 re_node_set *set;
630 int elem1, elem2;
632 set->alloc = 2;
633 set->elems = re_malloc (int, 2);
634 if (BE (set->elems == NULL, 0))
635 return REG_ESPACE;
636 if (elem1 == elem2)
638 set->nelem = 1;
639 set->elems[0] = elem1;
641 else
643 set->nelem = 2;
644 if (elem1 < elem2)
646 set->elems[0] = elem1;
647 set->elems[1] = elem2;
649 else
651 set->elems[0] = elem2;
652 set->elems[1] = elem1;
655 return REG_NOERROR;
658 static reg_errcode_t
659 re_node_set_init_copy (dest, src)
660 re_node_set *dest;
661 const re_node_set *src;
663 dest->nelem = src->nelem;
664 if (src->nelem > 0)
666 dest->alloc = dest->nelem;
667 dest->elems = re_malloc (int, dest->alloc);
668 if (BE (dest->elems == NULL, 0))
670 dest->alloc = dest->nelem = 0;
671 return REG_ESPACE;
673 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
675 else
676 re_node_set_init_empty (dest);
677 return REG_NOERROR;
680 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
681 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
682 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
684 static reg_errcode_t
685 re_node_set_add_intersect (dest, src1, src2)
686 re_node_set *dest;
687 const re_node_set *src1, *src2;
689 int i1, i2, id;
690 if (src1->nelem > 0 && src2->nelem > 0)
692 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
694 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
695 dest->elems = re_realloc (dest->elems, int, dest->alloc);
696 if (BE (dest->elems == NULL, 0))
697 return REG_ESPACE;
700 else
701 return REG_NOERROR;
703 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
705 if (src1->elems[i1] > src2->elems[i2])
707 ++i2;
708 continue;
710 if (src1->elems[i1] == src2->elems[i2])
712 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
713 ++id;
714 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
715 ++id;
716 else
718 memmove (dest->elems + id + 1, dest->elems + id,
719 sizeof (int) * (dest->nelem - id));
720 dest->elems[id++] = src2->elems[i2++];
721 ++dest->nelem;
724 ++i1;
726 return REG_NOERROR;
729 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
730 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
732 static reg_errcode_t
733 re_node_set_init_union (dest, src1, src2)
734 re_node_set *dest;
735 const re_node_set *src1, *src2;
737 int i1, i2, id;
738 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
740 dest->alloc = src1->nelem + src2->nelem;
741 dest->elems = re_malloc (int, dest->alloc);
742 if (BE (dest->elems == NULL, 0))
743 return REG_ESPACE;
745 else
747 if (src1 != NULL && src1->nelem > 0)
748 return re_node_set_init_copy (dest, src1);
749 else if (src2 != NULL && src2->nelem > 0)
750 return re_node_set_init_copy (dest, src2);
751 else
752 re_node_set_init_empty (dest);
753 return REG_NOERROR;
755 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
757 if (src1->elems[i1] > src2->elems[i2])
759 dest->elems[id++] = src2->elems[i2++];
760 continue;
762 if (src1->elems[i1] == src2->elems[i2])
763 ++i2;
764 dest->elems[id++] = src1->elems[i1++];
766 if (i1 < src1->nelem)
768 memcpy (dest->elems + id, src1->elems + i1,
769 (src1->nelem - i1) * sizeof (int));
770 id += src1->nelem - i1;
772 else if (i2 < src2->nelem)
774 memcpy (dest->elems + id, src2->elems + i2,
775 (src2->nelem - i2) * sizeof (int));
776 id += src2->nelem - i2;
778 dest->nelem = id;
779 return REG_NOERROR;
782 /* Calculate the union set of the sets DEST and SRC. And store it to
783 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
785 static reg_errcode_t
786 re_node_set_merge (dest, src)
787 re_node_set *dest;
788 const re_node_set *src;
790 int si, di;
791 if (src == NULL || src->nelem == 0)
792 return REG_NOERROR;
793 if (dest->alloc < src->nelem + dest->nelem)
795 int *new_buffer;
796 dest->alloc = 2 * (src->nelem + dest->alloc);
797 new_buffer = re_realloc (dest->elems, int, dest->alloc);
798 if (BE (new_buffer == NULL, 0))
799 return REG_ESPACE;
800 dest->elems = new_buffer;
803 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
805 int cp_from, ncp, mid, right, src_elem = src->elems[si];
806 /* Binary search the spot we will add the new element. */
807 right = dest->nelem;
808 while (di < right)
810 mid = (di + right) / 2;
811 if (dest->elems[mid] < src_elem)
812 di = mid + 1;
813 else
814 right = mid;
816 if (di >= dest->nelem)
817 break;
819 if (dest->elems[di] == src_elem)
821 /* Skip since, DEST already has the element. */
822 ++di;
823 ++si;
824 continue;
827 /* Skip the src elements which are less than dest->elems[di]. */
828 cp_from = si;
829 while (si < src->nelem && src->elems[si] < dest->elems[di])
830 ++si;
831 /* Copy these src elements. */
832 ncp = si - cp_from;
833 memmove (dest->elems + di + ncp, dest->elems + di,
834 sizeof (int) * (dest->nelem - di));
835 memcpy (dest->elems + di, src->elems + cp_from,
836 sizeof (int) * ncp);
837 /* Update counters. */
838 di += ncp;
839 dest->nelem += ncp;
842 /* Copy remaining src elements. */
843 if (si < src->nelem)
845 memcpy (dest->elems + di, src->elems + si,
846 sizeof (int) * (src->nelem - si));
847 dest->nelem += src->nelem - si;
849 return REG_NOERROR;
852 /* Insert the new element ELEM to the re_node_set* SET.
853 return 0 if SET already has ELEM,
854 return -1 if an error is occured, return 1 otherwise. */
856 static int
857 re_node_set_insert (set, elem)
858 re_node_set *set;
859 int elem;
861 int idx, right, mid;
862 /* In case of the set is empty. */
863 if (set->elems == NULL || set->alloc == 0)
865 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
866 return 1;
867 else
868 return -1;
871 /* Binary search the spot we will add the new element. */
872 idx = 0;
873 right = set->nelem;
874 while (idx < right)
876 mid = (idx + right) / 2;
877 if (set->elems[mid] < elem)
878 idx = mid + 1;
879 else
880 right = mid;
883 /* Realloc if we need. */
884 if (set->alloc < set->nelem + 1)
886 int *new_array;
887 set->alloc = set->alloc * 2;
888 new_array = re_malloc (int, set->alloc);
889 if (BE (new_array == NULL, 0))
890 return -1;
891 /* Copy the elements they are followed by the new element. */
892 if (idx > 0)
893 memcpy (new_array, set->elems, sizeof (int) * (idx));
894 /* Copy the elements which follows the new element. */
895 if (set->nelem - idx > 0)
896 memcpy (new_array + idx + 1, set->elems + idx,
897 sizeof (int) * (set->nelem - idx));
898 re_free (set->elems);
899 set->elems = new_array;
901 else
903 /* Move the elements which follows the new element. */
904 if (set->nelem - idx > 0)
905 memmove (set->elems + idx + 1, set->elems + idx,
906 sizeof (int) * (set->nelem - idx));
908 /* Insert the new element. */
909 set->elems[idx] = elem;
910 ++set->nelem;
911 return 1;
914 /* Compare two node sets SET1 and SET2.
915 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
917 static int
918 re_node_set_compare (set1, set2)
919 const re_node_set *set1, *set2;
921 int i;
922 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
923 return 0;
924 for (i = 0 ; i < set1->nelem ; i++)
925 if (set1->elems[i] != set2->elems[i])
926 return 0;
927 return 1;
930 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
932 static int
933 re_node_set_contains (set, elem)
934 const re_node_set *set;
935 int elem;
937 int idx, right, mid;
938 if (set->nelem <= 0)
939 return 0;
941 /* Binary search the element. */
942 idx = 0;
943 right = set->nelem - 1;
944 while (idx < right)
946 mid = (idx + right) / 2;
947 if (set->elems[mid] < elem)
948 idx = mid + 1;
949 else
950 right = mid;
952 return set->elems[idx] == elem ? idx + 1 : 0;
955 static void
956 re_node_set_remove_at (set, idx)
957 re_node_set *set;
958 int idx;
960 if (idx < 0 || idx >= set->nelem)
961 return;
962 if (idx < set->nelem - 1)
963 memmove (set->elems + idx, set->elems + idx + 1,
964 sizeof (int) * (set->nelem - idx - 1));
965 --set->nelem;
969 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
970 Or return -1, if an error will be occured. */
972 static int
973 re_dfa_add_node (dfa, token, mode)
974 re_dfa_t *dfa;
975 re_token_t token;
976 int mode;
978 if (dfa->nodes_len >= dfa->nodes_alloc)
980 re_token_t *new_array;
981 dfa->nodes_alloc *= 2;
982 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
983 if (BE (new_array == NULL, 0))
984 return -1;
985 else
986 dfa->nodes = new_array;
987 if (mode)
989 int *new_nexts;
990 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
992 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
993 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
994 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
995 dfa->nodes_alloc);
996 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
997 dfa->nodes_alloc);
998 if (BE (new_nexts == NULL || new_edests == NULL
999 || new_eclosures == NULL || new_inveclosures == NULL, 0))
1000 return -1;
1001 dfa->nexts = new_nexts;
1002 dfa->edests = new_edests;
1003 dfa->eclosures = new_eclosures;
1004 dfa->inveclosures = new_inveclosures;
1007 dfa->nodes[dfa->nodes_len] = token;
1008 dfa->nodes[dfa->nodes_len].duplicated = 0;
1009 dfa->nodes[dfa->nodes_len].constraint = 0;
1010 return dfa->nodes_len++;
1013 static unsigned int inline
1014 calc_state_hash (nodes, context)
1015 const re_node_set *nodes;
1016 unsigned int context;
1018 unsigned int hash = nodes->nelem + context;
1019 int i;
1020 for (i = 0 ; i < nodes->nelem ; i++)
1021 hash += nodes->elems[i];
1022 return hash;
1025 /* Search for the state whose node_set is equivalent to NODES.
1026 Return the pointer to the state, if we found it in the DFA.
1027 Otherwise create the new one and return it. In case of an error
1028 return NULL and set the error code in ERR.
1029 Note: - We assume NULL as the invalid state, then it is possible that
1030 return value is NULL and ERR is REG_NOERROR.
1031 - We never return non-NULL value in case of any errors, it is for
1032 optimization. */
1034 static re_dfastate_t*
1035 re_acquire_state (err, dfa, nodes)
1036 reg_errcode_t *err;
1037 re_dfa_t *dfa;
1038 const re_node_set *nodes;
1040 unsigned int hash;
1041 re_dfastate_t *new_state;
1042 struct re_state_table_entry *spot;
1043 int i;
1044 if (BE (nodes->nelem == 0, 0))
1046 *err = REG_NOERROR;
1047 return NULL;
1049 hash = calc_state_hash (nodes, 0);
1050 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1052 for (i = 0 ; i < spot->num ; i++)
1054 re_dfastate_t *state = spot->array[i];
1055 if (hash != state->hash)
1056 continue;
1057 if (re_node_set_compare (&state->nodes, nodes))
1058 return state;
1061 /* There are no appropriate state in the dfa, create the new one. */
1062 new_state = create_ci_newstate (dfa, nodes, hash);
1063 if (BE (new_state != NULL, 1))
1064 return new_state;
1065 else
1067 *err = REG_ESPACE;
1068 return NULL;
1072 /* Search for the state whose node_set is equivalent to NODES and
1073 whose context is equivalent to CONTEXT.
1074 Return the pointer to the state, if we found it in the DFA.
1075 Otherwise create the new one and return it. In case of an error
1076 return NULL and set the error code in ERR.
1077 Note: - We assume NULL as the invalid state, then it is possible that
1078 return value is NULL and ERR is REG_NOERROR.
1079 - We never return non-NULL value in case of any errors, it is for
1080 optimization. */
1082 static re_dfastate_t*
1083 re_acquire_state_context (err, dfa, nodes, context)
1084 reg_errcode_t *err;
1085 re_dfa_t *dfa;
1086 const re_node_set *nodes;
1087 unsigned int context;
1089 unsigned int hash;
1090 re_dfastate_t *new_state;
1091 struct re_state_table_entry *spot;
1092 int i;
1093 if (nodes->nelem == 0)
1095 *err = REG_NOERROR;
1096 return NULL;
1098 hash = calc_state_hash (nodes, context);
1099 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1101 for (i = 0 ; i < spot->num ; i++)
1103 re_dfastate_t *state = spot->array[i];
1104 if (hash != state->hash)
1105 continue;
1106 if (re_node_set_compare (state->entrance_nodes, nodes)
1107 && state->context == context)
1108 return state;
1110 /* There are no appropriate state in `dfa', create the new one. */
1111 new_state = create_cd_newstate (dfa, nodes, context, hash);
1112 if (BE (new_state != NULL, 1))
1113 return new_state;
1114 else
1116 *err = REG_ESPACE;
1117 return NULL;
1121 /* Allocate memory for DFA state and initialize common properties.
1122 Return the new state if succeeded, otherwise return NULL. */
1124 static re_dfastate_t *
1125 create_newstate_common (dfa, nodes, hash)
1126 re_dfa_t *dfa;
1127 const re_node_set *nodes;
1128 unsigned int hash;
1130 re_dfastate_t *newstate;
1131 reg_errcode_t err;
1132 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1133 if (BE (newstate == NULL, 0))
1134 return NULL;
1135 err = re_node_set_init_copy (&newstate->nodes, nodes);
1136 if (BE (err != REG_NOERROR, 0))
1138 re_free (newstate);
1139 return NULL;
1141 newstate->trtable = NULL;
1142 newstate->trtable_search = NULL;
1143 newstate->hash = hash;
1144 return newstate;
1147 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1148 position. Return value indicate the error code if failed. */
1150 static reg_errcode_t
1151 register_state (dfa, newstate, hash)
1152 re_dfa_t *dfa;
1153 re_dfastate_t *newstate;
1154 unsigned int hash;
1156 struct re_state_table_entry *spot;
1157 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1159 if (spot->alloc <= spot->num)
1161 re_dfastate_t **new_array;
1162 spot->alloc = 2 * spot->num + 2;
1163 new_array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1164 if (BE (new_array == NULL, 0))
1165 return REG_ESPACE;
1166 spot->array = new_array;
1168 spot->array[spot->num++] = newstate;
1169 return REG_NOERROR;
1172 /* Create the new state which is independ of contexts.
1173 Return the new state if succeeded, otherwise return NULL. */
1175 static re_dfastate_t *
1176 create_ci_newstate (dfa, nodes, hash)
1177 re_dfa_t *dfa;
1178 const re_node_set *nodes;
1179 unsigned int hash;
1181 int i;
1182 reg_errcode_t err;
1183 re_dfastate_t *newstate;
1184 newstate = create_newstate_common (dfa, nodes, hash);
1185 if (BE (newstate == NULL, 0))
1186 return NULL;
1187 newstate->entrance_nodes = &newstate->nodes;
1189 for (i = 0 ; i < nodes->nelem ; i++)
1191 re_token_t *node = dfa->nodes + nodes->elems[i];
1192 re_token_type_t type = node->type;
1193 if (type == CHARACTER && !node->constraint)
1194 continue;
1196 /* If the state has the halt node, the state is a halt state. */
1197 else if (type == END_OF_RE)
1198 newstate->halt = 1;
1199 #ifdef RE_ENABLE_I18N
1200 else if (type == COMPLEX_BRACKET
1201 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1202 newstate->accept_mb = 1;
1203 #endif /* RE_ENABLE_I18N */
1204 else if (type == OP_BACK_REF)
1205 newstate->has_backref = 1;
1206 else if (type == ANCHOR || node->constraint)
1207 newstate->has_constraint = 1;
1209 err = register_state (dfa, newstate, hash);
1210 if (BE (err != REG_NOERROR, 0))
1212 free_state (newstate);
1213 newstate = NULL;
1215 return newstate;
1218 /* Create the new state which is depend on the context CONTEXT.
1219 Return the new state if succeeded, otherwise return NULL. */
1221 static re_dfastate_t *
1222 create_cd_newstate (dfa, nodes, context, hash)
1223 re_dfa_t *dfa;
1224 const re_node_set *nodes;
1225 unsigned int context, hash;
1227 int i, nctx_nodes = 0;
1228 reg_errcode_t err;
1229 re_dfastate_t *newstate;
1231 newstate = create_newstate_common (dfa, nodes, hash);
1232 if (BE (newstate == NULL, 0))
1233 return NULL;
1234 newstate->context = context;
1235 newstate->entrance_nodes = &newstate->nodes;
1237 for (i = 0 ; i < nodes->nelem ; i++)
1239 unsigned int constraint = 0;
1240 re_token_t *node = dfa->nodes + nodes->elems[i];
1241 re_token_type_t type = node->type;
1242 if (node->constraint)
1243 constraint = node->constraint;
1245 if (type == CHARACTER && !constraint)
1246 continue;
1247 /* If the state has the halt node, the state is a halt state. */
1248 else if (type == END_OF_RE)
1249 newstate->halt = 1;
1250 #ifdef RE_ENABLE_I18N
1251 else if (type == COMPLEX_BRACKET
1252 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1253 newstate->accept_mb = 1;
1254 #endif /* RE_ENABLE_I18N */
1255 else if (type == OP_BACK_REF)
1256 newstate->has_backref = 1;
1257 else if (type == ANCHOR)
1258 constraint = node->opr.ctx_type;
1260 if (constraint)
1262 if (newstate->entrance_nodes == &newstate->nodes)
1264 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1265 if (BE (newstate->entrance_nodes == NULL, 0))
1267 free_state (newstate);
1268 return NULL;
1270 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1271 nctx_nodes = 0;
1272 newstate->has_constraint = 1;
1275 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1277 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1278 ++nctx_nodes;
1282 err = register_state (dfa, newstate, hash);
1283 if (BE (err != REG_NOERROR, 0))
1285 free_state (newstate);
1286 newstate = NULL;
1288 return newstate;
1291 static void
1292 free_state (state)
1293 re_dfastate_t *state;
1295 if (state->entrance_nodes != &state->nodes)
1297 re_node_set_free (state->entrance_nodes);
1298 re_free (state->entrance_nodes);
1300 re_node_set_free (&state->nodes);
1301 re_free (state->trtable);
1302 re_free (state->trtable_search);
1303 re_free (state);