Define macro DO_ELF_MACHINE_REL_RELATIVE for 'elf_machine_rel_relative'. (elf_dynamic...
[glibc.git] / posix / regex_internal.c
blob116543a6da76a945145805c0e19a5f34e26727d4
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #if defined HAVE_WCHAR_H || defined _LIBC
29 # include <wchar.h>
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
32 # include <wctype.h>
33 #endif /* HAVE_WCTYPE_H || _LIBC */
35 #ifdef _LIBC
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
41 # endif
42 #endif
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
46 # include <libintl.h>
47 # ifdef _LIBC
48 # undef gettext
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
51 # endif
52 #else
53 # define gettext(msgid) (msgid)
54 #endif
56 #ifndef gettext_noop
57 /* This define is so xgettext can find the internationalizable
58 strings. */
59 # define gettext_noop(String) String
60 #endif
62 #include "regex.h"
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str, int len,
66 re_string_t *pstr,
67 RE_TRANSLATE_TYPE trans, int icase);
68 #ifdef RE_ENABLE_I18N
69 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx);
70 #endif /* RE_ENABLE_I18N */
71 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
72 const re_node_set *nodes,
73 unsigned int hash);
74 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
75 unsigned int hash);
76 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
77 const re_node_set *nodes,
78 unsigned int hash);
79 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
80 const re_node_set *nodes,
81 unsigned int context,
82 unsigned int hash);
83 static unsigned int inline calc_state_hash (const re_node_set *nodes,
84 unsigned int context);
86 /* Functions for string operation. */
88 /* This function allocate the buffers. It is necessary to call
89 re_string_reconstruct before using the object. */
91 static reg_errcode_t
92 re_string_allocate (pstr, str, len, init_len, trans, icase)
93 re_string_t *pstr;
94 const char *str;
95 int len, init_len, icase;
96 RE_TRANSLATE_TYPE trans;
98 reg_errcode_t ret;
99 int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
100 re_string_construct_common (str, len, pstr, trans, icase);
101 pstr->stop = pstr->len;
103 ret = re_string_realloc_buffers (pstr, init_buf_len);
104 if (BE (ret != REG_NOERROR, 0))
105 return ret;
107 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
108 : (unsigned char *) str);
109 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
110 pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
111 || MB_CUR_MAX > 1) ? pstr->valid_len : len;
112 return REG_NOERROR;
115 /* This function allocate the buffers, and initialize them. */
117 static reg_errcode_t
118 re_string_construct (pstr, str, len, trans, icase)
119 re_string_t *pstr;
120 const char *str;
121 int len, icase;
122 RE_TRANSLATE_TYPE trans;
124 reg_errcode_t ret;
125 re_string_construct_common (str, len, pstr, trans, icase);
126 pstr->stop = pstr->len;
127 /* Set 0 so that this function can initialize whole buffers. */
128 pstr->valid_len = 0;
130 if (len > 0)
132 ret = re_string_realloc_buffers (pstr, len + 1);
133 if (BE (ret != REG_NOERROR, 0))
134 return ret;
136 pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
137 : (unsigned char *) str);
138 pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
140 if (icase)
142 #ifdef RE_ENABLE_I18N
143 if (MB_CUR_MAX > 1)
144 build_wcs_upper_buffer (pstr);
145 else
146 #endif /* RE_ENABLE_I18N */
147 build_upper_buffer (pstr);
149 else
151 #ifdef RE_ENABLE_I18N
152 if (MB_CUR_MAX > 1)
153 build_wcs_buffer (pstr);
154 else
155 #endif /* RE_ENABLE_I18N */
157 if (trans != NULL)
158 re_string_translate_buffer (pstr);
159 else
160 pstr->valid_len = len;
164 /* Initialized whole buffers, then valid_len == bufs_len. */
165 pstr->valid_len = pstr->bufs_len;
166 return REG_NOERROR;
169 /* Helper functions for re_string_allocate, and re_string_construct. */
171 static reg_errcode_t
172 re_string_realloc_buffers (pstr, new_buf_len)
173 re_string_t *pstr;
174 int new_buf_len;
176 #ifdef RE_ENABLE_I18N
177 if (MB_CUR_MAX > 1)
179 pstr->wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
180 if (BE (pstr->wcs == NULL, 0))
181 return REG_ESPACE;
183 #endif /* RE_ENABLE_I18N */
184 if (MBS_ALLOCATED (pstr))
186 pstr->mbs = re_realloc (pstr->mbs, unsigned char, new_buf_len);
187 if (BE (pstr->mbs == NULL, 0))
188 return REG_ESPACE;
190 if (MBS_CASE_ALLOCATED (pstr))
192 pstr->mbs_case = re_realloc (pstr->mbs_case, unsigned char, new_buf_len);
193 if (BE (pstr->mbs_case == NULL, 0))
194 return REG_ESPACE;
195 if (!MBS_ALLOCATED (pstr))
196 pstr->mbs = pstr->mbs_case;
198 pstr->bufs_len = new_buf_len;
199 return REG_NOERROR;
203 static void
204 re_string_construct_common (str, len, pstr, trans, icase)
205 const char *str;
206 int len;
207 re_string_t *pstr;
208 RE_TRANSLATE_TYPE trans;
209 int icase;
211 memset (pstr, '\0', sizeof (re_string_t));
212 pstr->raw_mbs = (const unsigned char *) str;
213 pstr->len = len;
214 pstr->trans = trans;
215 pstr->icase = icase ? 1 : 0;
218 #ifdef RE_ENABLE_I18N
220 /* Build wide character buffer PSTR->WCS.
221 If the byte sequence of the string are:
222 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
223 Then wide character buffer will be:
224 <wc1> , WEOF , <wc2> , WEOF , <wc3>
225 We use WEOF for padding, they indicate that the position isn't
226 a first byte of a multibyte character.
228 Note that this function assumes PSTR->VALID_LEN elements are already
229 built and starts from PSTR->VALID_LEN. */
231 static void
232 build_wcs_buffer (pstr)
233 re_string_t *pstr;
235 mbstate_t prev_st;
236 int byte_idx, end_idx, mbclen, remain_len;
237 /* Build the buffers from pstr->valid_len to either pstr->len or
238 pstr->bufs_len. */
239 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
240 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
242 wchar_t wc;
243 remain_len = end_idx - byte_idx;
244 prev_st = pstr->cur_state;
245 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
246 + byte_idx), remain_len, &pstr->cur_state);
247 if (BE (mbclen == (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr->cur_state = prev_st;
251 break;
253 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
255 /* We treat these cases as a singlebyte character. */
256 mbclen = 1;
257 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
258 pstr->cur_state = prev_st;
261 /* Apply the translateion if we need. */
262 if (pstr->trans != NULL && mbclen == 1)
264 int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
265 pstr->mbs_case[byte_idx] = ch;
267 /* Write wide character and padding. */
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 pstr->wcs[byte_idx++] = WEOF;
273 pstr->valid_len = byte_idx;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
279 static void
280 build_wcs_upper_buffer (pstr)
281 re_string_t *pstr;
283 mbstate_t prev_st;
284 int byte_idx, end_idx, mbclen, remain_len;
285 /* Build the buffers from pstr->valid_len to either pstr->len or
286 pstr->bufs_len. */
287 end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
288 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
290 wchar_t wc;
291 remain_len = end_idx - byte_idx;
292 prev_st = pstr->cur_state;
293 mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
294 + byte_idx), remain_len, &pstr->cur_state);
295 if (BE (mbclen == (size_t) -2, 0))
297 /* The buffer doesn't have enough space, finish to build. */
298 pstr->cur_state = prev_st;
299 break;
301 else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
303 /* In case of a singlebyte character. */
304 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
305 /* Apply the translateion if we need. */
306 if (pstr->trans != NULL && mbclen == 1)
308 ch = pstr->trans[ch];
309 pstr->mbs_case[byte_idx] = ch;
311 pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
312 pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
313 if (BE (mbclen == (size_t) -1, 0))
314 pstr->cur_state = prev_st;
316 else /* mbclen > 1 */
318 if (iswlower (wc))
319 wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
320 else
321 memcpy (pstr->mbs + byte_idx,
322 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
323 pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
324 /* Write paddings. */
325 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
326 pstr->wcs[byte_idx++] = WEOF;
329 pstr->valid_len = byte_idx;
332 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
333 Return the index. */
335 static int
336 re_string_skip_chars (pstr, new_raw_idx)
337 re_string_t *pstr;
338 int new_raw_idx;
340 mbstate_t prev_st;
341 int rawbuf_idx, mbclen;
343 /* Skip the characters which are not necessary to check. */
344 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
345 rawbuf_idx < new_raw_idx;)
347 int remain_len = pstr->len - rawbuf_idx;
348 prev_st = pstr->cur_state;
349 mbclen = mbrlen ((const char *) pstr->raw_mbs + rawbuf_idx, remain_len,
350 &pstr->cur_state);
351 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
353 /* We treat these cases as a singlebyte character. */
354 mbclen = 1;
355 pstr->cur_state = prev_st;
357 /* Then proceed the next character. */
358 rawbuf_idx += mbclen;
360 return rawbuf_idx;
362 #endif /* RE_ENABLE_I18N */
364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
365 This function is used in case of REG_ICASE. */
367 static void
368 build_upper_buffer (pstr)
369 re_string_t *pstr;
371 int char_idx, end_idx;
372 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
374 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
376 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
377 if (pstr->trans != NULL)
379 ch = pstr->trans[ch];
380 pstr->mbs_case[char_idx] = ch;
382 if (islower (ch))
383 pstr->mbs[char_idx] = toupper (ch);
384 else
385 pstr->mbs[char_idx] = ch;
387 pstr->valid_len = char_idx;
390 /* Apply TRANS to the buffer in PSTR. */
392 static void
393 re_string_translate_buffer (pstr)
394 re_string_t *pstr;
396 int buf_idx, end_idx;
397 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
399 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
401 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
402 pstr->mbs_case[buf_idx] = pstr->trans[ch];
405 pstr->valid_len = buf_idx;
408 /* This function re-construct the buffers.
409 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
410 convert to upper case in case of REG_ICASE, apply translation. */
412 static reg_errcode_t
413 re_string_reconstruct (pstr, idx, eflags, newline)
414 re_string_t *pstr;
415 int idx, eflags, newline;
417 int offset = idx - pstr->raw_mbs_idx;
418 if (offset < 0)
420 /* Reset buffer. */
421 #ifdef RE_ENABLE_I18N
422 if (MB_CUR_MAX > 1)
423 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
424 #endif /* RE_ENABLE_I18N */
425 pstr->valid_len = pstr->raw_mbs_idx = 0;
426 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
427 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
428 if (!MBS_CASE_ALLOCATED (pstr))
429 pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
430 if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
431 pstr->mbs = (unsigned char *) pstr->raw_mbs;
432 offset = idx;
435 if (offset != 0)
437 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
438 newline);
439 /* Are the characters which are already checked remain? */
440 if (offset < pstr->valid_len)
442 /* Yes, move them to the front of the buffer. */
443 #ifdef RE_ENABLE_I18N
444 if (MB_CUR_MAX > 1)
445 memmove (pstr->wcs, pstr->wcs + offset,
446 (pstr->valid_len - offset) * sizeof (wint_t));
447 #endif /* RE_ENABLE_I18N */
448 if (MBS_ALLOCATED (pstr))
449 memmove (pstr->mbs, pstr->mbs + offset,
450 pstr->valid_len - offset);
451 if (MBS_CASE_ALLOCATED (pstr))
452 memmove (pstr->mbs_case, pstr->mbs_case + offset,
453 pstr->valid_len - offset);
454 pstr->valid_len -= offset;
455 #if DEBUG
456 assert (pstr->valid_len > 0);
457 #endif
459 else
461 /* No, skip all characters until IDX. */
462 pstr->valid_len = 0;
463 #ifdef RE_ENABLE_I18N
464 if (MB_CUR_MAX > 1)
466 int wcs_idx;
467 pstr->valid_len = re_string_skip_chars (pstr, idx) - idx;
468 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
469 pstr->wcs[wcs_idx] = WEOF;
471 #endif /* RE_ENABLE_I18N */
473 if (!MBS_CASE_ALLOCATED (pstr))
475 pstr->mbs_case += offset;
476 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
477 if (!MBS_ALLOCATED (pstr))
478 pstr->mbs += offset;
481 pstr->raw_mbs_idx = idx;
482 pstr->len -= offset;
483 pstr->stop -= offset;
485 /* Then build the buffers. */
486 #ifdef RE_ENABLE_I18N
487 if (MB_CUR_MAX > 1)
489 if (pstr->icase)
490 build_wcs_upper_buffer (pstr);
491 else
492 build_wcs_buffer (pstr);
494 else
495 #endif /* RE_ENABLE_I18N */
497 if (pstr->icase)
498 build_upper_buffer (pstr);
499 else if (pstr->trans != NULL)
500 re_string_translate_buffer (pstr);
502 pstr->cur_idx = 0;
504 return REG_NOERROR;
507 static void
508 re_string_destruct (pstr)
509 re_string_t *pstr;
511 #ifdef RE_ENABLE_I18N
512 re_free (pstr->wcs);
513 #endif /* RE_ENABLE_I18N */
514 if (MBS_ALLOCATED (pstr))
515 re_free (pstr->mbs);
516 if (MBS_CASE_ALLOCATED (pstr))
517 re_free (pstr->mbs_case);
520 /* Return the context at IDX in INPUT. */
522 static unsigned int
523 re_string_context_at (input, idx, eflags, newline_anchor)
524 const re_string_t *input;
525 int idx, eflags, newline_anchor;
527 int c;
528 if (idx < 0 || idx == input->len)
530 if (idx < 0)
531 /* In this case, we use the value stored in input->tip_context,
532 since we can't know the character in input->mbs[-1] here. */
533 return input->tip_context;
534 else /* (idx == input->len) */
535 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
536 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
538 c = re_string_byte_at (input, idx);
539 if (IS_WORD_CHAR (c))
540 return CONTEXT_WORD;
541 return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
544 /* Functions for set operation. */
546 static reg_errcode_t
547 re_node_set_alloc (set, size)
548 re_node_set *set;
549 int size;
551 set->alloc = size;
552 set->nelem = 0;
553 set->elems = re_malloc (int, size);
554 if (BE (set->elems == NULL, 0))
555 return REG_ESPACE;
556 return REG_NOERROR;
559 static reg_errcode_t
560 re_node_set_init_1 (set, elem)
561 re_node_set *set;
562 int elem;
564 set->alloc = 1;
565 set->nelem = 1;
566 set->elems = re_malloc (int, 1);
567 if (BE (set->elems == NULL, 0))
568 return REG_ESPACE;
569 set->elems[0] = elem;
570 return REG_NOERROR;
573 static reg_errcode_t
574 re_node_set_init_2 (set, elem1, elem2)
575 re_node_set *set;
576 int elem1, elem2;
578 set->alloc = 2;
579 set->elems = re_malloc (int, 2);
580 if (BE (set->elems == NULL, 0))
581 return REG_ESPACE;
582 if (elem1 == elem2)
584 set->nelem = 1;
585 set->elems[0] = elem1;
587 else
589 set->nelem = 2;
590 if (elem1 < elem2)
592 set->elems[0] = elem1;
593 set->elems[1] = elem2;
595 else
597 set->elems[0] = elem2;
598 set->elems[1] = elem1;
601 return REG_NOERROR;
604 static reg_errcode_t
605 re_node_set_init_copy (dest, src)
606 re_node_set *dest;
607 const re_node_set *src;
609 dest->nelem = src->nelem;
610 if (src->nelem > 0)
612 dest->alloc = dest->nelem;
613 dest->elems = re_malloc (int, dest->alloc);
614 if (BE (dest->elems == NULL, 0))
615 return REG_ESPACE;
616 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
618 else
619 re_node_set_init_empty (dest);
620 return REG_NOERROR;
623 /* Calculate the intersection of the sets SRC1 and SRC2. And store it in
624 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
625 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
627 static reg_errcode_t
628 re_node_set_intersect (dest, src1, src2)
629 re_node_set *dest;
630 const re_node_set *src1, *src2;
632 int i1, i2, id;
633 if (src1->nelem > 0 && src2->nelem > 0)
635 if (src1->nelem + src2->nelem > dest->alloc)
637 dest->alloc = src1->nelem + src2->nelem;
638 dest->elems = re_realloc (dest->elems, int, dest->alloc);
639 if (BE (dest->elems == NULL, 0))
640 return REG_ESPACE;
643 else
645 /* The intersection of empty sets is also empty set. */
646 dest->nelem = 0;
647 return REG_NOERROR;
650 for (i1 = i2 = id = 0; i1 < src1->nelem && i2 < src2->nelem; )
652 if (src1->elems[i1] > src2->elems[i2])
654 ++i2;
655 continue;
657 /* The intersection must have the elements which are in both of
658 SRC1 and SRC2. */
659 if (src1->elems[i1] == src2->elems[i2])
660 dest->elems[id++] = src2->elems[i2++];
661 ++i1;
663 dest->nelem = id;
664 return REG_NOERROR;
667 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
668 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
669 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
671 static reg_errcode_t
672 re_node_set_add_intersect (dest, src1, src2)
673 re_node_set *dest;
674 const re_node_set *src1, *src2;
676 int i1, i2, id;
677 if (src1->nelem > 0 && src2->nelem > 0)
679 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
681 dest->alloc = src1->nelem + src2->nelem + dest->nelem;
682 dest->elems = re_realloc (dest->elems, int, dest->alloc);
683 if (BE (dest->elems == NULL, 0))
684 return REG_ESPACE;
687 else
688 return REG_NOERROR;
690 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
692 if (src1->elems[i1] > src2->elems[i2])
694 ++i2;
695 continue;
697 if (src1->elems[i1] == src2->elems[i2])
699 while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
700 ++id;
701 if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
702 ++id;
703 else
705 memmove (dest->elems + id + 1, dest->elems + id,
706 sizeof (int) * (dest->nelem - id));
707 dest->elems[id++] = src2->elems[i2++];
708 ++dest->nelem;
711 ++i1;
713 return REG_NOERROR;
716 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
717 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
719 static reg_errcode_t
720 re_node_set_init_union (dest, src1, src2)
721 re_node_set *dest;
722 const re_node_set *src1, *src2;
724 int i1, i2, id;
725 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
727 dest->alloc = src1->nelem + src2->nelem;
728 dest->elems = re_malloc (int, dest->alloc);
729 if (BE (dest->elems == NULL, 0))
730 return REG_ESPACE;
732 else
734 if (src1 != NULL && src1->nelem > 0)
735 return re_node_set_init_copy (dest, src1);
736 else if (src2 != NULL && src2->nelem > 0)
737 return re_node_set_init_copy (dest, src2);
738 else
739 re_node_set_init_empty (dest);
740 return REG_NOERROR;
742 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
744 if (src1->elems[i1] > src2->elems[i2])
746 dest->elems[id++] = src2->elems[i2++];
747 continue;
749 if (src1->elems[i1] == src2->elems[i2])
750 ++i2;
751 dest->elems[id++] = src1->elems[i1++];
753 if (i1 < src1->nelem)
755 memcpy (dest->elems + id, src1->elems + i1,
756 (src1->nelem - i1) * sizeof (int));
757 id += src1->nelem - i1;
759 else if (i2 < src2->nelem)
761 memcpy (dest->elems + id, src2->elems + i2,
762 (src2->nelem - i2) * sizeof (int));
763 id += src2->nelem - i2;
765 dest->nelem = id;
766 return REG_NOERROR;
769 /* Calculate the union set of the sets DEST and SRC. And store it to
770 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
772 static reg_errcode_t
773 re_node_set_merge (dest, src)
774 re_node_set *dest;
775 const re_node_set *src;
777 int si, di;
778 if (src == NULL || src->nelem == 0)
779 return REG_NOERROR;
780 if (dest->alloc < src->nelem + dest->nelem)
782 dest->alloc = 2 * (src->nelem + dest->alloc);
783 dest->elems = re_realloc (dest->elems, int, dest->alloc);
784 if (BE (dest->elems == NULL, 0))
785 return REG_ESPACE;
788 for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
790 int cp_from, ncp, mid, right, src_elem = src->elems[si];
791 /* Binary search the spot we will add the new element. */
792 right = dest->nelem;
793 while (di < right)
795 mid = (di + right) / 2;
796 if (dest->elems[mid] < src_elem)
797 di = mid + 1;
798 else
799 right = mid;
801 if (di >= dest->nelem)
802 break;
804 if (dest->elems[di] == src_elem)
806 /* Skip since, DEST already has the element. */
807 ++di;
808 ++si;
809 continue;
812 /* Skip the src elements which are less than dest->elems[di]. */
813 cp_from = si;
814 while (si < src->nelem && src->elems[si] < dest->elems[di])
815 ++si;
816 /* Copy these src elements. */
817 ncp = si - cp_from;
818 memmove (dest->elems + di + ncp, dest->elems + di,
819 sizeof (int) * (dest->nelem - di));
820 memcpy (dest->elems + di, src->elems + cp_from,
821 sizeof (int) * ncp);
822 /* Update counters. */
823 di += ncp;
824 dest->nelem += ncp;
827 /* Copy remaining src elements. */
828 if (si < src->nelem)
830 memcpy (dest->elems + di, src->elems + si,
831 sizeof (int) * (src->nelem - si));
832 dest->nelem += src->nelem - si;
834 return REG_NOERROR;
837 /* Insert the new element ELEM to the re_node_set* SET.
838 return 0 if SET already has ELEM,
839 return -1 if an error is occured, return 1 otherwise. */
841 static int
842 re_node_set_insert (set, elem)
843 re_node_set *set;
844 int elem;
846 int idx, right, mid;
847 /* In case of the set is empty. */
848 if (set->elems == NULL || set->alloc == 0)
850 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
851 return 1;
852 else
853 return -1;
856 /* Binary search the spot we will add the new element. */
857 idx = 0;
858 right = set->nelem;
859 while (idx < right)
861 mid = (idx + right) / 2;
862 if (set->elems[mid] < elem)
863 idx = mid + 1;
864 else
865 right = mid;
868 /* Realloc if we need. */
869 if (set->alloc < set->nelem + 1)
871 int *new_array;
872 set->alloc = set->alloc * 2;
873 new_array = re_malloc (int, set->alloc);
874 if (BE (new_array == NULL, 0))
875 return -1;
876 /* Copy the elements they are followed by the new element. */
877 if (idx > 0)
878 memcpy (new_array, set->elems, sizeof (int) * (idx));
879 /* Copy the elements which follows the new element. */
880 if (set->nelem - idx > 0)
881 memcpy (new_array + idx + 1, set->elems + idx,
882 sizeof (int) * (set->nelem - idx));
883 re_free (set->elems);
884 set->elems = new_array;
886 else
888 /* Move the elements which follows the new element. */
889 if (set->nelem - idx > 0)
890 memmove (set->elems + idx + 1, set->elems + idx,
891 sizeof (int) * (set->nelem - idx));
893 /* Insert the new element. */
894 set->elems[idx] = elem;
895 ++set->nelem;
896 return 1;
899 /* Compare two node sets SET1 and SET2.
900 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
902 static int
903 re_node_set_compare (set1, set2)
904 const re_node_set *set1, *set2;
906 int i;
907 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
908 return 0;
909 for (i = 0 ; i < set1->nelem ; i++)
910 if (set1->elems[i] != set2->elems[i])
911 return 0;
912 return 1;
915 /* Return 1 if SET contains the element ELEM, return 0 otherwise. */
917 static int
918 re_node_set_contains (set, elem)
919 const re_node_set *set;
920 int elem;
922 int idx, right, mid;
923 if (set->nelem <= 0)
924 return 0;
926 /* Binary search the element. */
927 idx = 0;
928 right = set->nelem - 1;
929 while (idx < right)
931 mid = (idx + right) / 2;
932 if (set->elems[mid] < elem)
933 idx = mid + 1;
934 else
935 right = mid;
937 return set->elems[idx] == elem;
940 static void
941 re_node_set_remove_at (set, idx)
942 re_node_set *set;
943 int idx;
945 if (idx < 0 || idx >= set->nelem)
946 return;
947 if (idx < set->nelem - 1)
948 memmove (set->elems + idx, set->elems + idx + 1,
949 sizeof (int) * (set->nelem - idx - 1));
950 --set->nelem;
954 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
955 Or return -1, if an error will be occured. */
957 static int
958 re_dfa_add_node (dfa, token, mode)
959 re_dfa_t *dfa;
960 re_token_t token;
961 int mode;
963 if (dfa->nodes_len >= dfa->nodes_alloc)
965 re_token_t *new_array;
966 dfa->nodes_alloc *= 2;
967 new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
968 if (BE (new_array == NULL, 0))
969 return -1;
970 else
971 dfa->nodes = new_array;
972 if (mode)
974 int *new_firsts, *new_nexts;
975 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
977 new_firsts = re_realloc (dfa->firsts, int, dfa->nodes_alloc);
978 new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
979 new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
980 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
981 dfa->nodes_alloc);
982 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
983 dfa->nodes_alloc);
984 if (BE (new_firsts == NULL || new_nexts == NULL || new_edests == NULL
985 || new_eclosures == NULL || new_inveclosures == NULL, 0))
986 return -1;
987 dfa->firsts = new_firsts;
988 dfa->nexts = new_nexts;
989 dfa->edests = new_edests;
990 dfa->eclosures = new_eclosures;
991 dfa->inveclosures = new_inveclosures;
994 dfa->nodes[dfa->nodes_len] = token;
995 dfa->nodes[dfa->nodes_len].duplicated = 0;
996 return dfa->nodes_len++;
999 static unsigned int inline
1000 calc_state_hash (nodes, context)
1001 const re_node_set *nodes;
1002 unsigned int context;
1004 unsigned int hash = nodes->nelem + context;
1005 int i;
1006 for (i = 0 ; i < nodes->nelem ; i++)
1007 hash += nodes->elems[i];
1008 return hash;
1011 /* Search for the state whose node_set is equivalent to NODES.
1012 Return the pointer to the state, if we found it in the DFA.
1013 Otherwise create the new one and return it. In case of an error
1014 return NULL and set the error code in ERR.
1015 Note: - We assume NULL as the invalid state, then it is possible that
1016 return value is NULL and ERR is REG_NOERROR.
1017 - We never return non-NULL value in case of any errors, it is for
1018 optimization. */
1020 static re_dfastate_t*
1021 re_acquire_state (err, dfa, nodes)
1022 reg_errcode_t *err;
1023 re_dfa_t *dfa;
1024 const re_node_set *nodes;
1026 unsigned int hash;
1027 re_dfastate_t *new_state;
1028 struct re_state_table_entry *spot;
1029 int i;
1030 if (BE (nodes->nelem == 0, 0))
1032 *err = REG_NOERROR;
1033 return NULL;
1035 hash = calc_state_hash (nodes, 0);
1036 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1038 for (i = 0 ; i < spot->num ; i++)
1040 re_dfastate_t *state = spot->array[i];
1041 if (hash != state->hash)
1042 continue;
1043 if (re_node_set_compare (&state->nodes, nodes))
1044 return state;
1047 /* There are no appropriate state in the dfa, create the new one. */
1048 new_state = create_ci_newstate (dfa, nodes, hash);
1049 if (BE (new_state != NULL, 1))
1050 return new_state;
1051 else
1053 *err = REG_ESPACE;
1054 return NULL;
1058 /* Search for the state whose node_set is equivalent to NODES and
1059 whose context is equivalent to CONTEXT.
1060 Return the pointer to the state, if we found it in the DFA.
1061 Otherwise create the new one and return it. In case of an error
1062 return NULL and set the error code in ERR.
1063 Note: - We assume NULL as the invalid state, then it is possible that
1064 return value is NULL and ERR is REG_NOERROR.
1065 - We never return non-NULL value in case of any errors, it is for
1066 optimization. */
1068 static re_dfastate_t*
1069 re_acquire_state_context (err, dfa, nodes, context)
1070 reg_errcode_t *err;
1071 re_dfa_t *dfa;
1072 const re_node_set *nodes;
1073 unsigned int context;
1075 unsigned int hash;
1076 re_dfastate_t *new_state;
1077 struct re_state_table_entry *spot;
1078 int i;
1079 if (nodes->nelem == 0)
1081 *err = REG_NOERROR;
1082 return NULL;
1084 hash = calc_state_hash (nodes, context);
1085 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1087 for (i = 0 ; i < spot->num ; i++)
1089 re_dfastate_t *state = spot->array[i];
1090 if (hash != state->hash)
1091 continue;
1092 if (re_node_set_compare (state->entrance_nodes, nodes)
1093 && state->context == context)
1094 return state;
1096 /* There are no appropriate state in `dfa', create the new one. */
1097 new_state = create_cd_newstate (dfa, nodes, context, hash);
1098 if (BE (new_state != NULL, 1))
1099 return new_state;
1100 else
1102 *err = REG_ESPACE;
1103 return NULL;
1107 /* Allocate memory for DFA state and initialize common properties.
1108 Return the new state if succeeded, otherwise return NULL. */
1110 static re_dfastate_t *
1111 create_newstate_common (dfa, nodes, hash)
1112 re_dfa_t *dfa;
1113 const re_node_set *nodes;
1114 unsigned int hash;
1116 re_dfastate_t *newstate;
1117 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1118 if (BE (newstate == NULL, 0))
1119 return NULL;
1120 re_node_set_init_copy (&newstate->nodes, nodes);
1121 newstate->trtable = NULL;
1122 newstate->trtable_search = NULL;
1123 newstate->hash = hash;
1124 return newstate;
1127 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1128 position. Return value indicate the error code if failed. */
1130 static reg_errcode_t
1131 register_state (dfa, newstate, hash)
1132 re_dfa_t *dfa;
1133 re_dfastate_t *newstate;
1134 unsigned int hash;
1136 struct re_state_table_entry *spot;
1137 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1139 if (spot->alloc <= spot->num)
1141 spot->alloc = 2 * spot->num + 2;
1142 spot->array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1143 if (BE (spot->array == NULL, 0))
1144 return REG_ESPACE;
1146 spot->array[spot->num++] = newstate;
1147 return REG_NOERROR;
1150 /* Create the new state which is independ of contexts.
1151 Return the new state if succeeded, otherwise return NULL. */
1153 static re_dfastate_t *
1154 create_ci_newstate (dfa, nodes, hash)
1155 re_dfa_t *dfa;
1156 const re_node_set *nodes;
1157 unsigned int hash;
1159 int i;
1160 reg_errcode_t err;
1161 re_dfastate_t *newstate;
1162 newstate = create_newstate_common (dfa, nodes, hash);
1163 if (BE (newstate == NULL, 0))
1164 return NULL;
1165 newstate->entrance_nodes = &newstate->nodes;
1167 for (i = 0 ; i < nodes->nelem ; i++)
1169 re_token_t *node = dfa->nodes + nodes->elems[i];
1170 re_token_type_t type = node->type;
1171 if (type == CHARACTER)
1172 continue;
1174 /* If the state has the halt node, the state is a halt state. */
1175 else if (type == END_OF_RE)
1176 newstate->halt = 1;
1177 #ifdef RE_ENABLE_I18N
1178 else if (type == COMPLEX_BRACKET
1179 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1180 newstate->accept_mb = 1;
1181 #endif /* RE_ENABLE_I18N */
1182 else if (type == OP_BACK_REF)
1183 newstate->has_backref = 1;
1184 else if (type == ANCHOR || OP_CONTEXT_NODE)
1186 newstate->has_constraint = 1;
1187 if (type == OP_CONTEXT_NODE
1188 && dfa->nodes[node->opr.ctx_info->entity].type == END_OF_RE)
1189 newstate->halt = 1;
1192 err = register_state (dfa, newstate, hash);
1193 return (err != REG_NOERROR) ? NULL : newstate;
1196 /* Create the new state which is depend on the context CONTEXT.
1197 Return the new state if succeeded, otherwise return NULL. */
1199 static re_dfastate_t *
1200 create_cd_newstate (dfa, nodes, context, hash)
1201 re_dfa_t *dfa;
1202 const re_node_set *nodes;
1203 unsigned int context, hash;
1205 int i, nctx_nodes = 0;
1206 reg_errcode_t err;
1207 re_dfastate_t *newstate;
1209 newstate = create_newstate_common (dfa, nodes, hash);
1210 if (BE (newstate == NULL, 0))
1211 return NULL;
1212 newstate->context = context;
1213 newstate->entrance_nodes = &newstate->nodes;
1215 for (i = 0 ; i < nodes->nelem ; i++)
1217 unsigned int constraint = 0;
1218 re_token_t *node = dfa->nodes + nodes->elems[i];
1219 re_token_type_t type = node->type;
1220 if (type == CHARACTER)
1221 continue;
1223 /* If the state has the halt node, the state is a halt state. */
1224 else if (type == END_OF_RE)
1225 newstate->halt = 1;
1226 #ifdef RE_ENABLE_I18N
1227 else if (type == COMPLEX_BRACKET
1228 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1229 newstate->accept_mb = 1;
1230 #endif /* RE_ENABLE_I18N */
1231 else if (type == OP_BACK_REF)
1232 newstate->has_backref = 1;
1233 else if (type == ANCHOR)
1234 constraint = node->opr.ctx_type;
1235 else if (type == OP_CONTEXT_NODE)
1237 re_token_type_t ctype = dfa->nodes[node->opr.ctx_info->entity].type;
1238 constraint = node->constraint;
1239 if (ctype == END_OF_RE)
1240 newstate->halt = 1;
1241 else if (ctype == OP_BACK_REF)
1242 newstate->has_backref = 1;
1243 #ifdef RE_ENABLE_I18N
1244 else if (ctype == COMPLEX_BRACKET
1245 || (type == OP_PERIOD && MB_CUR_MAX > 1))
1246 newstate->accept_mb = 1;
1247 #endif /* RE_ENABLE_I18N */
1250 if (constraint)
1252 if (newstate->entrance_nodes == &newstate->nodes)
1254 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1255 if (BE (newstate->entrance_nodes == NULL, 0))
1256 return NULL;
1257 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1258 nctx_nodes = 0;
1259 newstate->has_constraint = 1;
1262 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1264 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1265 ++nctx_nodes;
1269 err = register_state (dfa, newstate, hash);
1270 return (err != REG_NOERROR) ? NULL : newstate;