1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 #if defined HAVE_WCHAR_H || defined _LIBC
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
33 #endif /* HAVE_WCTYPE_H || _LIBC */
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
53 # define gettext(msgid) (msgid)
57 /* This define is so xgettext can find the internationalizable
59 # define gettext_noop(String) String
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str
, int len
,
67 RE_TRANSLATE_TYPE trans
, int icase
);
69 static int re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
,
71 #endif /* RE_ENABLE_I18N */
72 static re_dfastate_t
*create_newstate_common (re_dfa_t
*dfa
,
73 const re_node_set
*nodes
,
75 static reg_errcode_t
register_state (re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
77 static re_dfastate_t
*create_ci_newstate (re_dfa_t
*dfa
,
78 const re_node_set
*nodes
,
80 static re_dfastate_t
*create_cd_newstate (re_dfa_t
*dfa
,
81 const re_node_set
*nodes
,
84 static unsigned int inline calc_state_hash (const re_node_set
*nodes
,
85 unsigned int context
);
87 /* Functions for string operation. */
89 /* This function allocate the buffers. It is necessary to call
90 re_string_reconstruct before using the object. */
93 re_string_allocate (pstr
, str
, len
, init_len
, trans
, icase
)
96 int len
, init_len
, icase
;
97 RE_TRANSLATE_TYPE trans
;
100 int init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
101 re_string_construct_common (str
, len
, pstr
, trans
, icase
);
102 pstr
->stop
= pstr
->len
;
104 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
105 if (BE (ret
!= REG_NOERROR
, 0))
108 pstr
->mbs_case
= (MBS_CASE_ALLOCATED (pstr
) ? pstr
->mbs_case
109 : (unsigned char *) str
);
110 pstr
->mbs
= MBS_ALLOCATED (pstr
) ? pstr
->mbs
: pstr
->mbs_case
;
111 pstr
->valid_len
= (MBS_CASE_ALLOCATED (pstr
) || MBS_ALLOCATED (pstr
)
112 || MB_CUR_MAX
> 1) ? pstr
->valid_len
: len
;
116 /* This function allocate the buffers, and initialize them. */
119 re_string_construct (pstr
, str
, len
, trans
, icase
)
123 RE_TRANSLATE_TYPE trans
;
126 re_string_construct_common (str
, len
, pstr
, trans
, icase
);
127 pstr
->stop
= pstr
->len
;
128 /* Set 0 so that this function can initialize whole buffers. */
133 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
134 if (BE (ret
!= REG_NOERROR
, 0))
137 pstr
->mbs_case
= (MBS_CASE_ALLOCATED (pstr
) ? pstr
->mbs_case
138 : (unsigned char *) str
);
139 pstr
->mbs
= MBS_ALLOCATED (pstr
) ? pstr
->mbs
: pstr
->mbs_case
;
143 #ifdef RE_ENABLE_I18N
145 build_wcs_upper_buffer (pstr
);
147 #endif /* RE_ENABLE_I18N */
148 build_upper_buffer (pstr
);
152 #ifdef RE_ENABLE_I18N
154 build_wcs_buffer (pstr
);
156 #endif /* RE_ENABLE_I18N */
159 re_string_translate_buffer (pstr
);
161 pstr
->valid_len
= len
;
165 /* Initialized whole buffers, then valid_len == bufs_len. */
166 pstr
->valid_len
= pstr
->bufs_len
;
170 /* Helper functions for re_string_allocate, and re_string_construct. */
173 re_string_realloc_buffers (pstr
, new_buf_len
)
177 #ifdef RE_ENABLE_I18N
180 wint_t *new_array
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
181 if (BE (new_array
== NULL
, 0))
183 pstr
->wcs
= new_array
;
185 #endif /* RE_ENABLE_I18N */
186 if (MBS_ALLOCATED (pstr
))
188 unsigned char *new_array
= re_realloc (pstr
->mbs
, unsigned char,
190 if (BE (new_array
== NULL
, 0))
192 pstr
->mbs
= new_array
;
194 if (MBS_CASE_ALLOCATED (pstr
))
196 unsigned char *new_array
= re_realloc (pstr
->mbs_case
, unsigned char,
198 if (BE (new_array
== NULL
, 0))
200 pstr
->mbs_case
= new_array
;
201 if (!MBS_ALLOCATED (pstr
))
202 pstr
->mbs
= pstr
->mbs_case
;
204 pstr
->bufs_len
= new_buf_len
;
210 re_string_construct_common (str
, len
, pstr
, trans
, icase
)
214 RE_TRANSLATE_TYPE trans
;
217 memset (pstr
, '\0', sizeof (re_string_t
));
218 pstr
->raw_mbs
= (const unsigned char *) str
;
221 pstr
->icase
= icase
? 1 : 0;
224 #ifdef RE_ENABLE_I18N
226 /* Build wide character buffer PSTR->WCS.
227 If the byte sequence of the string are:
228 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
229 Then wide character buffer will be:
230 <wc1> , WEOF , <wc2> , WEOF , <wc3>
231 We use WEOF for padding, they indicate that the position isn't
232 a first byte of a multibyte character.
234 Note that this function assumes PSTR->VALID_LEN elements are already
235 built and starts from PSTR->VALID_LEN. */
238 build_wcs_buffer (pstr
)
242 int byte_idx
, end_idx
, mbclen
, remain_len
;
243 /* Build the buffers from pstr->valid_len to either pstr->len or
245 end_idx
= (pstr
->bufs_len
> pstr
->len
)? pstr
->len
: pstr
->bufs_len
;
246 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
249 remain_len
= end_idx
- byte_idx
;
250 prev_st
= pstr
->cur_state
;
251 mbclen
= mbrtowc (&wc
, ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
252 + byte_idx
), remain_len
, &pstr
->cur_state
);
253 if (BE (mbclen
== (size_t) -2, 0))
255 /* The buffer doesn't have enough space, finish to build. */
256 pstr
->cur_state
= prev_st
;
259 else if (BE (mbclen
== (size_t) -1 || mbclen
== 0, 0))
261 /* We treat these cases as a singlebyte character. */
263 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
264 pstr
->cur_state
= prev_st
;
267 /* Apply the translateion if we need. */
268 if (pstr
->trans
!= NULL
&& mbclen
== 1)
270 int ch
= pstr
->trans
[pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]];
271 pstr
->mbs_case
[byte_idx
] = ch
;
273 /* Write wide character and padding. */
274 pstr
->wcs
[byte_idx
++] = wc
;
275 /* Write paddings. */
276 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
277 pstr
->wcs
[byte_idx
++] = WEOF
;
279 pstr
->valid_len
= byte_idx
;
282 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
283 but for REG_ICASE. */
286 build_wcs_upper_buffer (pstr
)
290 int byte_idx
, end_idx
, mbclen
, remain_len
;
291 /* Build the buffers from pstr->valid_len to either pstr->len or
293 end_idx
= (pstr
->bufs_len
> pstr
->len
)? pstr
->len
: pstr
->bufs_len
;
294 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
297 remain_len
= end_idx
- byte_idx
;
298 prev_st
= pstr
->cur_state
;
299 mbclen
= mbrtowc (&wc
, ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
300 + byte_idx
), remain_len
, &pstr
->cur_state
);
301 if (BE (mbclen
== (size_t) -2, 0))
303 /* The buffer doesn't have enough space, finish to build. */
304 pstr
->cur_state
= prev_st
;
307 else if (mbclen
== 1 || mbclen
== (size_t) -1 || mbclen
== 0)
309 /* In case of a singlebyte character. */
310 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
311 /* Apply the translateion if we need. */
312 if (pstr
->trans
!= NULL
&& mbclen
== 1)
314 ch
= pstr
->trans
[ch
];
315 pstr
->mbs_case
[byte_idx
] = ch
;
317 pstr
->wcs
[byte_idx
] = iswlower (wc
) ? toupper (wc
) : wc
;
318 pstr
->mbs
[byte_idx
++] = islower (ch
) ? toupper (ch
) : ch
;
319 if (BE (mbclen
== (size_t) -1, 0))
320 pstr
->cur_state
= prev_st
;
322 else /* mbclen > 1 */
325 wcrtomb ((char *) pstr
->mbs
+ byte_idx
, towupper (wc
), &prev_st
);
327 memcpy (pstr
->mbs
+ byte_idx
,
328 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
329 pstr
->wcs
[byte_idx
++] = iswlower (wc
) ? toupper (wc
) : wc
;
330 /* Write paddings. */
331 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
332 pstr
->wcs
[byte_idx
++] = WEOF
;
335 pstr
->valid_len
= byte_idx
;
338 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
342 re_string_skip_chars (pstr
, new_raw_idx
, last_wc
)
348 int rawbuf_idx
, mbclen
;
351 /* Skip the characters which are not necessary to check. */
352 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_len
;
353 rawbuf_idx
< new_raw_idx
;)
356 remain_len
= pstr
->len
- rawbuf_idx
;
357 prev_st
= pstr
->cur_state
;
358 mbclen
= mbrtowc (&wc
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
359 remain_len
, &pstr
->cur_state
);
360 if (BE (mbclen
== (size_t) -2 || mbclen
== (size_t) -1 || mbclen
== 0, 0))
362 /* We treat these cases as a singlebyte character. */
364 pstr
->cur_state
= prev_st
;
366 /* Then proceed the next character. */
367 rawbuf_idx
+= mbclen
;
369 *last_wc
= (wint_t) wc
;
372 #endif /* RE_ENABLE_I18N */
374 /* Build the buffer PSTR->MBS, and apply the translation if we need.
375 This function is used in case of REG_ICASE. */
378 build_upper_buffer (pstr
)
381 int char_idx
, end_idx
;
382 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
384 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
386 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
387 if (pstr
->trans
!= NULL
)
389 ch
= pstr
->trans
[ch
];
390 pstr
->mbs_case
[char_idx
] = ch
;
393 pstr
->mbs
[char_idx
] = toupper (ch
);
395 pstr
->mbs
[char_idx
] = ch
;
397 pstr
->valid_len
= char_idx
;
400 /* Apply TRANS to the buffer in PSTR. */
403 re_string_translate_buffer (pstr
)
406 int buf_idx
, end_idx
;
407 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
409 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
411 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
412 pstr
->mbs_case
[buf_idx
] = pstr
->trans
[ch
];
415 pstr
->valid_len
= buf_idx
;
418 /* This function re-construct the buffers.
419 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
420 convert to upper case in case of REG_ICASE, apply translation. */
423 re_string_reconstruct (pstr
, idx
, eflags
, newline
)
425 int idx
, eflags
, newline
;
427 int offset
= idx
- pstr
->raw_mbs_idx
;
431 #ifdef RE_ENABLE_I18N
433 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
434 #endif /* RE_ENABLE_I18N */
435 pstr
->len
+= pstr
->raw_mbs_idx
;
436 pstr
->stop
+= pstr
->raw_mbs_idx
;
437 pstr
->valid_len
= pstr
->raw_mbs_idx
= 0;
438 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
439 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
440 if (!MBS_CASE_ALLOCATED (pstr
))
441 pstr
->mbs_case
= (unsigned char *) pstr
->raw_mbs
;
442 if (!MBS_ALLOCATED (pstr
) && !MBS_CASE_ALLOCATED (pstr
))
443 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
449 /* Are the characters which are already checked remain? */
450 if (offset
< pstr
->valid_len
)
452 /* Yes, move them to the front of the buffer. */
453 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1, eflags
,
455 #ifdef RE_ENABLE_I18N
457 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
458 (pstr
->valid_len
- offset
) * sizeof (wint_t));
459 #endif /* RE_ENABLE_I18N */
460 if (MBS_ALLOCATED (pstr
))
461 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
462 pstr
->valid_len
- offset
);
463 if (MBS_CASE_ALLOCATED (pstr
))
464 memmove (pstr
->mbs_case
, pstr
->mbs_case
+ offset
,
465 pstr
->valid_len
- offset
);
466 pstr
->valid_len
-= offset
;
468 assert (pstr
->valid_len
> 0);
473 /* No, skip all characters until IDX. */
475 #ifdef RE_ENABLE_I18N
480 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
481 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
482 pstr
->wcs
[wcs_idx
] = WEOF
;
483 if (pstr
->trans
&& wc
<= 0xff)
484 wc
= pstr
->trans
[wc
];
485 pstr
->tip_context
= (IS_WIDE_WORD_CHAR (wc
) ? CONTEXT_WORD
486 : ((newline
&& IS_WIDE_NEWLINE (wc
))
487 ? CONTEXT_NEWLINE
: 0));
490 #endif /* RE_ENABLE_I18N */
492 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
495 pstr
->tip_context
= (IS_WORD_CHAR (c
) ? CONTEXT_WORD
496 : ((newline
&& IS_NEWLINE (c
))
497 ? CONTEXT_NEWLINE
: 0));
500 if (!MBS_CASE_ALLOCATED (pstr
))
502 pstr
->mbs_case
+= offset
;
503 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
504 if (!MBS_ALLOCATED (pstr
))
508 pstr
->raw_mbs_idx
= idx
;
510 pstr
->stop
-= offset
;
512 /* Then build the buffers. */
513 #ifdef RE_ENABLE_I18N
517 build_wcs_upper_buffer (pstr
);
519 build_wcs_buffer (pstr
);
522 #endif /* RE_ENABLE_I18N */
525 build_upper_buffer (pstr
);
526 else if (pstr
->trans
!= NULL
)
527 re_string_translate_buffer (pstr
);
535 re_string_destruct (pstr
)
538 #ifdef RE_ENABLE_I18N
540 #endif /* RE_ENABLE_I18N */
541 if (MBS_ALLOCATED (pstr
))
543 if (MBS_CASE_ALLOCATED (pstr
))
544 re_free (pstr
->mbs_case
);
547 /* Return the context at IDX in INPUT. */
550 re_string_context_at (input
, idx
, eflags
, newline_anchor
)
551 const re_string_t
*input
;
552 int idx
, eflags
, newline_anchor
;
555 if (idx
< 0 || idx
== input
->len
)
558 /* In this case, we use the value stored in input->tip_context,
559 since we can't know the character in input->mbs[-1] here. */
560 return input
->tip_context
;
561 else /* (idx == input->len) */
562 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
563 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
565 #ifdef RE_ENABLE_I18N
570 while(input
->wcs
[wc_idx
] == WEOF
)
573 /* It must not happen. */
574 assert (wc_idx
>= 0);
578 return input
->tip_context
;
580 wc
= input
->wcs
[wc_idx
];
581 if (IS_WIDE_WORD_CHAR (wc
))
583 return (newline_anchor
&& IS_WIDE_NEWLINE (wc
)) ? CONTEXT_NEWLINE
: 0;
588 c
= re_string_byte_at (input
, idx
);
589 if (IS_WORD_CHAR (c
))
591 return (newline_anchor
&& IS_NEWLINE (c
)) ? CONTEXT_NEWLINE
: 0;
595 /* Functions for set operation. */
598 re_node_set_alloc (set
, size
)
604 set
->elems
= re_malloc (int, size
);
605 if (BE (set
->elems
== NULL
, 0))
611 re_node_set_init_1 (set
, elem
)
617 set
->elems
= re_malloc (int, 1);
618 if (BE (set
->elems
== NULL
, 0))
620 set
->alloc
= set
->nelem
= 0;
623 set
->elems
[0] = elem
;
628 re_node_set_init_2 (set
, elem1
, elem2
)
633 set
->elems
= re_malloc (int, 2);
634 if (BE (set
->elems
== NULL
, 0))
639 set
->elems
[0] = elem1
;
646 set
->elems
[0] = elem1
;
647 set
->elems
[1] = elem2
;
651 set
->elems
[0] = elem2
;
652 set
->elems
[1] = elem1
;
659 re_node_set_init_copy (dest
, src
)
661 const re_node_set
*src
;
663 dest
->nelem
= src
->nelem
;
666 dest
->alloc
= dest
->nelem
;
667 dest
->elems
= re_malloc (int, dest
->alloc
);
668 if (BE (dest
->elems
== NULL
, 0))
670 dest
->alloc
= dest
->nelem
= 0;
673 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
676 re_node_set_init_empty (dest
);
680 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
681 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
682 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
685 re_node_set_add_intersect (dest
, src1
, src2
)
687 const re_node_set
*src1
, *src2
;
690 if (src1
->nelem
> 0 && src2
->nelem
> 0)
692 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
694 dest
->alloc
= src1
->nelem
+ src2
->nelem
+ dest
->nelem
;
695 dest
->elems
= re_realloc (dest
->elems
, int, dest
->alloc
);
696 if (BE (dest
->elems
== NULL
, 0))
703 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
705 if (src1
->elems
[i1
] > src2
->elems
[i2
])
710 if (src1
->elems
[i1
] == src2
->elems
[i2
])
712 while (id
< dest
->nelem
&& dest
->elems
[id
] < src2
->elems
[i2
])
714 if (id
< dest
->nelem
&& dest
->elems
[id
] == src2
->elems
[i2
])
718 memmove (dest
->elems
+ id
+ 1, dest
->elems
+ id
,
719 sizeof (int) * (dest
->nelem
- id
));
720 dest
->elems
[id
++] = src2
->elems
[i2
++];
729 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
730 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
733 re_node_set_init_union (dest
, src1
, src2
)
735 const re_node_set
*src1
, *src2
;
738 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
740 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
741 dest
->elems
= re_malloc (int, dest
->alloc
);
742 if (BE (dest
->elems
== NULL
, 0))
747 if (src1
!= NULL
&& src1
->nelem
> 0)
748 return re_node_set_init_copy (dest
, src1
);
749 else if (src2
!= NULL
&& src2
->nelem
> 0)
750 return re_node_set_init_copy (dest
, src2
);
752 re_node_set_init_empty (dest
);
755 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
757 if (src1
->elems
[i1
] > src2
->elems
[i2
])
759 dest
->elems
[id
++] = src2
->elems
[i2
++];
762 if (src1
->elems
[i1
] == src2
->elems
[i2
])
764 dest
->elems
[id
++] = src1
->elems
[i1
++];
766 if (i1
< src1
->nelem
)
768 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
769 (src1
->nelem
- i1
) * sizeof (int));
770 id
+= src1
->nelem
- i1
;
772 else if (i2
< src2
->nelem
)
774 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
775 (src2
->nelem
- i2
) * sizeof (int));
776 id
+= src2
->nelem
- i2
;
782 /* Calculate the union set of the sets DEST and SRC. And store it to
783 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
786 re_node_set_merge (dest
, src
)
788 const re_node_set
*src
;
791 if (src
== NULL
|| src
->nelem
== 0)
793 if (dest
->alloc
< src
->nelem
+ dest
->nelem
)
796 dest
->alloc
= 2 * (src
->nelem
+ dest
->alloc
);
797 new_buffer
= re_realloc (dest
->elems
, int, dest
->alloc
);
798 if (BE (new_buffer
== NULL
, 0))
800 dest
->elems
= new_buffer
;
803 for (si
= 0, di
= 0 ; si
< src
->nelem
&& di
< dest
->nelem
;)
805 int cp_from
, ncp
, mid
, right
, src_elem
= src
->elems
[si
];
806 /* Binary search the spot we will add the new element. */
810 mid
= (di
+ right
) / 2;
811 if (dest
->elems
[mid
] < src_elem
)
816 if (di
>= dest
->nelem
)
819 if (dest
->elems
[di
] == src_elem
)
821 /* Skip since, DEST already has the element. */
827 /* Skip the src elements which are less than dest->elems[di]. */
829 while (si
< src
->nelem
&& src
->elems
[si
] < dest
->elems
[di
])
831 /* Copy these src elements. */
833 memmove (dest
->elems
+ di
+ ncp
, dest
->elems
+ di
,
834 sizeof (int) * (dest
->nelem
- di
));
835 memcpy (dest
->elems
+ di
, src
->elems
+ cp_from
,
837 /* Update counters. */
842 /* Copy remaining src elements. */
845 memcpy (dest
->elems
+ di
, src
->elems
+ si
,
846 sizeof (int) * (src
->nelem
- si
));
847 dest
->nelem
+= src
->nelem
- si
;
852 /* Insert the new element ELEM to the re_node_set* SET.
853 return 0 if SET already has ELEM,
854 return -1 if an error is occured, return 1 otherwise. */
857 re_node_set_insert (set
, elem
)
862 /* In case of the set is empty. */
863 if (set
->elems
== NULL
|| set
->alloc
== 0)
865 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
871 /* Binary search the spot we will add the new element. */
876 mid
= (idx
+ right
) / 2;
877 if (set
->elems
[mid
] < elem
)
883 /* Realloc if we need. */
884 if (set
->alloc
< set
->nelem
+ 1)
887 set
->alloc
= set
->alloc
* 2;
888 new_array
= re_malloc (int, set
->alloc
);
889 if (BE (new_array
== NULL
, 0))
891 /* Copy the elements they are followed by the new element. */
893 memcpy (new_array
, set
->elems
, sizeof (int) * (idx
));
894 /* Copy the elements which follows the new element. */
895 if (set
->nelem
- idx
> 0)
896 memcpy (new_array
+ idx
+ 1, set
->elems
+ idx
,
897 sizeof (int) * (set
->nelem
- idx
));
898 re_free (set
->elems
);
899 set
->elems
= new_array
;
903 /* Move the elements which follows the new element. */
904 if (set
->nelem
- idx
> 0)
905 memmove (set
->elems
+ idx
+ 1, set
->elems
+ idx
,
906 sizeof (int) * (set
->nelem
- idx
));
908 /* Insert the new element. */
909 set
->elems
[idx
] = elem
;
914 /* Compare two node sets SET1 and SET2.
915 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
918 re_node_set_compare (set1
, set2
)
919 const re_node_set
*set1
, *set2
;
922 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
924 for (i
= 0 ; i
< set1
->nelem
; i
++)
925 if (set1
->elems
[i
] != set2
->elems
[i
])
930 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
933 re_node_set_contains (set
, elem
)
934 const re_node_set
*set
;
941 /* Binary search the element. */
943 right
= set
->nelem
- 1;
946 mid
= (idx
+ right
) / 2;
947 if (set
->elems
[mid
] < elem
)
952 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
956 re_node_set_remove_at (set
, idx
)
960 if (idx
< 0 || idx
>= set
->nelem
)
962 if (idx
< set
->nelem
- 1)
963 memmove (set
->elems
+ idx
, set
->elems
+ idx
+ 1,
964 sizeof (int) * (set
->nelem
- idx
- 1));
969 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
970 Or return -1, if an error will be occured. */
973 re_dfa_add_node (dfa
, token
, mode
)
978 if (dfa
->nodes_len
>= dfa
->nodes_alloc
)
980 re_token_t
*new_array
;
981 dfa
->nodes_alloc
*= 2;
982 new_array
= re_realloc (dfa
->nodes
, re_token_t
, dfa
->nodes_alloc
);
983 if (BE (new_array
== NULL
, 0))
986 dfa
->nodes
= new_array
;
990 re_node_set
*new_edests
, *new_eclosures
, *new_inveclosures
;
992 new_nexts
= re_realloc (dfa
->nexts
, int, dfa
->nodes_alloc
);
993 new_edests
= re_realloc (dfa
->edests
, re_node_set
, dfa
->nodes_alloc
);
994 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
,
996 new_inveclosures
= re_realloc (dfa
->inveclosures
, re_node_set
,
998 if (BE (new_nexts
== NULL
|| new_edests
== NULL
999 || new_eclosures
== NULL
|| new_inveclosures
== NULL
, 0))
1001 dfa
->nexts
= new_nexts
;
1002 dfa
->edests
= new_edests
;
1003 dfa
->eclosures
= new_eclosures
;
1004 dfa
->inveclosures
= new_inveclosures
;
1007 dfa
->nodes
[dfa
->nodes_len
] = token
;
1008 dfa
->nodes
[dfa
->nodes_len
].duplicated
= 0;
1009 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1010 return dfa
->nodes_len
++;
1013 static unsigned int inline
1014 calc_state_hash (nodes
, context
)
1015 const re_node_set
*nodes
;
1016 unsigned int context
;
1018 unsigned int hash
= nodes
->nelem
+ context
;
1020 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1021 hash
+= nodes
->elems
[i
];
1025 /* Search for the state whose node_set is equivalent to NODES.
1026 Return the pointer to the state, if we found it in the DFA.
1027 Otherwise create the new one and return it. In case of an error
1028 return NULL and set the error code in ERR.
1029 Note: - We assume NULL as the invalid state, then it is possible that
1030 return value is NULL and ERR is REG_NOERROR.
1031 - We never return non-NULL value in case of any errors, it is for
1034 static re_dfastate_t
*
1035 re_acquire_state (err
, dfa
, nodes
)
1038 const re_node_set
*nodes
;
1041 re_dfastate_t
*new_state
;
1042 struct re_state_table_entry
*spot
;
1044 if (BE (nodes
->nelem
== 0, 0))
1049 hash
= calc_state_hash (nodes
, 0);
1050 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1052 for (i
= 0 ; i
< spot
->num
; i
++)
1054 re_dfastate_t
*state
= spot
->array
[i
];
1055 if (hash
!= state
->hash
)
1057 if (re_node_set_compare (&state
->nodes
, nodes
))
1061 /* There are no appropriate state in the dfa, create the new one. */
1062 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1063 if (BE (new_state
!= NULL
, 1))
1072 /* Search for the state whose node_set is equivalent to NODES and
1073 whose context is equivalent to CONTEXT.
1074 Return the pointer to the state, if we found it in the DFA.
1075 Otherwise create the new one and return it. In case of an error
1076 return NULL and set the error code in ERR.
1077 Note: - We assume NULL as the invalid state, then it is possible that
1078 return value is NULL and ERR is REG_NOERROR.
1079 - We never return non-NULL value in case of any errors, it is for
1082 static re_dfastate_t
*
1083 re_acquire_state_context (err
, dfa
, nodes
, context
)
1086 const re_node_set
*nodes
;
1087 unsigned int context
;
1090 re_dfastate_t
*new_state
;
1091 struct re_state_table_entry
*spot
;
1093 if (nodes
->nelem
== 0)
1098 hash
= calc_state_hash (nodes
, context
);
1099 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1101 for (i
= 0 ; i
< spot
->num
; i
++)
1103 re_dfastate_t
*state
= spot
->array
[i
];
1104 if (hash
!= state
->hash
)
1106 if (re_node_set_compare (state
->entrance_nodes
, nodes
)
1107 && state
->context
== context
)
1110 /* There are no appropriate state in `dfa', create the new one. */
1111 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1112 if (BE (new_state
!= NULL
, 1))
1121 /* Allocate memory for DFA state and initialize common properties.
1122 Return the new state if succeeded, otherwise return NULL. */
1124 static re_dfastate_t
*
1125 create_newstate_common (dfa
, nodes
, hash
)
1127 const re_node_set
*nodes
;
1130 re_dfastate_t
*newstate
;
1132 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1133 if (BE (newstate
== NULL
, 0))
1135 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1136 if (BE (err
!= REG_NOERROR
, 0))
1141 newstate
->trtable
= NULL
;
1142 newstate
->trtable_search
= NULL
;
1143 newstate
->hash
= hash
;
1147 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1148 position. Return value indicate the error code if failed. */
1150 static reg_errcode_t
1151 register_state (dfa
, newstate
, hash
)
1153 re_dfastate_t
*newstate
;
1156 struct re_state_table_entry
*spot
;
1157 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1159 if (spot
->alloc
<= spot
->num
)
1161 re_dfastate_t
**new_array
;
1162 spot
->alloc
= 2 * spot
->num
+ 2;
1163 new_array
= re_realloc (spot
->array
, re_dfastate_t
*, spot
->alloc
);
1164 if (BE (new_array
== NULL
, 0))
1166 spot
->array
= new_array
;
1168 spot
->array
[spot
->num
++] = newstate
;
1172 /* Create the new state which is independ of contexts.
1173 Return the new state if succeeded, otherwise return NULL. */
1175 static re_dfastate_t
*
1176 create_ci_newstate (dfa
, nodes
, hash
)
1178 const re_node_set
*nodes
;
1183 re_dfastate_t
*newstate
;
1184 newstate
= create_newstate_common (dfa
, nodes
, hash
);
1185 if (BE (newstate
== NULL
, 0))
1187 newstate
->entrance_nodes
= &newstate
->nodes
;
1189 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1191 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1192 re_token_type_t type
= node
->type
;
1193 if (type
== CHARACTER
&& !node
->constraint
)
1196 /* If the state has the halt node, the state is a halt state. */
1197 else if (type
== END_OF_RE
)
1199 #ifdef RE_ENABLE_I18N
1200 else if (type
== COMPLEX_BRACKET
1201 || (type
== OP_PERIOD
&& MB_CUR_MAX
> 1))
1202 newstate
->accept_mb
= 1;
1203 #endif /* RE_ENABLE_I18N */
1204 else if (type
== OP_BACK_REF
)
1205 newstate
->has_backref
= 1;
1206 else if (type
== ANCHOR
|| node
->constraint
)
1207 newstate
->has_constraint
= 1;
1209 err
= register_state (dfa
, newstate
, hash
);
1210 if (BE (err
!= REG_NOERROR
, 0))
1212 free_state (newstate
);
1218 /* Create the new state which is depend on the context CONTEXT.
1219 Return the new state if succeeded, otherwise return NULL. */
1221 static re_dfastate_t
*
1222 create_cd_newstate (dfa
, nodes
, context
, hash
)
1224 const re_node_set
*nodes
;
1225 unsigned int context
, hash
;
1227 int i
, nctx_nodes
= 0;
1229 re_dfastate_t
*newstate
;
1231 newstate
= create_newstate_common (dfa
, nodes
, hash
);
1232 if (BE (newstate
== NULL
, 0))
1234 newstate
->context
= context
;
1235 newstate
->entrance_nodes
= &newstate
->nodes
;
1237 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1239 unsigned int constraint
= 0;
1240 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1241 re_token_type_t type
= node
->type
;
1242 if (node
->constraint
)
1243 constraint
= node
->constraint
;
1245 if (type
== CHARACTER
&& !constraint
)
1247 /* If the state has the halt node, the state is a halt state. */
1248 else if (type
== END_OF_RE
)
1250 #ifdef RE_ENABLE_I18N
1251 else if (type
== COMPLEX_BRACKET
1252 || (type
== OP_PERIOD
&& MB_CUR_MAX
> 1))
1253 newstate
->accept_mb
= 1;
1254 #endif /* RE_ENABLE_I18N */
1255 else if (type
== OP_BACK_REF
)
1256 newstate
->has_backref
= 1;
1257 else if (type
== ANCHOR
)
1258 constraint
= node
->opr
.ctx_type
;
1262 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1264 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1265 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1267 free_state (newstate
);
1270 re_node_set_init_copy (newstate
->entrance_nodes
, nodes
);
1272 newstate
->has_constraint
= 1;
1275 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1277 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1282 err
= register_state (dfa
, newstate
, hash
);
1283 if (BE (err
!= REG_NOERROR
, 0))
1285 free_state (newstate
);
1293 re_dfastate_t
*state
;
1295 if (state
->entrance_nodes
!= &state
->nodes
)
1297 re_node_set_free (state
->entrance_nodes
);
1298 re_free (state
->entrance_nodes
);
1300 re_node_set_free (&state
->nodes
);
1301 re_free (state
->trtable
);
1302 re_free (state
->trtable_search
);