1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 #if defined HAVE_WCHAR_H || defined _LIBC
30 #endif /* HAVE_WCHAR_H || _LIBC */
31 #if defined HAVE_WCTYPE_H || defined _LIBC
33 #endif /* HAVE_WCTYPE_H || _LIBC */
36 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
37 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
38 # include <locale/localeinfo.h>
39 # include <locale/elem-hash.h>
40 # include <locale/coll-lookup.h>
44 /* This is for other GNU distributions with internationalized messages. */
45 #if HAVE_LIBINTL_H || defined _LIBC
49 # define gettext(msgid) \
50 INTUSE(__dcgettext) (_libc_intl_domainname_internal, msgid, LC_MESSAGES)
53 # define gettext(msgid) (msgid)
57 /* This define is so xgettext can find the internationalizable
59 # define gettext_noop(String) String
63 #include "regex_internal.h"
65 static void re_string_construct_common (const char *str
, int len
,
67 RE_TRANSLATE_TYPE trans
, int icase
);
69 static int re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
);
70 #endif /* RE_ENABLE_I18N */
71 static re_dfastate_t
*create_newstate_common (re_dfa_t
*dfa
,
72 const re_node_set
*nodes
,
74 static reg_errcode_t
register_state (re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
76 static re_dfastate_t
*create_ci_newstate (re_dfa_t
*dfa
,
77 const re_node_set
*nodes
,
79 static re_dfastate_t
*create_cd_newstate (re_dfa_t
*dfa
,
80 const re_node_set
*nodes
,
83 static unsigned int inline calc_state_hash (const re_node_set
*nodes
,
84 unsigned int context
);
86 /* Functions for string operation. */
88 /* This function allocate the buffers. It is necessary to call
89 re_string_reconstruct before using the object. */
92 re_string_allocate (pstr
, str
, len
, init_len
, trans
, icase
)
95 int len
, init_len
, icase
;
96 RE_TRANSLATE_TYPE trans
;
99 int init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
100 re_string_construct_common (str
, len
, pstr
, trans
, icase
);
101 pstr
->stop
= pstr
->len
;
103 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
104 if (BE (ret
!= REG_NOERROR
, 0))
107 pstr
->mbs_case
= (MBS_CASE_ALLOCATED (pstr
) ? pstr
->mbs_case
108 : (unsigned char *) str
);
109 pstr
->mbs
= MBS_ALLOCATED (pstr
) ? pstr
->mbs
: pstr
->mbs_case
;
110 pstr
->valid_len
= (MBS_CASE_ALLOCATED (pstr
) || MBS_ALLOCATED (pstr
)
111 || MB_CUR_MAX
> 1) ? pstr
->valid_len
: len
;
115 /* This function allocate the buffers, and initialize them. */
118 re_string_construct (pstr
, str
, len
, trans
, icase
)
122 RE_TRANSLATE_TYPE trans
;
125 re_string_construct_common (str
, len
, pstr
, trans
, icase
);
126 pstr
->stop
= pstr
->len
;
127 /* Set 0 so that this function can initialize whole buffers. */
132 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
133 if (BE (ret
!= REG_NOERROR
, 0))
136 pstr
->mbs_case
= (MBS_CASE_ALLOCATED (pstr
) ? pstr
->mbs_case
137 : (unsigned char *) str
);
138 pstr
->mbs
= MBS_ALLOCATED (pstr
) ? pstr
->mbs
: pstr
->mbs_case
;
142 #ifdef RE_ENABLE_I18N
144 build_wcs_upper_buffer (pstr
);
146 #endif /* RE_ENABLE_I18N */
147 build_upper_buffer (pstr
);
151 #ifdef RE_ENABLE_I18N
153 build_wcs_buffer (pstr
);
155 #endif /* RE_ENABLE_I18N */
158 re_string_translate_buffer (pstr
);
160 pstr
->valid_len
= len
;
164 /* Initialized whole buffers, then valid_len == bufs_len. */
165 pstr
->valid_len
= pstr
->bufs_len
;
169 /* Helper functions for re_string_allocate, and re_string_construct. */
172 re_string_realloc_buffers (pstr
, new_buf_len
)
176 #ifdef RE_ENABLE_I18N
179 pstr
->wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
180 if (BE (pstr
->wcs
== NULL
, 0))
183 #endif /* RE_ENABLE_I18N */
184 if (MBS_ALLOCATED (pstr
))
186 pstr
->mbs
= re_realloc (pstr
->mbs
, unsigned char, new_buf_len
);
187 if (BE (pstr
->mbs
== NULL
, 0))
190 if (MBS_CASE_ALLOCATED (pstr
))
192 pstr
->mbs_case
= re_realloc (pstr
->mbs_case
, unsigned char, new_buf_len
);
193 if (BE (pstr
->mbs_case
== NULL
, 0))
195 if (!MBS_ALLOCATED (pstr
))
196 pstr
->mbs
= pstr
->mbs_case
;
198 pstr
->bufs_len
= new_buf_len
;
204 re_string_construct_common (str
, len
, pstr
, trans
, icase
)
208 RE_TRANSLATE_TYPE trans
;
211 memset (pstr
, '\0', sizeof (re_string_t
));
212 pstr
->raw_mbs
= (const unsigned char *) str
;
215 pstr
->icase
= icase
? 1 : 0;
218 #ifdef RE_ENABLE_I18N
220 /* Build wide character buffer PSTR->WCS.
221 If the byte sequence of the string are:
222 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
223 Then wide character buffer will be:
224 <wc1> , WEOF , <wc2> , WEOF , <wc3>
225 We use WEOF for padding, they indicate that the position isn't
226 a first byte of a multibyte character.
228 Note that this function assumes PSTR->VALID_LEN elements are already
229 built and starts from PSTR->VALID_LEN. */
232 build_wcs_buffer (pstr
)
236 int byte_idx
, end_idx
, mbclen
, remain_len
;
237 /* Build the buffers from pstr->valid_len to either pstr->len or
239 end_idx
= (pstr
->bufs_len
> pstr
->len
)? pstr
->len
: pstr
->bufs_len
;
240 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
243 remain_len
= end_idx
- byte_idx
;
244 prev_st
= pstr
->cur_state
;
245 mbclen
= mbrtowc (&wc
, ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
246 + byte_idx
), remain_len
, &pstr
->cur_state
);
247 if (BE (mbclen
== (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr
->cur_state
= prev_st
;
253 else if (BE (mbclen
== (size_t) -1 || mbclen
== 0, 0))
255 /* We treat these cases as a singlebyte character. */
257 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
258 pstr
->cur_state
= prev_st
;
261 /* Apply the translateion if we need. */
262 if (pstr
->trans
!= NULL
&& mbclen
== 1)
264 int ch
= pstr
->trans
[pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]];
265 pstr
->mbs_case
[byte_idx
] = ch
;
267 /* Write wide character and padding. */
268 pstr
->wcs
[byte_idx
++] = wc
;
269 /* Write paddings. */
270 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
271 pstr
->wcs
[byte_idx
++] = WEOF
;
273 pstr
->valid_len
= byte_idx
;
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
280 build_wcs_upper_buffer (pstr
)
284 int byte_idx
, end_idx
, mbclen
, remain_len
;
285 /* Build the buffers from pstr->valid_len to either pstr->len or
287 end_idx
= (pstr
->bufs_len
> pstr
->len
)? pstr
->len
: pstr
->bufs_len
;
288 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
291 remain_len
= end_idx
- byte_idx
;
292 prev_st
= pstr
->cur_state
;
293 mbclen
= mbrtowc (&wc
, ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
294 + byte_idx
), remain_len
, &pstr
->cur_state
);
295 if (BE (mbclen
== (size_t) -2, 0))
297 /* The buffer doesn't have enough space, finish to build. */
298 pstr
->cur_state
= prev_st
;
301 else if (mbclen
== 1 || mbclen
== (size_t) -1 || mbclen
== 0)
303 /* In case of a singlebyte character. */
304 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
305 /* Apply the translateion if we need. */
306 if (pstr
->trans
!= NULL
&& mbclen
== 1)
308 ch
= pstr
->trans
[ch
];
309 pstr
->mbs_case
[byte_idx
] = ch
;
311 pstr
->wcs
[byte_idx
] = iswlower (wc
) ? toupper (wc
) : wc
;
312 pstr
->mbs
[byte_idx
++] = islower (ch
) ? toupper (ch
) : ch
;
313 if (BE (mbclen
== (size_t) -1, 0))
314 pstr
->cur_state
= prev_st
;
316 else /* mbclen > 1 */
319 wcrtomb ((char *) pstr
->mbs
+ byte_idx
, towupper (wc
), &prev_st
);
321 memcpy (pstr
->mbs
+ byte_idx
,
322 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
323 pstr
->wcs
[byte_idx
++] = iswlower (wc
) ? toupper (wc
) : wc
;
324 /* Write paddings. */
325 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
326 pstr
->wcs
[byte_idx
++] = WEOF
;
329 pstr
->valid_len
= byte_idx
;
332 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
336 re_string_skip_chars (pstr
, new_raw_idx
)
341 int rawbuf_idx
, mbclen
;
343 /* Skip the characters which are not necessary to check. */
344 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_len
;
345 rawbuf_idx
< new_raw_idx
;)
347 int remain_len
= pstr
->len
- rawbuf_idx
;
348 prev_st
= pstr
->cur_state
;
349 mbclen
= mbrlen ((const char *) pstr
->raw_mbs
+ rawbuf_idx
, remain_len
,
351 if (BE (mbclen
== (size_t) -2 || mbclen
== (size_t) -1 || mbclen
== 0, 0))
353 /* We treat these cases as a singlebyte character. */
355 pstr
->cur_state
= prev_st
;
357 /* Then proceed the next character. */
358 rawbuf_idx
+= mbclen
;
362 #endif /* RE_ENABLE_I18N */
364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
365 This function is used in case of REG_ICASE. */
368 build_upper_buffer (pstr
)
371 int char_idx
, end_idx
;
372 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
374 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
376 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
377 if (pstr
->trans
!= NULL
)
379 ch
= pstr
->trans
[ch
];
380 pstr
->mbs_case
[char_idx
] = ch
;
383 pstr
->mbs
[char_idx
] = toupper (ch
);
385 pstr
->mbs
[char_idx
] = ch
;
387 pstr
->valid_len
= char_idx
;
390 /* Apply TRANS to the buffer in PSTR. */
393 re_string_translate_buffer (pstr
)
396 int buf_idx
, end_idx
;
397 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
399 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
401 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
402 pstr
->mbs_case
[buf_idx
] = pstr
->trans
[ch
];
405 pstr
->valid_len
= buf_idx
;
408 /* This function re-construct the buffers.
409 Concretely, convert to wide character in case of MB_CUR_MAX > 1,
410 convert to upper case in case of REG_ICASE, apply translation. */
413 re_string_reconstruct (pstr
, idx
, eflags
, newline
)
415 int idx
, eflags
, newline
;
417 int offset
= idx
- pstr
->raw_mbs_idx
;
421 #ifdef RE_ENABLE_I18N
423 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
424 #endif /* RE_ENABLE_I18N */
425 pstr
->valid_len
= pstr
->raw_mbs_idx
= 0;
426 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
427 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
428 if (!MBS_CASE_ALLOCATED (pstr
))
429 pstr
->mbs_case
= (unsigned char *) pstr
->raw_mbs
;
430 if (!MBS_ALLOCATED (pstr
) && !MBS_CASE_ALLOCATED (pstr
))
431 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
437 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1, eflags
,
439 /* Are the characters which are already checked remain? */
440 if (offset
< pstr
->valid_len
)
442 /* Yes, move them to the front of the buffer. */
443 #ifdef RE_ENABLE_I18N
445 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
446 (pstr
->valid_len
- offset
) * sizeof (wint_t));
447 #endif /* RE_ENABLE_I18N */
448 if (MBS_ALLOCATED (pstr
))
449 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
450 pstr
->valid_len
- offset
);
451 if (MBS_CASE_ALLOCATED (pstr
))
452 memmove (pstr
->mbs_case
, pstr
->mbs_case
+ offset
,
453 pstr
->valid_len
- offset
);
454 pstr
->valid_len
-= offset
;
456 assert (pstr
->valid_len
> 0);
461 /* No, skip all characters until IDX. */
463 #ifdef RE_ENABLE_I18N
467 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
) - idx
;
468 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
469 pstr
->wcs
[wcs_idx
] = WEOF
;
471 #endif /* RE_ENABLE_I18N */
473 if (!MBS_CASE_ALLOCATED (pstr
))
475 pstr
->mbs_case
+= offset
;
476 /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
477 if (!MBS_ALLOCATED (pstr
))
481 pstr
->raw_mbs_idx
= idx
;
483 pstr
->stop
-= offset
;
485 /* Then build the buffers. */
486 #ifdef RE_ENABLE_I18N
490 build_wcs_upper_buffer (pstr
);
492 build_wcs_buffer (pstr
);
495 #endif /* RE_ENABLE_I18N */
498 build_upper_buffer (pstr
);
499 else if (pstr
->trans
!= NULL
)
500 re_string_translate_buffer (pstr
);
508 re_string_destruct (pstr
)
511 #ifdef RE_ENABLE_I18N
513 #endif /* RE_ENABLE_I18N */
514 if (MBS_ALLOCATED (pstr
))
516 if (MBS_CASE_ALLOCATED (pstr
))
517 re_free (pstr
->mbs_case
);
520 /* Return the context at IDX in INPUT. */
523 re_string_context_at (input
, idx
, eflags
, newline_anchor
)
524 const re_string_t
*input
;
525 int idx
, eflags
, newline_anchor
;
528 if (idx
< 0 || idx
== input
->len
)
531 /* In this case, we use the value stored in input->tip_context,
532 since we can't know the character in input->mbs[-1] here. */
533 return input
->tip_context
;
534 else /* (idx == input->len) */
535 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
536 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
538 c
= re_string_byte_at (input
, idx
);
539 if (IS_WORD_CHAR (c
))
541 return (newline_anchor
&& IS_NEWLINE (c
)) ? CONTEXT_NEWLINE
: 0;
544 /* Functions for set operation. */
547 re_node_set_alloc (set
, size
)
553 set
->elems
= re_malloc (int, size
);
554 if (BE (set
->elems
== NULL
, 0))
560 re_node_set_init_1 (set
, elem
)
566 set
->elems
= re_malloc (int, 1);
567 if (BE (set
->elems
== NULL
, 0))
569 set
->elems
[0] = elem
;
574 re_node_set_init_2 (set
, elem1
, elem2
)
579 set
->elems
= re_malloc (int, 2);
580 if (BE (set
->elems
== NULL
, 0))
585 set
->elems
[0] = elem1
;
592 set
->elems
[0] = elem1
;
593 set
->elems
[1] = elem2
;
597 set
->elems
[0] = elem2
;
598 set
->elems
[1] = elem1
;
605 re_node_set_init_copy (dest
, src
)
607 const re_node_set
*src
;
609 dest
->nelem
= src
->nelem
;
612 dest
->alloc
= dest
->nelem
;
613 dest
->elems
= re_malloc (int, dest
->alloc
);
614 if (BE (dest
->elems
== NULL
, 0))
616 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
619 re_node_set_init_empty (dest
);
623 /* Calculate the intersection of the sets SRC1 and SRC2. And store it in
624 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
625 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
628 re_node_set_intersect (dest
, src1
, src2
)
630 const re_node_set
*src1
, *src2
;
633 if (src1
->nelem
> 0 && src2
->nelem
> 0)
635 if (src1
->nelem
+ src2
->nelem
> dest
->alloc
)
637 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
638 dest
->elems
= re_realloc (dest
->elems
, int, dest
->alloc
);
639 if (BE (dest
->elems
== NULL
, 0))
645 /* The intersection of empty sets is also empty set. */
650 for (i1
= i2
= id
= 0; i1
< src1
->nelem
&& i2
< src2
->nelem
; )
652 if (src1
->elems
[i1
] > src2
->elems
[i2
])
657 /* The intersection must have the elements which are in both of
659 if (src1
->elems
[i1
] == src2
->elems
[i2
])
660 dest
->elems
[id
++] = src2
->elems
[i2
++];
667 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
668 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
669 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
672 re_node_set_add_intersect (dest
, src1
, src2
)
674 const re_node_set
*src1
, *src2
;
677 if (src1
->nelem
> 0 && src2
->nelem
> 0)
679 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
681 dest
->alloc
= src1
->nelem
+ src2
->nelem
+ dest
->nelem
;
682 dest
->elems
= re_realloc (dest
->elems
, int, dest
->alloc
);
683 if (BE (dest
->elems
== NULL
, 0))
690 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
692 if (src1
->elems
[i1
] > src2
->elems
[i2
])
697 if (src1
->elems
[i1
] == src2
->elems
[i2
])
699 while (id
< dest
->nelem
&& dest
->elems
[id
] < src2
->elems
[i2
])
701 if (id
< dest
->nelem
&& dest
->elems
[id
] == src2
->elems
[i2
])
705 memmove (dest
->elems
+ id
+ 1, dest
->elems
+ id
,
706 sizeof (int) * (dest
->nelem
- id
));
707 dest
->elems
[id
++] = src2
->elems
[i2
++];
716 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
717 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
720 re_node_set_init_union (dest
, src1
, src2
)
722 const re_node_set
*src1
, *src2
;
725 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
727 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
728 dest
->elems
= re_malloc (int, dest
->alloc
);
729 if (BE (dest
->elems
== NULL
, 0))
734 if (src1
!= NULL
&& src1
->nelem
> 0)
735 return re_node_set_init_copy (dest
, src1
);
736 else if (src2
!= NULL
&& src2
->nelem
> 0)
737 return re_node_set_init_copy (dest
, src2
);
739 re_node_set_init_empty (dest
);
742 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
744 if (src1
->elems
[i1
] > src2
->elems
[i2
])
746 dest
->elems
[id
++] = src2
->elems
[i2
++];
749 if (src1
->elems
[i1
] == src2
->elems
[i2
])
751 dest
->elems
[id
++] = src1
->elems
[i1
++];
753 if (i1
< src1
->nelem
)
755 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
756 (src1
->nelem
- i1
) * sizeof (int));
757 id
+= src1
->nelem
- i1
;
759 else if (i2
< src2
->nelem
)
761 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
762 (src2
->nelem
- i2
) * sizeof (int));
763 id
+= src2
->nelem
- i2
;
769 /* Calculate the union set of the sets DEST and SRC. And store it to
770 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
773 re_node_set_merge (dest
, src
)
775 const re_node_set
*src
;
778 if (src
== NULL
|| src
->nelem
== 0)
780 if (dest
->alloc
< src
->nelem
+ dest
->nelem
)
782 dest
->alloc
= 2 * (src
->nelem
+ dest
->alloc
);
783 dest
->elems
= re_realloc (dest
->elems
, int, dest
->alloc
);
784 if (BE (dest
->elems
== NULL
, 0))
788 for (si
= 0, di
= 0 ; si
< src
->nelem
&& di
< dest
->nelem
;)
790 int cp_from
, ncp
, mid
, right
, src_elem
= src
->elems
[si
];
791 /* Binary search the spot we will add the new element. */
795 mid
= (di
+ right
) / 2;
796 if (dest
->elems
[mid
] < src_elem
)
801 if (di
>= dest
->nelem
)
804 if (dest
->elems
[di
] == src_elem
)
806 /* Skip since, DEST already has the element. */
812 /* Skip the src elements which are less than dest->elems[di]. */
814 while (si
< src
->nelem
&& src
->elems
[si
] < dest
->elems
[di
])
816 /* Copy these src elements. */
818 memmove (dest
->elems
+ di
+ ncp
, dest
->elems
+ di
,
819 sizeof (int) * (dest
->nelem
- di
));
820 memcpy (dest
->elems
+ di
, src
->elems
+ cp_from
,
822 /* Update counters. */
827 /* Copy remaining src elements. */
830 memcpy (dest
->elems
+ di
, src
->elems
+ si
,
831 sizeof (int) * (src
->nelem
- si
));
832 dest
->nelem
+= src
->nelem
- si
;
837 /* Insert the new element ELEM to the re_node_set* SET.
838 return 0 if SET already has ELEM,
839 return -1 if an error is occured, return 1 otherwise. */
842 re_node_set_insert (set
, elem
)
847 /* In case of the set is empty. */
848 if (set
->elems
== NULL
|| set
->alloc
== 0)
850 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
856 /* Binary search the spot we will add the new element. */
861 mid
= (idx
+ right
) / 2;
862 if (set
->elems
[mid
] < elem
)
868 /* Realloc if we need. */
869 if (set
->alloc
< set
->nelem
+ 1)
872 set
->alloc
= set
->alloc
* 2;
873 new_array
= re_malloc (int, set
->alloc
);
874 if (BE (new_array
== NULL
, 0))
876 /* Copy the elements they are followed by the new element. */
878 memcpy (new_array
, set
->elems
, sizeof (int) * (idx
));
879 /* Copy the elements which follows the new element. */
880 if (set
->nelem
- idx
> 0)
881 memcpy (new_array
+ idx
+ 1, set
->elems
+ idx
,
882 sizeof (int) * (set
->nelem
- idx
));
883 re_free (set
->elems
);
884 set
->elems
= new_array
;
888 /* Move the elements which follows the new element. */
889 if (set
->nelem
- idx
> 0)
890 memmove (set
->elems
+ idx
+ 1, set
->elems
+ idx
,
891 sizeof (int) * (set
->nelem
- idx
));
893 /* Insert the new element. */
894 set
->elems
[idx
] = elem
;
899 /* Compare two node sets SET1 and SET2.
900 return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
903 re_node_set_compare (set1
, set2
)
904 const re_node_set
*set1
, *set2
;
907 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
909 for (i
= 0 ; i
< set1
->nelem
; i
++)
910 if (set1
->elems
[i
] != set2
->elems
[i
])
915 /* Return 1 if SET contains the element ELEM, return 0 otherwise. */
918 re_node_set_contains (set
, elem
)
919 const re_node_set
*set
;
926 /* Binary search the element. */
928 right
= set
->nelem
- 1;
931 mid
= (idx
+ right
) / 2;
932 if (set
->elems
[mid
] < elem
)
937 return set
->elems
[idx
] == elem
;
941 re_node_set_remove_at (set
, idx
)
945 if (idx
< 0 || idx
>= set
->nelem
)
947 if (idx
< set
->nelem
- 1)
948 memmove (set
->elems
+ idx
, set
->elems
+ idx
+ 1,
949 sizeof (int) * (set
->nelem
- idx
- 1));
954 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
955 Or return -1, if an error will be occured. */
958 re_dfa_add_node (dfa
, token
, mode
)
963 if (dfa
->nodes_len
>= dfa
->nodes_alloc
)
965 re_token_t
*new_array
;
966 dfa
->nodes_alloc
*= 2;
967 new_array
= re_realloc (dfa
->nodes
, re_token_t
, dfa
->nodes_alloc
);
968 if (BE (new_array
== NULL
, 0))
971 dfa
->nodes
= new_array
;
974 int *new_firsts
, *new_nexts
;
975 re_node_set
*new_edests
, *new_eclosures
, *new_inveclosures
;
977 new_firsts
= re_realloc (dfa
->firsts
, int, dfa
->nodes_alloc
);
978 new_nexts
= re_realloc (dfa
->nexts
, int, dfa
->nodes_alloc
);
979 new_edests
= re_realloc (dfa
->edests
, re_node_set
, dfa
->nodes_alloc
);
980 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
,
982 new_inveclosures
= re_realloc (dfa
->inveclosures
, re_node_set
,
984 if (BE (new_firsts
== NULL
|| new_nexts
== NULL
|| new_edests
== NULL
985 || new_eclosures
== NULL
|| new_inveclosures
== NULL
, 0))
987 dfa
->firsts
= new_firsts
;
988 dfa
->nexts
= new_nexts
;
989 dfa
->edests
= new_edests
;
990 dfa
->eclosures
= new_eclosures
;
991 dfa
->inveclosures
= new_inveclosures
;
994 dfa
->nodes
[dfa
->nodes_len
] = token
;
995 dfa
->nodes
[dfa
->nodes_len
].duplicated
= 0;
996 return dfa
->nodes_len
++;
999 static unsigned int inline
1000 calc_state_hash (nodes
, context
)
1001 const re_node_set
*nodes
;
1002 unsigned int context
;
1004 unsigned int hash
= nodes
->nelem
+ context
;
1006 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1007 hash
+= nodes
->elems
[i
];
1011 /* Search for the state whose node_set is equivalent to NODES.
1012 Return the pointer to the state, if we found it in the DFA.
1013 Otherwise create the new one and return it. In case of an error
1014 return NULL and set the error code in ERR.
1015 Note: - We assume NULL as the invalid state, then it is possible that
1016 return value is NULL and ERR is REG_NOERROR.
1017 - We never return non-NULL value in case of any errors, it is for
1020 static re_dfastate_t
*
1021 re_acquire_state (err
, dfa
, nodes
)
1024 const re_node_set
*nodes
;
1027 re_dfastate_t
*new_state
;
1028 struct re_state_table_entry
*spot
;
1030 if (BE (nodes
->nelem
== 0, 0))
1035 hash
= calc_state_hash (nodes
, 0);
1036 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1038 for (i
= 0 ; i
< spot
->num
; i
++)
1040 re_dfastate_t
*state
= spot
->array
[i
];
1041 if (hash
!= state
->hash
)
1043 if (re_node_set_compare (&state
->nodes
, nodes
))
1047 /* There are no appropriate state in the dfa, create the new one. */
1048 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1049 if (BE (new_state
!= NULL
, 1))
1058 /* Search for the state whose node_set is equivalent to NODES and
1059 whose context is equivalent to CONTEXT.
1060 Return the pointer to the state, if we found it in the DFA.
1061 Otherwise create the new one and return it. In case of an error
1062 return NULL and set the error code in ERR.
1063 Note: - We assume NULL as the invalid state, then it is possible that
1064 return value is NULL and ERR is REG_NOERROR.
1065 - We never return non-NULL value in case of any errors, it is for
1068 static re_dfastate_t
*
1069 re_acquire_state_context (err
, dfa
, nodes
, context
)
1072 const re_node_set
*nodes
;
1073 unsigned int context
;
1076 re_dfastate_t
*new_state
;
1077 struct re_state_table_entry
*spot
;
1079 if (nodes
->nelem
== 0)
1084 hash
= calc_state_hash (nodes
, context
);
1085 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1087 for (i
= 0 ; i
< spot
->num
; i
++)
1089 re_dfastate_t
*state
= spot
->array
[i
];
1090 if (hash
!= state
->hash
)
1092 if (re_node_set_compare (state
->entrance_nodes
, nodes
)
1093 && state
->context
== context
)
1096 /* There are no appropriate state in `dfa', create the new one. */
1097 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1098 if (BE (new_state
!= NULL
, 1))
1107 /* Allocate memory for DFA state and initialize common properties.
1108 Return the new state if succeeded, otherwise return NULL. */
1110 static re_dfastate_t
*
1111 create_newstate_common (dfa
, nodes
, hash
)
1113 const re_node_set
*nodes
;
1116 re_dfastate_t
*newstate
;
1117 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1118 if (BE (newstate
== NULL
, 0))
1120 re_node_set_init_copy (&newstate
->nodes
, nodes
);
1121 newstate
->trtable
= NULL
;
1122 newstate
->trtable_search
= NULL
;
1123 newstate
->hash
= hash
;
1127 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1128 position. Return value indicate the error code if failed. */
1130 static reg_errcode_t
1131 register_state (dfa
, newstate
, hash
)
1133 re_dfastate_t
*newstate
;
1136 struct re_state_table_entry
*spot
;
1137 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1139 if (spot
->alloc
<= spot
->num
)
1141 spot
->alloc
= 2 * spot
->num
+ 2;
1142 spot
->array
= re_realloc (spot
->array
, re_dfastate_t
*, spot
->alloc
);
1143 if (BE (spot
->array
== NULL
, 0))
1146 spot
->array
[spot
->num
++] = newstate
;
1150 /* Create the new state which is independ of contexts.
1151 Return the new state if succeeded, otherwise return NULL. */
1153 static re_dfastate_t
*
1154 create_ci_newstate (dfa
, nodes
, hash
)
1156 const re_node_set
*nodes
;
1161 re_dfastate_t
*newstate
;
1162 newstate
= create_newstate_common (dfa
, nodes
, hash
);
1163 if (BE (newstate
== NULL
, 0))
1165 newstate
->entrance_nodes
= &newstate
->nodes
;
1167 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1169 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1170 re_token_type_t type
= node
->type
;
1171 if (type
== CHARACTER
)
1174 /* If the state has the halt node, the state is a halt state. */
1175 else if (type
== END_OF_RE
)
1177 #ifdef RE_ENABLE_I18N
1178 else if (type
== COMPLEX_BRACKET
1179 || (type
== OP_PERIOD
&& MB_CUR_MAX
> 1))
1180 newstate
->accept_mb
= 1;
1181 #endif /* RE_ENABLE_I18N */
1182 else if (type
== OP_BACK_REF
)
1183 newstate
->has_backref
= 1;
1184 else if (type
== ANCHOR
|| OP_CONTEXT_NODE
)
1186 newstate
->has_constraint
= 1;
1187 if (type
== OP_CONTEXT_NODE
1188 && dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
== END_OF_RE
)
1192 err
= register_state (dfa
, newstate
, hash
);
1193 return (err
!= REG_NOERROR
) ? NULL
: newstate
;
1196 /* Create the new state which is depend on the context CONTEXT.
1197 Return the new state if succeeded, otherwise return NULL. */
1199 static re_dfastate_t
*
1200 create_cd_newstate (dfa
, nodes
, context
, hash
)
1202 const re_node_set
*nodes
;
1203 unsigned int context
, hash
;
1205 int i
, nctx_nodes
= 0;
1207 re_dfastate_t
*newstate
;
1209 newstate
= create_newstate_common (dfa
, nodes
, hash
);
1210 if (BE (newstate
== NULL
, 0))
1212 newstate
->context
= context
;
1213 newstate
->entrance_nodes
= &newstate
->nodes
;
1215 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1217 unsigned int constraint
= 0;
1218 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1219 re_token_type_t type
= node
->type
;
1220 if (type
== CHARACTER
)
1223 /* If the state has the halt node, the state is a halt state. */
1224 else if (type
== END_OF_RE
)
1226 #ifdef RE_ENABLE_I18N
1227 else if (type
== COMPLEX_BRACKET
1228 || (type
== OP_PERIOD
&& MB_CUR_MAX
> 1))
1229 newstate
->accept_mb
= 1;
1230 #endif /* RE_ENABLE_I18N */
1231 else if (type
== OP_BACK_REF
)
1232 newstate
->has_backref
= 1;
1233 else if (type
== ANCHOR
)
1234 constraint
= node
->opr
.ctx_type
;
1235 else if (type
== OP_CONTEXT_NODE
)
1237 re_token_type_t ctype
= dfa
->nodes
[node
->opr
.ctx_info
->entity
].type
;
1238 constraint
= node
->constraint
;
1239 if (ctype
== END_OF_RE
)
1241 else if (ctype
== OP_BACK_REF
)
1242 newstate
->has_backref
= 1;
1243 #ifdef RE_ENABLE_I18N
1244 else if (ctype
== COMPLEX_BRACKET
1245 || (type
== OP_PERIOD
&& MB_CUR_MAX
> 1))
1246 newstate
->accept_mb
= 1;
1247 #endif /* RE_ENABLE_I18N */
1252 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1254 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1255 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1257 re_node_set_init_copy (newstate
->entrance_nodes
, nodes
);
1259 newstate
->has_constraint
= 1;
1262 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1264 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1269 err
= register_state (dfa
, newstate
, hash
);
1270 return (err
!= REG_NOERROR
) ? NULL
: newstate
;