1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str
, int len
,
22 RE_TRANSLATE_TYPE trans
, int icase
,
24 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
25 const re_node_set
*nodes
,
27 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
28 const re_node_set
*nodes
,
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
38 __attribute_warn_unused_result__
39 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
40 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len
< dfa
->mb_cur_max
)
47 init_len
= dfa
->mb_cur_max
;
48 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
49 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
51 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
52 if (BE (ret
!= REG_NOERROR
, 0))
55 pstr
->word_char
= dfa
->word_char
;
56 pstr
->word_ops_used
= dfa
->word_ops_used
;
57 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
58 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
59 pstr
->valid_raw_len
= pstr
->valid_len
;
63 /* This function allocate the buffers, and initialize them. */
66 __attribute_warn_unused_result__
67 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
68 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
71 memset (pstr
, '\0', sizeof (re_string_t
));
72 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
76 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
77 if (BE (ret
!= REG_NOERROR
, 0))
80 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
85 if (dfa
->mb_cur_max
> 1)
89 ret
= build_wcs_upper_buffer (pstr
);
90 if (BE (ret
!= REG_NOERROR
, 0))
92 if (pstr
->valid_raw_len
>= len
)
94 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
96 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
97 if (BE (ret
!= REG_NOERROR
, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr
);
107 #ifdef RE_ENABLE_I18N
108 if (dfa
->mb_cur_max
> 1)
109 build_wcs_buffer (pstr
);
111 #endif /* RE_ENABLE_I18N */
114 re_string_translate_buffer (pstr
);
117 pstr
->valid_len
= pstr
->bufs_len
;
118 pstr
->valid_raw_len
= pstr
->bufs_len
;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
129 __attribute_warn_unused_result__
130 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
132 #ifdef RE_ENABLE_I18N
133 if (pstr
->mb_cur_max
> 1)
137 /* Avoid overflow in realloc. */
138 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
139 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
142 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
143 if (BE (new_wcs
== NULL
, 0))
146 if (pstr
->offsets
!= NULL
)
148 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
149 if (BE (new_offsets
== NULL
, 0))
151 pstr
->offsets
= new_offsets
;
154 #endif /* RE_ENABLE_I18N */
155 if (pstr
->mbs_allocated
)
157 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
159 if (BE (new_mbs
== NULL
, 0))
163 pstr
->bufs_len
= new_buf_len
;
169 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
170 RE_TRANSLATE_TYPE trans
, int icase
,
173 pstr
->raw_mbs
= (const unsigned char *) str
;
177 pstr
->icase
= icase
? 1 : 0;
178 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
179 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
180 pstr
->is_utf8
= dfa
->is_utf8
;
181 pstr
->map_notascii
= dfa
->map_notascii
;
182 pstr
->stop
= pstr
->len
;
183 pstr
->raw_stop
= pstr
->stop
;
186 #ifdef RE_ENABLE_I18N
188 /* Build wide character buffer PSTR->WCS.
189 If the byte sequence of the string are:
190 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
191 Then wide character buffer will be:
192 <wc1> , WEOF , <wc2> , WEOF , <wc3>
193 We use WEOF for padding, they indicate that the position isn't
194 a first byte of a multibyte character.
196 Note that this function assumes PSTR->VALID_LEN elements are already
197 built and starts from PSTR->VALID_LEN. */
200 build_wcs_buffer (re_string_t
*pstr
)
203 unsigned char buf
[MB_LEN_MAX
];
204 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
206 unsigned char buf
[64];
209 int byte_idx
, end_idx
, remain_len
;
212 /* Build the buffers from pstr->valid_len to either pstr->len or
214 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
215 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
220 remain_len
= end_idx
- byte_idx
;
221 prev_st
= pstr
->cur_state
;
222 /* Apply the translation if we need. */
223 if (BE (pstr
->trans
!= NULL
, 0))
227 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
229 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
230 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
232 p
= (const char *) buf
;
235 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
236 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
237 if (BE (mbclen
== (size_t) -1 || mbclen
== 0
238 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
), 0))
240 /* We treat these cases as a singlebyte character. */
242 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
243 if (BE (pstr
->trans
!= NULL
, 0))
244 wc
= pstr
->trans
[wc
];
245 pstr
->cur_state
= prev_st
;
247 else if (BE (mbclen
== (size_t) -2, 0))
249 /* The buffer doesn't have enough space, finish to build. */
250 pstr
->cur_state
= prev_st
;
254 /* Write wide character and padding. */
255 pstr
->wcs
[byte_idx
++] = wc
;
256 /* Write paddings. */
257 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
258 pstr
->wcs
[byte_idx
++] = WEOF
;
260 pstr
->valid_len
= byte_idx
;
261 pstr
->valid_raw_len
= byte_idx
;
264 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
265 but for REG_ICASE. */
268 __attribute_warn_unused_result__
269 build_wcs_upper_buffer (re_string_t
*pstr
)
272 int src_idx
, byte_idx
, end_idx
, remain_len
;
275 char buf
[MB_LEN_MAX
];
276 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
281 byte_idx
= pstr
->valid_len
;
282 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
284 /* The following optimization assumes that ASCII characters can be
285 mapped to wide characters with a simple cast. */
286 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
288 while (byte_idx
< end_idx
)
292 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
293 && mbsinit (&pstr
->cur_state
))
295 /* In case of a singlebyte character. */
297 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
298 /* The next step uses the assumption that wchar_t is encoded
299 ASCII-safe: all ASCII values can be converted like this. */
300 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
305 remain_len
= end_idx
- byte_idx
;
306 prev_st
= pstr
->cur_state
;
307 mbclen
= __mbrtowc (&wc
,
308 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
309 + byte_idx
), remain_len
, &pstr
->cur_state
);
310 if (BE (mbclen
+ 2 > 2, 1))
317 wcu
= __towupper (wc
);
318 mbcdlen
= __wcrtomb (buf
, wcu
, &prev_st
);
319 if (BE (mbclen
== mbcdlen
, 1))
320 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
328 memcpy (pstr
->mbs
+ byte_idx
,
329 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
330 pstr
->wcs
[byte_idx
++] = wcu
;
331 /* Write paddings. */
332 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
333 pstr
->wcs
[byte_idx
++] = WEOF
;
335 else if (mbclen
== (size_t) -1 || mbclen
== 0
336 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
338 /* It is an invalid character, an incomplete character
339 at the end of the string, or '\0'. Just use the byte. */
340 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
341 pstr
->mbs
[byte_idx
] = ch
;
342 /* And also cast it to wide char. */
343 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
344 if (BE (mbclen
== (size_t) -1, 0))
345 pstr
->cur_state
= prev_st
;
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr
->cur_state
= prev_st
;
354 pstr
->valid_len
= byte_idx
;
355 pstr
->valid_raw_len
= byte_idx
;
359 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
364 remain_len
= end_idx
- byte_idx
;
365 prev_st
= pstr
->cur_state
;
366 if (BE (pstr
->trans
!= NULL
, 0))
370 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
372 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
373 buf
[i
] = pstr
->trans
[ch
];
375 p
= (const char *) buf
;
378 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
379 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
380 if (BE (mbclen
+ 2 > 2, 1))
387 wcu
= __towupper (wc
);
388 mbcdlen
= __wcrtomb ((char *) buf
, wcu
, &prev_st
);
389 if (BE (mbclen
== mbcdlen
, 1))
390 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
391 else if (mbcdlen
!= (size_t) -1)
395 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
397 pstr
->cur_state
= prev_st
;
401 if (pstr
->offsets
== NULL
)
403 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
405 if (pstr
->offsets
== NULL
)
408 if (!pstr
->offsets_needed
)
410 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
411 pstr
->offsets
[i
] = i
;
412 pstr
->offsets_needed
= 1;
415 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
416 pstr
->wcs
[byte_idx
] = wcu
;
417 pstr
->offsets
[byte_idx
] = src_idx
;
418 for (i
= 1; i
< mbcdlen
; ++i
)
420 pstr
->offsets
[byte_idx
+ i
]
421 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
422 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
424 pstr
->len
+= mbcdlen
- mbclen
;
425 if (pstr
->raw_stop
> src_idx
)
426 pstr
->stop
+= mbcdlen
- mbclen
;
427 end_idx
= (pstr
->bufs_len
> pstr
->len
)
428 ? pstr
->len
: pstr
->bufs_len
;
434 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
437 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
439 if (BE (pstr
->offsets_needed
!= 0, 0))
442 for (i
= 0; i
< mbclen
; ++i
)
443 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
447 pstr
->wcs
[byte_idx
++] = wcu
;
448 /* Write paddings. */
449 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
450 pstr
->wcs
[byte_idx
++] = WEOF
;
452 else if (mbclen
== (size_t) -1 || mbclen
== 0
453 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
455 /* It is an invalid character or '\0'. Just use the byte. */
456 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
458 if (BE (pstr
->trans
!= NULL
, 0))
459 ch
= pstr
->trans
[ch
];
460 pstr
->mbs
[byte_idx
] = ch
;
462 if (BE (pstr
->offsets_needed
!= 0, 0))
463 pstr
->offsets
[byte_idx
] = src_idx
;
466 /* And also cast it to wide char. */
467 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
468 if (BE (mbclen
== (size_t) -1, 0))
469 pstr
->cur_state
= prev_st
;
473 /* The buffer doesn't have enough space, finish to build. */
474 pstr
->cur_state
= prev_st
;
478 pstr
->valid_len
= byte_idx
;
479 pstr
->valid_raw_len
= src_idx
;
483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
496 rawbuf_idx
< new_raw_idx
;)
499 int remain_len
= pstr
->raw_len
- rawbuf_idx
;
500 prev_st
= pstr
->cur_state
;
501 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
502 remain_len
, &pstr
->cur_state
);
503 if (BE ((ssize_t
) mbclen
<= 0, 0))
505 /* We treat these cases as a single byte character. */
506 if (mbclen
== 0 || remain_len
== 0)
509 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
511 pstr
->cur_state
= prev_st
;
515 /* Then proceed the next character. */
516 rawbuf_idx
+= mbclen
;
521 #endif /* RE_ENABLE_I18N */
523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
524 This function is used in case of REG_ICASE. */
527 build_upper_buffer (re_string_t
*pstr
)
529 int char_idx
, end_idx
;
530 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
532 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
534 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
535 if (BE (pstr
->trans
!= NULL
, 0))
536 ch
= pstr
->trans
[ch
];
538 pstr
->mbs
[char_idx
] = toupper (ch
);
540 pstr
->mbs
[char_idx
] = ch
;
542 pstr
->valid_len
= char_idx
;
543 pstr
->valid_raw_len
= char_idx
;
546 /* Apply TRANS to the buffer in PSTR. */
549 re_string_translate_buffer (re_string_t
*pstr
)
551 int buf_idx
, end_idx
;
552 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
554 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
556 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
557 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
560 pstr
->valid_len
= buf_idx
;
561 pstr
->valid_raw_len
= buf_idx
;
564 /* This function re-construct the buffers.
565 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566 convert to upper case in case of REG_ICASE, apply translation. */
569 __attribute_warn_unused_result__
570 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
572 int offset
= idx
- pstr
->raw_mbs_idx
;
573 if (BE (offset
< 0, 0))
576 #ifdef RE_ENABLE_I18N
577 if (pstr
->mb_cur_max
> 1)
578 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
579 #endif /* RE_ENABLE_I18N */
580 pstr
->len
= pstr
->raw_len
;
581 pstr
->stop
= pstr
->raw_stop
;
583 pstr
->raw_mbs_idx
= 0;
584 pstr
->valid_raw_len
= 0;
585 pstr
->offsets_needed
= 0;
586 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
587 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
588 if (!pstr
->mbs_allocated
)
589 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
593 if (BE (offset
!= 0, 1))
595 /* Should the already checked characters be kept? */
596 if (BE (offset
< pstr
->valid_raw_len
, 1))
598 /* Yes, move them to the front of the buffer. */
599 #ifdef RE_ENABLE_I18N
600 if (BE (pstr
->offsets_needed
, 0))
602 int low
= 0, high
= pstr
->valid_len
, mid
;
605 mid
= (high
+ low
) / 2;
606 if (pstr
->offsets
[mid
] > offset
)
608 else if (pstr
->offsets
[mid
] < offset
)
614 if (pstr
->offsets
[mid
] < offset
)
616 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
618 /* This can be quite complicated, so handle specially
619 only the common and easy case where the character with
620 different length representation of lower and upper
621 case is present at or after offset. */
622 if (pstr
->valid_len
> offset
623 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
625 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
626 (pstr
->valid_len
- offset
) * sizeof (wint_t));
627 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
628 pstr
->valid_len
-= offset
;
629 pstr
->valid_raw_len
-= offset
;
630 for (low
= 0; low
< pstr
->valid_len
; low
++)
631 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
635 /* Otherwise, just find out how long the partial multibyte
636 character at offset is and fill it with WEOF/255. */
637 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
638 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
639 pstr
->offsets_needed
= 0;
640 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
642 while (mid
< pstr
->valid_len
)
643 if (pstr
->wcs
[mid
] != WEOF
)
647 if (mid
== pstr
->valid_len
)
651 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
654 for (low
= 0; low
< pstr
->valid_len
; ++low
)
655 pstr
->wcs
[low
] = WEOF
;
656 memset (pstr
->mbs
, 255, pstr
->valid_len
);
659 pstr
->valid_raw_len
= pstr
->valid_len
;
665 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
667 #ifdef RE_ENABLE_I18N
668 if (pstr
->mb_cur_max
> 1)
669 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
670 (pstr
->valid_len
- offset
) * sizeof (wint_t));
671 #endif /* RE_ENABLE_I18N */
672 if (BE (pstr
->mbs_allocated
, 0))
673 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
674 pstr
->valid_len
- offset
);
675 pstr
->valid_len
-= offset
;
676 pstr
->valid_raw_len
-= offset
;
677 #if defined DEBUG && DEBUG
678 assert (pstr
->valid_len
> 0);
684 /* No, skip all characters until IDX. */
685 int prev_valid_len
= pstr
->valid_len
;
687 #ifdef RE_ENABLE_I18N
688 if (BE (pstr
->offsets_needed
, 0))
690 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
691 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
692 pstr
->offsets_needed
= 0;
696 #ifdef RE_ENABLE_I18N
697 if (pstr
->mb_cur_max
> 1)
704 const unsigned char *raw
, *p
, *end
;
706 /* Special case UTF-8. Multi-byte chars start with any
707 byte other than 0x80 - 0xbf. */
708 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
709 end
= raw
+ (offset
- pstr
->mb_cur_max
);
710 if (end
< pstr
->raw_mbs
)
712 p
= raw
+ offset
- 1;
714 /* We know the wchar_t encoding is UCS4, so for the simple
715 case, ASCII characters, skip the conversion step. */
716 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
718 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
719 /* pstr->valid_len = 0; */
724 for (; p
>= end
; --p
)
725 if ((*p
& 0xc0) != 0x80)
729 int mlen
= raw
+ pstr
->len
- p
;
730 unsigned char buf
[6];
733 const unsigned char *pp
= p
;
734 if (BE (pstr
->trans
!= NULL
, 0))
736 int i
= mlen
< 6 ? mlen
: 6;
738 buf
[i
] = pstr
->trans
[p
[i
]];
741 /* XXX Don't use mbrtowc, we know which conversion
742 to use (UTF-8 -> UCS4). */
743 memset (&cur_state
, 0, sizeof (cur_state
));
744 mbclen
= __mbrtowc (&wc2
, (const char *) pp
, mlen
,
746 if (raw
+ offset
- p
<= mbclen
747 && mbclen
< (size_t) -2)
749 memset (&pstr
->cur_state
, '\0',
751 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
759 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
762 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
764 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
765 && IS_WIDE_WORD_CHAR (wc
))
767 : ((IS_WIDE_NEWLINE (wc
)
768 && pstr
->newline_anchor
)
769 ? CONTEXT_NEWLINE
: 0));
770 if (BE (pstr
->valid_len
, 0))
772 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
773 pstr
->wcs
[wcs_idx
] = WEOF
;
774 if (pstr
->mbs_allocated
)
775 memset (pstr
->mbs
, 255, pstr
->valid_len
);
777 pstr
->valid_raw_len
= pstr
->valid_len
;
780 #endif /* RE_ENABLE_I18N */
782 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
783 pstr
->valid_raw_len
= 0;
786 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
788 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
789 ? CONTEXT_NEWLINE
: 0));
792 if (!BE (pstr
->mbs_allocated
, 0))
795 pstr
->raw_mbs_idx
= idx
;
797 pstr
->stop
-= offset
;
799 /* Then build the buffers. */
800 #ifdef RE_ENABLE_I18N
801 if (pstr
->mb_cur_max
> 1)
805 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
806 if (BE (ret
!= REG_NOERROR
, 0))
810 build_wcs_buffer (pstr
);
813 #endif /* RE_ENABLE_I18N */
814 if (BE (pstr
->mbs_allocated
, 0))
817 build_upper_buffer (pstr
);
818 else if (pstr
->trans
!= NULL
)
819 re_string_translate_buffer (pstr
);
822 pstr
->valid_len
= pstr
->len
;
830 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
834 /* Handle the common (easiest) cases first. */
835 if (BE (!pstr
->mbs_allocated
, 1))
836 return re_string_peek_byte (pstr
, idx
);
838 #ifdef RE_ENABLE_I18N
839 if (pstr
->mb_cur_max
> 1
840 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
841 return re_string_peek_byte (pstr
, idx
);
844 off
= pstr
->cur_idx
+ idx
;
845 #ifdef RE_ENABLE_I18N
846 if (pstr
->offsets_needed
)
847 off
= pstr
->offsets
[off
];
850 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
852 #ifdef RE_ENABLE_I18N
853 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
854 this function returns CAPITAL LETTER I instead of first byte of
855 DOTLESS SMALL LETTER I. The latter would confuse the parser,
856 since peek_byte_case doesn't advance cur_idx in any way. */
857 if (pstr
->offsets_needed
&& !isascii (ch
))
858 return re_string_peek_byte (pstr
, idx
);
865 re_string_fetch_byte_case (re_string_t
*pstr
)
867 if (BE (!pstr
->mbs_allocated
, 1))
868 return re_string_fetch_byte (pstr
);
870 #ifdef RE_ENABLE_I18N
871 if (pstr
->offsets_needed
)
875 /* For tr_TR.UTF-8 [[:islower:]] there is
876 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
877 in that case the whole multi-byte character and return
878 the original letter. On the other side, with
879 [[: DOTLESS SMALL LETTER I return [[:I, as doing
880 anything else would complicate things too much. */
882 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
883 return re_string_fetch_byte (pstr
);
885 off
= pstr
->offsets
[pstr
->cur_idx
];
886 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
889 return re_string_fetch_byte (pstr
);
891 re_string_skip_bytes (pstr
,
892 re_string_char_size_at (pstr
, pstr
->cur_idx
));
897 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
901 re_string_destruct (re_string_t
*pstr
)
903 #ifdef RE_ENABLE_I18N
905 re_free (pstr
->offsets
);
906 #endif /* RE_ENABLE_I18N */
907 if (pstr
->mbs_allocated
)
911 /* Return the context at IDX in INPUT. */
914 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
918 /* In this case, we use the value stored in input->tip_context,
919 since we can't know the character in input->mbs[-1] here. */
920 return input
->tip_context
;
921 if (BE (idx
== input
->len
, 0))
922 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
923 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
924 #ifdef RE_ENABLE_I18N
925 if (input
->mb_cur_max
> 1)
929 while(input
->wcs
[wc_idx
] == WEOF
)
931 #if defined DEBUG && DEBUG
932 /* It must not happen. */
933 assert (wc_idx
>= 0);
937 return input
->tip_context
;
939 wc
= input
->wcs
[wc_idx
];
940 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
942 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
943 ? CONTEXT_NEWLINE
: 0);
948 c
= re_string_byte_at (input
, idx
);
949 if (bitset_contain (input
->word_char
, c
))
951 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
955 /* Functions for set operation. */
958 __attribute_warn_unused_result__
959 re_node_set_alloc (re_node_set
*set
, int size
)
963 set
->elems
= re_malloc (int, size
);
964 if (BE (set
->elems
== NULL
, 0))
970 __attribute_warn_unused_result__
971 re_node_set_init_1 (re_node_set
*set
, int elem
)
975 set
->elems
= re_malloc (int, 1);
976 if (BE (set
->elems
== NULL
, 0))
978 set
->alloc
= set
->nelem
= 0;
981 set
->elems
[0] = elem
;
986 __attribute_warn_unused_result__
987 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
990 set
->elems
= re_malloc (int, 2);
991 if (BE (set
->elems
== NULL
, 0))
996 set
->elems
[0] = elem1
;
1003 set
->elems
[0] = elem1
;
1004 set
->elems
[1] = elem2
;
1008 set
->elems
[0] = elem2
;
1009 set
->elems
[1] = elem1
;
1015 static reg_errcode_t
1016 __attribute_warn_unused_result__
1017 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1019 dest
->nelem
= src
->nelem
;
1022 dest
->alloc
= dest
->nelem
;
1023 dest
->elems
= re_malloc (int, dest
->alloc
);
1024 if (BE (dest
->elems
== NULL
, 0))
1026 dest
->alloc
= dest
->nelem
= 0;
1029 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1032 re_node_set_init_empty (dest
);
1036 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1037 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1038 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1040 static reg_errcode_t
1041 __attribute_warn_unused_result__
1042 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1043 const re_node_set
*src2
)
1045 int i1
, i2
, is
, id
, delta
, sbase
;
1046 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1049 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1050 conservative estimate. */
1051 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1053 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1054 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1055 if (BE (new_elems
== NULL
, 0))
1057 dest
->elems
= new_elems
;
1058 dest
->alloc
= new_alloc
;
1061 /* Find the items in the intersection of SRC1 and SRC2, and copy
1062 into the top of DEST those that are not already in DEST itself. */
1063 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1064 i1
= src1
->nelem
- 1;
1065 i2
= src2
->nelem
- 1;
1066 id
= dest
->nelem
- 1;
1069 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1071 /* Try to find the item in DEST. Maybe we could binary search? */
1072 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1075 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1076 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1078 if (--i1
< 0 || --i2
< 0)
1082 /* Lower the highest of the two items. */
1083 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1095 id
= dest
->nelem
- 1;
1096 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1097 delta
= is
- sbase
+ 1;
1099 /* Now copy. When DELTA becomes zero, the remaining
1100 DEST elements are already in place; this is more or
1101 less the same loop that is in re_node_set_merge. */
1102 dest
->nelem
+= delta
;
1103 if (delta
> 0 && id
>= 0)
1106 if (dest
->elems
[is
] > dest
->elems
[id
])
1108 /* Copy from the top. */
1109 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1115 /* Slide from the bottom. */
1116 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1122 /* Copy remaining SRC elements. */
1123 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1128 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1129 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1131 static reg_errcode_t
1132 __attribute_warn_unused_result__
1133 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1134 const re_node_set
*src2
)
1137 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1139 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1140 dest
->elems
= re_malloc (int, dest
->alloc
);
1141 if (BE (dest
->elems
== NULL
, 0))
1146 if (src1
!= NULL
&& src1
->nelem
> 0)
1147 return re_node_set_init_copy (dest
, src1
);
1148 else if (src2
!= NULL
&& src2
->nelem
> 0)
1149 return re_node_set_init_copy (dest
, src2
);
1151 re_node_set_init_empty (dest
);
1154 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1156 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1158 dest
->elems
[id
++] = src2
->elems
[i2
++];
1161 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1163 dest
->elems
[id
++] = src1
->elems
[i1
++];
1165 if (i1
< src1
->nelem
)
1167 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1168 (src1
->nelem
- i1
) * sizeof (int));
1169 id
+= src1
->nelem
- i1
;
1171 else if (i2
< src2
->nelem
)
1173 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1174 (src2
->nelem
- i2
) * sizeof (int));
1175 id
+= src2
->nelem
- i2
;
1181 /* Calculate the union set of the sets DEST and SRC. And store it to
1182 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1184 static reg_errcode_t
1185 __attribute_warn_unused_result__
1186 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1188 int is
, id
, sbase
, delta
;
1189 if (src
== NULL
|| src
->nelem
== 0)
1191 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1193 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1194 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1195 if (BE (new_buffer
== NULL
, 0))
1197 dest
->elems
= new_buffer
;
1198 dest
->alloc
= new_alloc
;
1201 if (BE (dest
->nelem
== 0, 0))
1203 dest
->nelem
= src
->nelem
;
1204 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1208 /* Copy into the top of DEST the items of SRC that are not
1209 found in DEST. Maybe we could binary search in DEST? */
1210 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1211 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1213 if (dest
->elems
[id
] == src
->elems
[is
])
1215 else if (dest
->elems
[id
] < src
->elems
[is
])
1216 dest
->elems
[--sbase
] = src
->elems
[is
--];
1217 else /* if (dest->elems[id] > src->elems[is]) */
1223 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1225 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1228 id
= dest
->nelem
- 1;
1229 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1230 delta
= is
- sbase
+ 1;
1234 /* Now copy. When DELTA becomes zero, the remaining
1235 DEST elements are already in place. */
1236 dest
->nelem
+= delta
;
1239 if (dest
->elems
[is
] > dest
->elems
[id
])
1241 /* Copy from the top. */
1242 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1248 /* Slide from the bottom. */
1249 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1252 /* Copy remaining SRC elements. */
1253 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1254 delta
* sizeof (int));
1263 /* Insert the new element ELEM to the re_node_set* SET.
1264 SET should not already have ELEM.
1265 return -1 if an error is occured, return 1 otherwise. */
1268 __attribute_warn_unused_result__
1269 re_node_set_insert (re_node_set
*set
, int elem
)
1272 /* In case the set is empty. */
1273 if (set
->alloc
== 0)
1275 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1281 if (BE (set
->nelem
, 0) == 0)
1283 /* We already guaranteed above that set->alloc != 0. */
1284 set
->elems
[0] = elem
;
1289 /* Realloc if we need. */
1290 if (set
->alloc
== set
->nelem
)
1293 set
->alloc
= set
->alloc
* 2;
1294 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1295 if (BE (new_elems
== NULL
, 0))
1297 set
->elems
= new_elems
;
1300 /* Move the elements which follows the new element. Test the
1301 first element separately to skip a check in the inner loop. */
1302 if (elem
< set
->elems
[0])
1305 for (idx
= set
->nelem
; idx
> 0; idx
--)
1306 set
->elems
[idx
] = set
->elems
[idx
- 1];
1310 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1311 set
->elems
[idx
] = set
->elems
[idx
- 1];
1314 /* Insert the new element. */
1315 set
->elems
[idx
] = elem
;
1320 /* Insert the new element ELEM to the re_node_set* SET.
1321 SET should not already have any element greater than or equal to ELEM.
1322 Return -1 if an error is occured, return 1 otherwise. */
1325 __attribute_warn_unused_result__
1326 re_node_set_insert_last (re_node_set
*set
, int elem
)
1328 /* Realloc if we need. */
1329 if (set
->alloc
== set
->nelem
)
1332 set
->alloc
= (set
->alloc
+ 1) * 2;
1333 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1334 if (BE (new_elems
== NULL
, 0))
1336 set
->elems
= new_elems
;
1339 /* Insert the new element. */
1340 set
->elems
[set
->nelem
++] = elem
;
1344 /* Compare two node sets SET1 and SET2.
1345 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1348 __attribute ((pure
))
1349 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1352 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1354 for (i
= set1
->nelem
; --i
>= 0 ; )
1355 if (set1
->elems
[i
] != set2
->elems
[i
])
1360 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1363 __attribute ((pure
))
1364 re_node_set_contains (const re_node_set
*set
, int elem
)
1366 unsigned int idx
, right
, mid
;
1367 if (set
->nelem
<= 0)
1370 /* Binary search the element. */
1372 right
= set
->nelem
- 1;
1375 mid
= (idx
+ right
) / 2;
1376 if (set
->elems
[mid
] < elem
)
1381 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1385 re_node_set_remove_at (re_node_set
*set
, int idx
)
1387 if (idx
< 0 || idx
>= set
->nelem
)
1390 for (; idx
< set
->nelem
; idx
++)
1391 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1395 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1396 Or return -1, if an error will be occured. */
1399 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1401 int type
= token
.type
;
1402 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1404 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1405 int *new_nexts
, *new_indices
;
1406 re_node_set
*new_edests
, *new_eclosures
;
1407 re_token_t
*new_nodes
;
1409 /* Avoid overflows in realloc. */
1410 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1411 MAX (sizeof (re_node_set
),
1413 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1416 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1417 if (BE (new_nodes
== NULL
, 0))
1419 dfa
->nodes
= new_nodes
;
1420 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1421 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1422 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1423 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1424 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1425 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1427 dfa
->nexts
= new_nexts
;
1428 dfa
->org_indices
= new_indices
;
1429 dfa
->edests
= new_edests
;
1430 dfa
->eclosures
= new_eclosures
;
1431 dfa
->nodes_alloc
= new_nodes_alloc
;
1433 dfa
->nodes
[dfa
->nodes_len
] = token
;
1434 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1435 #ifdef RE_ENABLE_I18N
1436 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1437 (type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || type
== COMPLEX_BRACKET
;
1439 dfa
->nexts
[dfa
->nodes_len
] = -1;
1440 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1441 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1442 return dfa
->nodes_len
++;
1445 static inline unsigned int
1446 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1448 unsigned int hash
= nodes
->nelem
+ context
;
1450 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1451 hash
+= nodes
->elems
[i
];
1455 /* Search for the state whose node_set is equivalent to NODES.
1456 Return the pointer to the state, if we found it in the DFA.
1457 Otherwise create the new one and return it. In case of an error
1458 return NULL and set the error code in ERR.
1459 Note: - We assume NULL as the invalid state, then it is possible that
1460 return value is NULL and ERR is REG_NOERROR.
1461 - We never return non-NULL value in case of any errors, it is for
1464 static re_dfastate_t
*
1465 __attribute_warn_unused_result__
1466 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1467 const re_node_set
*nodes
)
1470 re_dfastate_t
*new_state
;
1471 struct re_state_table_entry
*spot
;
1473 if (BE (nodes
->nelem
== 0, 0))
1478 hash
= calc_state_hash (nodes
, 0);
1479 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1481 for (i
= 0 ; i
< spot
->num
; i
++)
1483 re_dfastate_t
*state
= spot
->array
[i
];
1484 if (hash
!= state
->hash
)
1486 if (re_node_set_compare (&state
->nodes
, nodes
))
1490 /* There are no appropriate state in the dfa, create the new one. */
1491 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1492 if (BE (new_state
== NULL
, 0))
1498 /* Search for the state whose node_set is equivalent to NODES and
1499 whose context is equivalent to CONTEXT.
1500 Return the pointer to the state, if we found it in the DFA.
1501 Otherwise create the new one and return it. In case of an error
1502 return NULL and set the error code in ERR.
1503 Note: - We assume NULL as the invalid state, then it is possible that
1504 return value is NULL and ERR is REG_NOERROR.
1505 - We never return non-NULL value in case of any errors, it is for
1508 static re_dfastate_t
*
1509 __attribute_warn_unused_result__
1510 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1511 const re_node_set
*nodes
, unsigned int context
)
1514 re_dfastate_t
*new_state
;
1515 struct re_state_table_entry
*spot
;
1517 if (nodes
->nelem
== 0)
1522 hash
= calc_state_hash (nodes
, context
);
1523 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1525 for (i
= 0 ; i
< spot
->num
; i
++)
1527 re_dfastate_t
*state
= spot
->array
[i
];
1528 if (state
->hash
== hash
1529 && state
->context
== context
1530 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1533 /* There are no appropriate state in `dfa', create the new one. */
1534 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1535 if (BE (new_state
== NULL
, 0))
1541 /* Finish initialization of the new state NEWSTATE, and using its hash value
1542 HASH put in the appropriate bucket of DFA's state table. Return value
1543 indicates the error code if failed. */
1545 static reg_errcode_t
1546 __attribute_warn_unused_result__
1547 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1550 struct re_state_table_entry
*spot
;
1554 newstate
->hash
= hash
;
1555 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1556 if (BE (err
!= REG_NOERROR
, 0))
1558 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1560 int elem
= newstate
->nodes
.elems
[i
];
1561 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1562 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1566 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1567 if (BE (spot
->alloc
<= spot
->num
, 0))
1569 int new_alloc
= 2 * spot
->num
+ 2;
1570 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1572 if (BE (new_array
== NULL
, 0))
1574 spot
->array
= new_array
;
1575 spot
->alloc
= new_alloc
;
1577 spot
->array
[spot
->num
++] = newstate
;
1582 free_state (re_dfastate_t
*state
)
1584 re_node_set_free (&state
->non_eps_nodes
);
1585 re_node_set_free (&state
->inveclosure
);
1586 if (state
->entrance_nodes
!= &state
->nodes
)
1588 re_node_set_free (state
->entrance_nodes
);
1589 re_free (state
->entrance_nodes
);
1591 re_node_set_free (&state
->nodes
);
1592 re_free (state
->word_trtable
);
1593 re_free (state
->trtable
);
1597 /* Create the new state which is independ of contexts.
1598 Return the new state if succeeded, otherwise return NULL. */
1600 static re_dfastate_t
*
1601 __attribute_warn_unused_result__
1602 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1607 re_dfastate_t
*newstate
;
1609 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1610 if (BE (newstate
== NULL
, 0))
1612 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1613 if (BE (err
!= REG_NOERROR
, 0))
1619 newstate
->entrance_nodes
= &newstate
->nodes
;
1620 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1622 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1623 re_token_type_t type
= node
->type
;
1624 if (type
== CHARACTER
&& !node
->constraint
)
1626 #ifdef RE_ENABLE_I18N
1627 newstate
->accept_mb
|= node
->accept_mb
;
1628 #endif /* RE_ENABLE_I18N */
1630 /* If the state has the halt node, the state is a halt state. */
1631 if (type
== END_OF_RE
)
1633 else if (type
== OP_BACK_REF
)
1634 newstate
->has_backref
= 1;
1635 else if (type
== ANCHOR
|| node
->constraint
)
1636 newstate
->has_constraint
= 1;
1638 err
= register_state (dfa
, newstate
, hash
);
1639 if (BE (err
!= REG_NOERROR
, 0))
1641 free_state (newstate
);
1647 /* Create the new state which is depend on the context CONTEXT.
1648 Return the new state if succeeded, otherwise return NULL. */
1650 static re_dfastate_t
*
1651 __attribute_warn_unused_result__
1652 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1653 unsigned int context
, unsigned int hash
)
1655 int i
, nctx_nodes
= 0;
1657 re_dfastate_t
*newstate
;
1659 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1660 if (BE (newstate
== NULL
, 0))
1662 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1663 if (BE (err
!= REG_NOERROR
, 0))
1669 newstate
->context
= context
;
1670 newstate
->entrance_nodes
= &newstate
->nodes
;
1672 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1674 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1675 re_token_type_t type
= node
->type
;
1676 unsigned int constraint
= node
->constraint
;
1678 if (type
== CHARACTER
&& !constraint
)
1680 #ifdef RE_ENABLE_I18N
1681 newstate
->accept_mb
|= node
->accept_mb
;
1682 #endif /* RE_ENABLE_I18N */
1684 /* If the state has the halt node, the state is a halt state. */
1685 if (type
== END_OF_RE
)
1687 else if (type
== OP_BACK_REF
)
1688 newstate
->has_backref
= 1;
1692 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1694 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1695 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1697 free_state (newstate
);
1700 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1704 newstate
->has_constraint
= 1;
1707 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1709 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1714 err
= register_state (dfa
, newstate
, hash
);
1715 if (BE (err
!= REG_NOERROR
, 0))
1717 free_state (newstate
);