1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str
, int len
,
22 RE_TRANSLATE_TYPE trans
, int icase
,
23 const re_dfa_t
*dfa
) internal_function
;
24 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
25 const re_node_set
*nodes
,
26 unsigned int hash
) internal_function
;
27 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
28 const re_node_set
*nodes
,
30 unsigned int hash
) internal_function
;
33 #undef MAX /* safety */
35 MAX(size_t a
, size_t b
)
37 return (a
> b
? a
: b
);
41 /* Functions for string operation. */
43 /* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
48 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
49 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
54 /* Ensure at least one character fits into the buffers. */
55 if (init_len
< dfa
->mb_cur_max
)
56 init_len
= dfa
->mb_cur_max
;
57 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
58 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
60 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
61 if (BE (ret
!= REG_NOERROR
, 0))
64 pstr
->word_char
= dfa
->word_char
;
65 pstr
->word_ops_used
= dfa
->word_ops_used
;
66 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
67 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
68 pstr
->valid_raw_len
= pstr
->valid_len
;
72 /* This function allocate the buffers, and initialize them. */
76 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
77 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
80 memset (pstr
, '\0', sizeof (re_string_t
));
81 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
85 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
86 if (BE (ret
!= REG_NOERROR
, 0))
89 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
94 if (dfa
->mb_cur_max
> 1)
98 ret
= build_wcs_upper_buffer (pstr
);
99 if (BE (ret
!= REG_NOERROR
, 0))
101 if (pstr
->valid_raw_len
>= len
)
103 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
105 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
106 if (BE (ret
!= REG_NOERROR
, 0))
111 #endif /* RE_ENABLE_I18N */
112 build_upper_buffer (pstr
);
116 #ifdef RE_ENABLE_I18N
117 if (dfa
->mb_cur_max
> 1)
118 build_wcs_buffer (pstr
);
120 #endif /* RE_ENABLE_I18N */
123 re_string_translate_buffer (pstr
);
126 pstr
->valid_len
= pstr
->bufs_len
;
127 pstr
->valid_raw_len
= pstr
->bufs_len
;
135 /* Helper functions for re_string_allocate, and re_string_construct. */
139 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
141 #ifdef RE_ENABLE_I18N
142 if (pstr
->mb_cur_max
> 1)
146 /* Avoid overflow in realloc. */
147 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
148 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
151 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
152 if (BE (new_wcs
== NULL
, 0))
155 if (pstr
->offsets
!= NULL
)
157 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
158 if (BE (new_offsets
== NULL
, 0))
160 pstr
->offsets
= new_offsets
;
163 #endif /* RE_ENABLE_I18N */
164 if (pstr
->mbs_allocated
)
166 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
168 if (BE (new_mbs
== NULL
, 0))
172 pstr
->bufs_len
= new_buf_len
;
179 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
180 RE_TRANSLATE_TYPE trans
, int icase
,
183 pstr
->raw_mbs
= (const unsigned char *) str
;
187 pstr
->icase
= icase
? 1 : 0;
188 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
189 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
190 pstr
->is_utf8
= dfa
->is_utf8
;
191 pstr
->map_notascii
= dfa
->map_notascii
;
192 pstr
->stop
= pstr
->len
;
193 pstr
->raw_stop
= pstr
->stop
;
196 #ifdef RE_ENABLE_I18N
198 /* Build wide character buffer PSTR->WCS.
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
204 a first byte of a multibyte character.
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
211 build_wcs_buffer (re_string_t
*pstr
)
214 unsigned char buf
[MB_LEN_MAX
];
215 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
217 unsigned char buf
[64];
220 int byte_idx
, end_idx
, remain_len
;
223 /* Build the buffers from pstr->valid_len to either pstr->len or
225 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
226 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
231 remain_len
= end_idx
- byte_idx
;
232 prev_st
= pstr
->cur_state
;
233 /* Apply the translation if we need. */
234 if (BE (pstr
->trans
!= NULL
, 0))
238 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
240 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
241 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
243 p
= (const char *) buf
;
246 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
247 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
248 if (BE (mbclen
== (size_t) -2, 0))
250 /* The buffer doesn't have enough space, finish to build. */
251 pstr
->cur_state
= prev_st
;
254 else if (BE (mbclen
== (size_t) -1 || mbclen
== 0, 0))
256 /* We treat these cases as a singlebyte character. */
258 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
259 if (BE (pstr
->trans
!= NULL
, 0))
260 wc
= pstr
->trans
[wc
];
261 pstr
->cur_state
= prev_st
;
264 /* Write wide character and padding. */
265 pstr
->wcs
[byte_idx
++] = wc
;
266 /* Write paddings. */
267 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
268 pstr
->wcs
[byte_idx
++] = WEOF
;
270 pstr
->valid_len
= byte_idx
;
271 pstr
->valid_raw_len
= byte_idx
;
274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
279 build_wcs_upper_buffer (re_string_t
*pstr
)
282 int src_idx
, byte_idx
, end_idx
, remain_len
;
285 char buf
[MB_LEN_MAX
];
286 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
291 byte_idx
= pstr
->valid_len
;
292 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
298 while (byte_idx
< end_idx
)
302 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
303 && mbsinit (&pstr
->cur_state
))
305 /* In case of a singlebyte character. */
307 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
315 remain_len
= end_idx
- byte_idx
;
316 prev_st
= pstr
->cur_state
;
317 mbclen
= __mbrtowc (&wc
,
318 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
319 + byte_idx
), remain_len
, &pstr
->cur_state
);
320 if (BE (mbclen
+ 2 > 2, 1))
328 mbcdlen
= wcrtomb (buf
, wcu
, &prev_st
);
329 if (BE (mbclen
== mbcdlen
, 1))
330 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
338 memcpy (pstr
->mbs
+ byte_idx
,
339 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
340 pstr
->wcs
[byte_idx
++] = wcu
;
341 /* Write paddings. */
342 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
343 pstr
->wcs
[byte_idx
++] = WEOF
;
345 else if (mbclen
== (size_t) -1 || mbclen
== 0)
347 /* It is an invalid character or '\0'. Just use the byte. */
348 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
349 pstr
->mbs
[byte_idx
] = ch
;
350 /* And also cast it to wide char. */
351 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
352 if (BE (mbclen
== (size_t) -1, 0))
353 pstr
->cur_state
= prev_st
;
357 /* The buffer doesn't have enough space, finish to build. */
358 pstr
->cur_state
= prev_st
;
362 pstr
->valid_len
= byte_idx
;
363 pstr
->valid_raw_len
= byte_idx
;
367 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
372 remain_len
= end_idx
- byte_idx
;
373 prev_st
= pstr
->cur_state
;
374 if (BE (pstr
->trans
!= NULL
, 0))
378 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
380 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
381 buf
[i
] = pstr
->trans
[ch
];
383 p
= (const char *) buf
;
386 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
387 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
388 if (BE (mbclen
+ 2 > 2, 1))
396 mbcdlen
= wcrtomb ((char *) buf
, wcu
, &prev_st
);
397 if (BE (mbclen
== mbcdlen
, 1))
398 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
399 else if (mbcdlen
!= (size_t) -1)
403 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
405 pstr
->cur_state
= prev_st
;
409 if (pstr
->offsets
== NULL
)
411 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
413 if (pstr
->offsets
== NULL
)
416 if (!pstr
->offsets_needed
)
418 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
419 pstr
->offsets
[i
] = i
;
420 pstr
->offsets_needed
= 1;
423 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
424 pstr
->wcs
[byte_idx
] = wcu
;
425 pstr
->offsets
[byte_idx
] = src_idx
;
426 for (i
= 1; i
< mbcdlen
; ++i
)
428 pstr
->offsets
[byte_idx
+ i
]
429 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
430 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
432 pstr
->len
+= mbcdlen
- mbclen
;
433 if (pstr
->raw_stop
> src_idx
)
434 pstr
->stop
+= mbcdlen
- mbclen
;
435 end_idx
= (pstr
->bufs_len
> pstr
->len
)
436 ? pstr
->len
: pstr
->bufs_len
;
442 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
445 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
447 if (BE (pstr
->offsets_needed
!= 0, 0))
450 for (i
= 0; i
< mbclen
; ++i
)
451 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
455 pstr
->wcs
[byte_idx
++] = wcu
;
456 /* Write paddings. */
457 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
458 pstr
->wcs
[byte_idx
++] = WEOF
;
460 else if (mbclen
== (size_t) -1 || mbclen
== 0)
462 /* It is an invalid character or '\0'. Just use the byte. */
463 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
465 if (BE (pstr
->trans
!= NULL
, 0))
466 ch
= pstr
->trans
[ch
];
467 pstr
->mbs
[byte_idx
] = ch
;
469 if (BE (pstr
->offsets_needed
!= 0, 0))
470 pstr
->offsets
[byte_idx
] = src_idx
;
473 /* And also cast it to wide char. */
474 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
475 if (BE (mbclen
== (size_t) -1, 0))
476 pstr
->cur_state
= prev_st
;
480 /* The buffer doesn't have enough space, finish to build. */
481 pstr
->cur_state
= prev_st
;
485 pstr
->valid_len
= byte_idx
;
486 pstr
->valid_raw_len
= src_idx
;
490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
504 rawbuf_idx
< new_raw_idx
;)
507 int remain_len
= pstr
->len
- rawbuf_idx
;
508 prev_st
= pstr
->cur_state
;
509 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
510 remain_len
, &pstr
->cur_state
);
511 if (BE (mbclen
== (size_t) -2 || mbclen
== (size_t) -1 || mbclen
== 0, 0))
513 /* We treat these cases as a single byte character. */
514 if (mbclen
== 0 || remain_len
== 0)
517 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
519 pstr
->cur_state
= prev_st
;
523 /* Then proceed the next character. */
524 rawbuf_idx
+= mbclen
;
526 *last_wc
= (wint_t) wc
;
529 #endif /* RE_ENABLE_I18N */
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
536 build_upper_buffer (re_string_t
*pstr
)
538 int char_idx
, end_idx
;
539 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
541 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
543 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
544 if (BE (pstr
->trans
!= NULL
, 0))
545 ch
= pstr
->trans
[ch
];
547 pstr
->mbs
[char_idx
] = toupper (ch
);
549 pstr
->mbs
[char_idx
] = ch
;
551 pstr
->valid_len
= char_idx
;
552 pstr
->valid_raw_len
= char_idx
;
555 /* Apply TRANS to the buffer in PSTR. */
559 re_string_translate_buffer (re_string_t
*pstr
)
561 int buf_idx
, end_idx
;
562 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
564 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
566 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
567 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
570 pstr
->valid_len
= buf_idx
;
571 pstr
->valid_raw_len
= buf_idx
;
574 /* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
580 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
582 int offset
= idx
- pstr
->raw_mbs_idx
;
583 if (BE (offset
< 0, 0))
586 #ifdef RE_ENABLE_I18N
587 if (pstr
->mb_cur_max
> 1)
588 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590 pstr
->len
= pstr
->raw_len
;
591 pstr
->stop
= pstr
->raw_stop
;
593 pstr
->raw_mbs_idx
= 0;
594 pstr
->valid_raw_len
= 0;
595 pstr
->offsets_needed
= 0;
596 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
598 if (!pstr
->mbs_allocated
)
599 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
603 if (BE (offset
!= 0, 1))
605 /* Should the already checked characters be kept? */
606 if (BE (offset
< pstr
->valid_raw_len
, 1))
608 /* Yes, move them to the front of the buffer. */
609 #ifdef RE_ENABLE_I18N
610 if (BE (pstr
->offsets_needed
, 0))
612 int low
= 0, high
= pstr
->valid_len
, mid
;
615 mid
= low
+ (high
- low
) / 2;
616 if (pstr
->offsets
[mid
] > offset
)
618 else if (pstr
->offsets
[mid
] < offset
)
624 if (pstr
->offsets
[mid
] < offset
)
626 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
628 /* This can be quite complicated, so handle specially
629 only the common and easy case where the character with
630 different length representation of lower and upper
631 case is present at or after offset. */
632 if (pstr
->valid_len
> offset
633 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
635 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
636 (pstr
->valid_len
- offset
) * sizeof (wint_t));
637 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
638 pstr
->valid_len
-= offset
;
639 pstr
->valid_raw_len
-= offset
;
640 for (low
= 0; low
< pstr
->valid_len
; low
++)
641 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
645 /* Otherwise, just find out how long the partial multibyte
646 character at offset is and fill it with WEOF/255. */
647 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
648 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
649 pstr
->offsets_needed
= 0;
650 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
652 while (mid
< pstr
->valid_len
)
653 if (pstr
->wcs
[mid
] != WEOF
)
657 if (mid
== pstr
->valid_len
)
661 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
664 for (low
= 0; low
< pstr
->valid_len
; ++low
)
665 pstr
->wcs
[low
] = WEOF
;
666 memset (pstr
->mbs
, 255, pstr
->valid_len
);
669 pstr
->valid_raw_len
= pstr
->valid_len
;
675 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
677 #ifdef RE_ENABLE_I18N
678 if (pstr
->mb_cur_max
> 1)
679 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
680 (pstr
->valid_len
- offset
) * sizeof (wint_t));
681 #endif /* RE_ENABLE_I18N */
682 if (BE (pstr
->mbs_allocated
, 0))
683 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
684 pstr
->valid_len
- offset
);
685 pstr
->valid_len
-= offset
;
686 pstr
->valid_raw_len
-= offset
;
688 assert (pstr
->valid_len
> 0);
694 #ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 int prev_valid_len
= pstr
->valid_len
;
698 if (BE (pstr
->offsets_needed
, 0))
700 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
701 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
702 pstr
->offsets_needed
= 0;
706 #ifdef RE_ENABLE_I18N
707 if (pstr
->mb_cur_max
> 1)
714 const unsigned char *raw
, *p
, *end
;
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
719 end
= raw
+ (offset
- pstr
->mb_cur_max
);
720 if (end
< pstr
->raw_mbs
)
722 p
= raw
+ offset
- 1;
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
728 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
734 for (; p
>= end
; --p
)
735 if ((*p
& 0xc0) != 0x80)
739 int mlen
= raw
+ pstr
->len
- p
;
740 unsigned char buf
[6];
743 if (BE (pstr
->trans
!= NULL
, 0))
745 int i
= mlen
< 6 ? mlen
: 6;
747 buf
[i
] = pstr
->trans
[p
[i
]];
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state
, 0, sizeof (cur_state
));
752 mbclen
= __mbrtowc (&wc2
, (const char *) p
, mlen
,
754 if (raw
+ offset
- p
<= mbclen
755 && mbclen
< (size_t) -2)
757 memset (&pstr
->cur_state
, '\0',
759 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
767 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
770 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
772 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
773 && IS_WIDE_WORD_CHAR (wc
))
775 : ((IS_WIDE_NEWLINE (wc
)
776 && pstr
->newline_anchor
)
777 ? CONTEXT_NEWLINE
: 0));
778 if (BE (pstr
->valid_len
, 0))
780 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
781 pstr
->wcs
[wcs_idx
] = WEOF
;
782 if (pstr
->mbs_allocated
)
783 memset (pstr
->mbs
, 255, pstr
->valid_len
);
785 pstr
->valid_raw_len
= pstr
->valid_len
;
788 #endif /* RE_ENABLE_I18N */
790 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
791 pstr
->valid_raw_len
= 0;
794 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
796 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
797 ? CONTEXT_NEWLINE
: 0));
800 if (!BE (pstr
->mbs_allocated
, 0))
803 pstr
->raw_mbs_idx
= idx
;
805 pstr
->stop
-= offset
;
807 /* Then build the buffers. */
808 #ifdef RE_ENABLE_I18N
809 if (pstr
->mb_cur_max
> 1)
813 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
814 if (BE (ret
!= REG_NOERROR
, 0))
818 build_wcs_buffer (pstr
);
821 #endif /* RE_ENABLE_I18N */
822 if (BE (pstr
->mbs_allocated
, 0))
825 build_upper_buffer (pstr
);
826 else if (pstr
->trans
!= NULL
)
827 re_string_translate_buffer (pstr
);
830 pstr
->valid_len
= pstr
->len
;
837 internal_function
__attribute ((pure
))
838 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
842 /* Handle the common (easiest) cases first. */
843 if (BE (!pstr
->mbs_allocated
, 1))
844 return re_string_peek_byte (pstr
, idx
);
846 #ifdef RE_ENABLE_I18N
847 if (pstr
->mb_cur_max
> 1
848 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
849 return re_string_peek_byte (pstr
, idx
);
852 off
= pstr
->cur_idx
+ idx
;
853 #ifdef RE_ENABLE_I18N
854 if (pstr
->offsets_needed
)
855 off
= pstr
->offsets
[off
];
858 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
860 #ifdef RE_ENABLE_I18N
861 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862 this function returns CAPITAL LETTER I instead of first byte of
863 DOTLESS SMALL LETTER I. The latter would confuse the parser,
864 since peek_byte_case doesn't advance cur_idx in any way. */
865 if (pstr
->offsets_needed
&& !isascii (ch
))
866 return re_string_peek_byte (pstr
, idx
);
873 internal_function
__attribute ((pure
))
874 re_string_fetch_byte_case (re_string_t
*pstr
)
876 if (BE (!pstr
->mbs_allocated
, 1))
877 return re_string_fetch_byte (pstr
);
879 #ifdef RE_ENABLE_I18N
880 if (pstr
->offsets_needed
)
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
891 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
892 return re_string_fetch_byte (pstr
);
894 off
= pstr
->offsets
[pstr
->cur_idx
];
895 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
898 return re_string_fetch_byte (pstr
);
900 re_string_skip_bytes (pstr
,
901 re_string_char_size_at (pstr
, pstr
->cur_idx
));
906 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
911 re_string_destruct (re_string_t
*pstr
)
913 #ifdef RE_ENABLE_I18N
915 re_free (pstr
->offsets
);
916 #endif /* RE_ENABLE_I18N */
917 if (pstr
->mbs_allocated
)
921 /* Return the context at IDX in INPUT. */
925 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input
->tip_context
;
932 if (BE (idx
== input
->len
, 0))
933 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
935 #ifdef RE_ENABLE_I18N
936 if (input
->mb_cur_max
> 1)
940 while(input
->wcs
[wc_idx
] == WEOF
)
943 /* It must not happen. */
944 assert (wc_idx
>= 0);
948 return input
->tip_context
;
950 wc
= input
->wcs
[wc_idx
];
951 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
953 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
954 ? CONTEXT_NEWLINE
: 0);
959 c
= re_string_byte_at (input
, idx
);
960 if (bitset_contain (input
->word_char
, c
))
962 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
966 /* Functions for set operation. */
970 re_node_set_alloc (re_node_set
*set
, int size
)
973 * ADR: valgrind says size can be 0, which then doesn't
974 * free the block of size 0. Harumph. This seems
975 * to work ok, though.
979 memset(set
, 0, sizeof(*set
));
984 set
->elems
= re_malloc (int, size
);
985 if (BE (set
->elems
== NULL
, 0))
992 re_node_set_init_1 (re_node_set
*set
, int elem
)
996 set
->elems
= re_malloc (int, 1);
997 if (BE (set
->elems
== NULL
, 0))
999 set
->alloc
= set
->nelem
= 0;
1002 set
->elems
[0] = elem
;
1006 static reg_errcode_t
1008 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
1011 set
->elems
= re_malloc (int, 2);
1012 if (BE (set
->elems
== NULL
, 0))
1017 set
->elems
[0] = elem1
;
1024 set
->elems
[0] = elem1
;
1025 set
->elems
[1] = elem2
;
1029 set
->elems
[0] = elem2
;
1030 set
->elems
[1] = elem1
;
1036 static reg_errcode_t
1038 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1040 dest
->nelem
= src
->nelem
;
1043 dest
->alloc
= dest
->nelem
;
1044 dest
->elems
= re_malloc (int, dest
->alloc
);
1045 if (BE (dest
->elems
== NULL
, 0))
1047 dest
->alloc
= dest
->nelem
= 0;
1050 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1053 re_node_set_init_empty (dest
);
1057 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1061 static reg_errcode_t
1063 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1064 const re_node_set
*src2
)
1066 int i1
, i2
, is
, id
, delta
, sbase
;
1067 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1070 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071 conservative estimate. */
1072 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1074 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1075 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1076 if (BE (new_elems
== NULL
, 0))
1078 dest
->elems
= new_elems
;
1079 dest
->alloc
= new_alloc
;
1082 /* Find the items in the intersection of SRC1 and SRC2, and copy
1083 into the top of DEST those that are not already in DEST itself. */
1084 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1085 i1
= src1
->nelem
- 1;
1086 i2
= src2
->nelem
- 1;
1087 id
= dest
->nelem
- 1;
1090 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1092 /* Try to find the item in DEST. Maybe we could binary search? */
1093 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1096 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1097 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1099 if (--i1
< 0 || --i2
< 0)
1103 /* Lower the highest of the two items. */
1104 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1116 id
= dest
->nelem
- 1;
1117 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1118 delta
= is
- sbase
+ 1;
1120 /* Now copy. When DELTA becomes zero, the remaining
1121 DEST elements are already in place; this is more or
1122 less the same loop that is in re_node_set_merge. */
1123 dest
->nelem
+= delta
;
1124 if (delta
> 0 && id
>= 0)
1127 if (dest
->elems
[is
] > dest
->elems
[id
])
1129 /* Copy from the top. */
1130 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1136 /* Slide from the bottom. */
1137 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1143 /* Copy remaining SRC elements. */
1144 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1149 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1152 static reg_errcode_t
1154 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1155 const re_node_set
*src2
)
1158 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1160 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1161 dest
->elems
= re_malloc (int, dest
->alloc
);
1162 if (BE (dest
->elems
== NULL
, 0))
1167 if (src1
!= NULL
&& src1
->nelem
> 0)
1168 return re_node_set_init_copy (dest
, src1
);
1169 else if (src2
!= NULL
&& src2
->nelem
> 0)
1170 return re_node_set_init_copy (dest
, src2
);
1172 re_node_set_init_empty (dest
);
1175 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1177 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1179 dest
->elems
[id
++] = src2
->elems
[i2
++];
1182 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1184 dest
->elems
[id
++] = src1
->elems
[i1
++];
1186 if (i1
< src1
->nelem
)
1188 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1189 (src1
->nelem
- i1
) * sizeof (int));
1190 id
+= src1
->nelem
- i1
;
1192 else if (i2
< src2
->nelem
)
1194 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1195 (src2
->nelem
- i2
) * sizeof (int));
1196 id
+= src2
->nelem
- i2
;
1202 /* Calculate the union set of the sets DEST and SRC. And store it to
1203 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1205 static reg_errcode_t
1207 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1209 int is
, id
, sbase
, delta
;
1210 if (src
== NULL
|| src
->nelem
== 0)
1212 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1214 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1215 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1216 if (BE (new_buffer
== NULL
, 0))
1218 dest
->elems
= new_buffer
;
1219 dest
->alloc
= new_alloc
;
1222 if (BE (dest
->nelem
== 0, 0))
1224 dest
->nelem
= src
->nelem
;
1225 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1229 /* Copy into the top of DEST the items of SRC that are not
1230 found in DEST. Maybe we could binary search in DEST? */
1231 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1232 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1234 if (dest
->elems
[id
] == src
->elems
[is
])
1236 else if (dest
->elems
[id
] < src
->elems
[is
])
1237 dest
->elems
[--sbase
] = src
->elems
[is
--];
1238 else /* if (dest->elems[id] > src->elems[is]) */
1244 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1246 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1249 id
= dest
->nelem
- 1;
1250 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1251 delta
= is
- sbase
+ 1;
1255 /* Now copy. When DELTA becomes zero, the remaining
1256 DEST elements are already in place. */
1257 dest
->nelem
+= delta
;
1260 if (dest
->elems
[is
] > dest
->elems
[id
])
1262 /* Copy from the top. */
1263 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1269 /* Slide from the bottom. */
1270 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1273 /* Copy remaining SRC elements. */
1274 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1275 delta
* sizeof (int));
1284 /* Insert the new element ELEM to the re_node_set* SET.
1285 SET should not already have ELEM.
1286 return -1 if an error has occurred, return 1 otherwise. */
1290 re_node_set_insert (re_node_set
*set
, int elem
)
1293 /* In case the set is empty. */
1294 if (set
->alloc
== 0)
1296 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1302 if (BE (set
->nelem
, 0) == 0)
1304 /* We already guaranteed above that set->alloc != 0. */
1305 set
->elems
[0] = elem
;
1310 /* Realloc if we need. */
1311 if (set
->alloc
== set
->nelem
)
1314 set
->alloc
= set
->alloc
* 2;
1315 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1316 if (BE (new_elems
== NULL
, 0))
1318 set
->elems
= new_elems
;
1321 /* Move the elements which follows the new element. Test the
1322 first element separately to skip a check in the inner loop. */
1323 if (elem
< set
->elems
[0])
1326 for (idx
= set
->nelem
; idx
> 0; idx
--)
1327 set
->elems
[idx
] = set
->elems
[idx
- 1];
1331 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1332 set
->elems
[idx
] = set
->elems
[idx
- 1];
1335 /* Insert the new element. */
1336 set
->elems
[idx
] = elem
;
1341 /* Insert the new element ELEM to the re_node_set* SET.
1342 SET should not already have any element greater than or equal to ELEM.
1343 Return -1 if an error has occurred, return 1 otherwise. */
1347 re_node_set_insert_last (re_node_set
*set
, int elem
)
1349 /* Realloc if we need. */
1350 if (set
->alloc
== set
->nelem
)
1353 set
->alloc
= (set
->alloc
+ 1) * 2;
1354 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1355 if (BE (new_elems
== NULL
, 0))
1357 set
->elems
= new_elems
;
1360 /* Insert the new element. */
1361 set
->elems
[set
->nelem
++] = elem
;
1365 /* Compare two node sets SET1 and SET2.
1366 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1369 internal_function
__attribute ((pure
))
1370 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1373 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1375 for (i
= set1
->nelem
; --i
>= 0 ; )
1376 if (set1
->elems
[i
] != set2
->elems
[i
])
1381 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1384 internal_function
__attribute ((pure
))
1385 re_node_set_contains (const re_node_set
*set
, int elem
)
1387 unsigned int idx
, right
, mid
;
1388 if (set
->nelem
<= 0)
1391 /* Binary search the element. */
1393 right
= set
->nelem
- 1;
1396 mid
= idx
+ (right
- idx
) / 2;
1397 if (set
->elems
[mid
] < elem
)
1402 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1407 re_node_set_remove_at (re_node_set
*set
, int idx
)
1409 if (idx
< 0 || idx
>= set
->nelem
)
1412 for (; idx
< set
->nelem
; idx
++)
1413 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1417 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418 Or return -1, if an error has occurred. */
1422 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1424 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1426 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1427 int *new_nexts
, *new_indices
;
1428 re_node_set
*new_edests
, *new_eclosures
;
1429 re_token_t
*new_nodes
;
1431 /* Avoid overflows in realloc. */
1432 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1433 MAX (sizeof (re_node_set
),
1435 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1438 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1439 if (BE (new_nodes
== NULL
, 0))
1441 dfa
->nodes
= new_nodes
;
1442 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1443 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1444 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1445 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1446 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1447 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1449 dfa
->nexts
= new_nexts
;
1450 dfa
->org_indices
= new_indices
;
1451 dfa
->edests
= new_edests
;
1452 dfa
->eclosures
= new_eclosures
;
1453 dfa
->nodes_alloc
= new_nodes_alloc
;
1455 dfa
->nodes
[dfa
->nodes_len
] = token
;
1456 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1457 #ifdef RE_ENABLE_I18N
1458 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1459 (token
.type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || token
.type
== COMPLEX_BRACKET
;
1461 dfa
->nexts
[dfa
->nodes_len
] = -1;
1462 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1463 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1464 return dfa
->nodes_len
++;
1467 static inline unsigned int
1469 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1471 unsigned int hash
= nodes
->nelem
+ context
;
1473 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1474 hash
+= nodes
->elems
[i
];
1478 /* Search for the state whose node_set is equivalent to NODES.
1479 Return the pointer to the state, if we found it in the DFA.
1480 Otherwise create the new one and return it. In case of an error
1481 return NULL and set the error code in ERR.
1482 Note: - We assume NULL as the invalid state, then it is possible that
1483 return value is NULL and ERR is REG_NOERROR.
1484 - We never return non-NULL value in case of any errors, it is for
1487 static re_dfastate_t
*
1489 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1490 const re_node_set
*nodes
)
1493 re_dfastate_t
*new_state
;
1494 struct re_state_table_entry
*spot
;
1496 if (BE (nodes
->nelem
== 0, 0))
1501 hash
= calc_state_hash (nodes
, 0);
1502 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1504 for (i
= 0 ; i
< spot
->num
; i
++)
1506 re_dfastate_t
*state
= spot
->array
[i
];
1507 if (hash
!= state
->hash
)
1509 if (re_node_set_compare (&state
->nodes
, nodes
))
1513 /* There are no appropriate state in the dfa, create the new one. */
1514 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1515 if (BE (new_state
== NULL
, 0))
1521 /* Search for the state whose node_set is equivalent to NODES and
1522 whose context is equivalent to CONTEXT.
1523 Return the pointer to the state, if we found it in the DFA.
1524 Otherwise create the new one and return it. In case of an error
1525 return NULL and set the error code in ERR.
1526 Note: - We assume NULL as the invalid state, then it is possible that
1527 return value is NULL and ERR is REG_NOERROR.
1528 - We never return non-NULL value in case of any errors, it is for
1531 static re_dfastate_t
*
1533 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1534 const re_node_set
*nodes
, unsigned int context
)
1537 re_dfastate_t
*new_state
;
1538 struct re_state_table_entry
*spot
;
1540 if (nodes
->nelem
== 0)
1545 hash
= calc_state_hash (nodes
, context
);
1546 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1548 for (i
= 0 ; i
< spot
->num
; i
++)
1550 re_dfastate_t
*state
= spot
->array
[i
];
1551 if (state
->hash
== hash
1552 && state
->context
== context
1553 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1556 /* There are no appropriate state in `dfa', create the new one. */
1557 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1558 if (BE (new_state
== NULL
, 0))
1564 /* Finish initialization of the new state NEWSTATE, and using its hash value
1565 HASH put in the appropriate bucket of DFA's state table. Return value
1566 indicates the error code if failed. */
1568 static reg_errcode_t
1569 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1572 struct re_state_table_entry
*spot
;
1576 newstate
->hash
= hash
;
1577 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1578 if (BE (err
!= REG_NOERROR
, 0))
1580 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1582 int elem
= newstate
->nodes
.elems
[i
];
1583 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1584 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1588 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1589 if (BE (spot
->alloc
<= spot
->num
, 0))
1591 int new_alloc
= 2 * spot
->num
+ 2;
1592 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1594 if (BE (new_array
== NULL
, 0))
1596 spot
->array
= new_array
;
1597 spot
->alloc
= new_alloc
;
1599 spot
->array
[spot
->num
++] = newstate
;
1604 free_state (re_dfastate_t
*state
)
1606 re_node_set_free (&state
->non_eps_nodes
);
1607 re_node_set_free (&state
->inveclosure
);
1608 if (state
->entrance_nodes
!= &state
->nodes
)
1610 re_node_set_free (state
->entrance_nodes
);
1611 re_free (state
->entrance_nodes
);
1613 re_node_set_free (&state
->nodes
);
1614 re_free (state
->word_trtable
);
1615 re_free (state
->trtable
);
1619 /* Create the new state which is independ of contexts.
1620 Return the new state if succeeded, otherwise return NULL. */
1622 static re_dfastate_t
*
1624 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1629 re_dfastate_t
*newstate
;
1631 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1632 if (BE (newstate
== NULL
, 0))
1634 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1635 if (BE (err
!= REG_NOERROR
, 0))
1641 newstate
->entrance_nodes
= &newstate
->nodes
;
1642 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1644 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1645 re_token_type_t type
= node
->type
;
1646 if (type
== CHARACTER
&& !node
->constraint
)
1648 #ifdef RE_ENABLE_I18N
1649 newstate
->accept_mb
|= node
->accept_mb
;
1650 #endif /* RE_ENABLE_I18N */
1652 /* If the state has the halt node, the state is a halt state. */
1653 if (type
== END_OF_RE
)
1655 else if (type
== OP_BACK_REF
)
1656 newstate
->has_backref
= 1;
1657 else if (type
== ANCHOR
|| node
->constraint
)
1658 newstate
->has_constraint
= 1;
1660 err
= register_state (dfa
, newstate
, hash
);
1661 if (BE (err
!= REG_NOERROR
, 0))
1663 free_state (newstate
);
1669 /* Create the new state which is depend on the context CONTEXT.
1670 Return the new state if succeeded, otherwise return NULL. */
1672 static re_dfastate_t
*
1674 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1675 unsigned int context
, unsigned int hash
)
1677 int i
, nctx_nodes
= 0;
1679 re_dfastate_t
*newstate
;
1681 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1682 if (BE (newstate
== NULL
, 0))
1684 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1685 if (BE (err
!= REG_NOERROR
, 0))
1691 newstate
->context
= context
;
1692 newstate
->entrance_nodes
= &newstate
->nodes
;
1694 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1696 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1697 re_token_type_t type
= node
->type
;
1698 unsigned int constraint
= node
->constraint
;
1700 if (type
== CHARACTER
&& !constraint
)
1702 #ifdef RE_ENABLE_I18N
1703 newstate
->accept_mb
|= node
->accept_mb
;
1704 #endif /* RE_ENABLE_I18N */
1706 /* If the state has the halt node, the state is a halt state. */
1707 if (type
== END_OF_RE
)
1709 else if (type
== OP_BACK_REF
)
1710 newstate
->has_backref
= 1;
1714 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1716 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1717 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1719 free_state (newstate
);
1722 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1726 newstate
->has_constraint
= 1;
1729 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1731 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1736 err
= register_state (dfa
, newstate
, hash
);
1737 if (BE (err
!= REG_NOERROR
, 0))
1739 free_state (newstate
);