1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 static void re_string_construct_common (const char *str
, int len
,
23 RE_TRANSLATE_TYPE trans
, int icase
,
24 const re_dfa_t
*dfa
) internal_function
;
25 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
26 const re_node_set
*nodes
,
27 unsigned int hash
) internal_function
;
28 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
29 const re_node_set
*nodes
,
31 unsigned int hash
) internal_function
;
34 #undef MAX /* safety */
36 MAX(size_t a
, size_t b
)
38 return (a
> b
? a
: b
);
42 /* Functions for string operation. */
44 /* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
49 re_string_allocate (re_string_t
*pstr
, const char *str
, int len
, int init_len
,
50 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len
< dfa
->mb_cur_max
)
57 init_len
= dfa
->mb_cur_max
;
58 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
59 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
61 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
62 if (BE (ret
!= REG_NOERROR
, 0))
65 pstr
->word_char
= dfa
->word_char
;
66 pstr
->word_ops_used
= dfa
->word_ops_used
;
67 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
68 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
69 pstr
->valid_raw_len
= pstr
->valid_len
;
73 /* This function allocate the buffers, and initialize them. */
77 re_string_construct (re_string_t
*pstr
, const char *str
, int len
,
78 RE_TRANSLATE_TYPE trans
, int icase
, const re_dfa_t
*dfa
)
81 memset (pstr
, '\0', sizeof (re_string_t
));
82 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
86 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
87 if (BE (ret
!= REG_NOERROR
, 0))
90 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
95 if (dfa
->mb_cur_max
> 1)
99 ret
= build_wcs_upper_buffer (pstr
);
100 if (BE (ret
!= REG_NOERROR
, 0))
102 if (pstr
->valid_raw_len
>= len
)
104 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
106 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
107 if (BE (ret
!= REG_NOERROR
, 0))
112 #endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr
);
117 #ifdef RE_ENABLE_I18N
118 if (dfa
->mb_cur_max
> 1)
119 build_wcs_buffer (pstr
);
121 #endif /* RE_ENABLE_I18N */
124 re_string_translate_buffer (pstr
);
127 pstr
->valid_len
= pstr
->bufs_len
;
128 pstr
->valid_raw_len
= pstr
->bufs_len
;
136 /* Helper functions for re_string_allocate, and re_string_construct. */
140 re_string_realloc_buffers (re_string_t
*pstr
, int new_buf_len
)
142 #ifdef RE_ENABLE_I18N
143 if (pstr
->mb_cur_max
> 1)
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (int));
149 if (BE (SIZE_MAX
/ max_object_size
< new_buf_len
, 0))
152 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
153 if (BE (new_wcs
== NULL
, 0))
156 if (pstr
->offsets
!= NULL
)
158 int *new_offsets
= re_realloc (pstr
->offsets
, int, new_buf_len
);
159 if (BE (new_offsets
== NULL
, 0))
161 pstr
->offsets
= new_offsets
;
164 #endif /* RE_ENABLE_I18N */
165 if (pstr
->mbs_allocated
)
167 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
169 if (BE (new_mbs
== NULL
, 0))
173 pstr
->bufs_len
= new_buf_len
;
180 re_string_construct_common (const char *str
, int len
, re_string_t
*pstr
,
181 RE_TRANSLATE_TYPE trans
, int icase
,
184 pstr
->raw_mbs
= (const unsigned char *) str
;
188 pstr
->icase
= icase
? 1 : 0;
189 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
190 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
191 pstr
->is_utf8
= dfa
->is_utf8
;
192 pstr
->map_notascii
= dfa
->map_notascii
;
193 pstr
->stop
= pstr
->len
;
194 pstr
->raw_stop
= pstr
->stop
;
197 #ifdef RE_ENABLE_I18N
199 /* Build wide character buffer PSTR->WCS.
200 If the byte sequence of the string are:
201 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202 Then wide character buffer will be:
203 <wc1> , WEOF , <wc2> , WEOF , <wc3>
204 We use WEOF for padding, they indicate that the position isn't
205 a first byte of a multibyte character.
207 Note that this function assumes PSTR->VALID_LEN elements are already
208 built and starts from PSTR->VALID_LEN. */
212 build_wcs_buffer (re_string_t
*pstr
)
215 unsigned char buf
[MB_LEN_MAX
];
216 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
218 unsigned char buf
[64];
221 int byte_idx
, end_idx
, remain_len
;
224 /* Build the buffers from pstr->valid_len to either pstr->len or
226 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
227 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
232 remain_len
= end_idx
- byte_idx
;
233 prev_st
= pstr
->cur_state
;
234 /* Apply the translation if we need. */
235 if (BE (pstr
->trans
!= NULL
, 0))
239 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
241 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
242 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
244 p
= (const char *) buf
;
247 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
248 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
249 if (BE (mbclen
== (size_t) -2, 0))
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr
->cur_state
= prev_st
;
255 else if (BE (mbclen
== (size_t) -1 || mbclen
== 0, 0))
257 /* We treat these cases as a singlebyte character. */
259 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
260 if (BE (pstr
->trans
!= NULL
, 0))
261 wc
= pstr
->trans
[wc
];
262 pstr
->cur_state
= prev_st
;
265 /* Write wide character and padding. */
266 pstr
->wcs
[byte_idx
++] = wc
;
267 /* Write paddings. */
268 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
269 pstr
->wcs
[byte_idx
++] = WEOF
;
271 pstr
->valid_len
= byte_idx
;
272 pstr
->valid_raw_len
= byte_idx
;
275 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
276 but for REG_ICASE. */
280 build_wcs_upper_buffer (re_string_t
*pstr
)
283 int src_idx
, byte_idx
, end_idx
, remain_len
;
286 char buf
[MB_LEN_MAX
];
287 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
292 byte_idx
= pstr
->valid_len
;
293 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
295 /* The following optimization assumes that ASCII characters can be
296 mapped to wide characters with a simple cast. */
297 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
299 while (byte_idx
< end_idx
)
303 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
304 && mbsinit (&pstr
->cur_state
))
306 /* In case of a singlebyte character. */
308 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
309 /* The next step uses the assumption that wchar_t is encoded
310 ASCII-safe: all ASCII values can be converted like this. */
311 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
316 remain_len
= end_idx
- byte_idx
;
317 prev_st
= pstr
->cur_state
;
318 mbclen
= __mbrtowc (&wc
,
319 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
320 + byte_idx
), remain_len
, &pstr
->cur_state
);
321 if (BE (mbclen
+ 2 > 2, 1))
329 mbcdlen
= wcrtomb (buf
, wcu
, &prev_st
);
330 if (BE (mbclen
== mbcdlen
, 1))
331 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
339 memcpy (pstr
->mbs
+ byte_idx
,
340 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
341 pstr
->wcs
[byte_idx
++] = wcu
;
342 /* Write paddings. */
343 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
344 pstr
->wcs
[byte_idx
++] = WEOF
;
346 else if (mbclen
== (size_t) -1 || mbclen
== 0)
348 /* It is an invalid character or '\0'. Just use the byte. */
349 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
350 pstr
->mbs
[byte_idx
] = ch
;
351 /* And also cast it to wide char. */
352 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
353 if (BE (mbclen
== (size_t) -1, 0))
354 pstr
->cur_state
= prev_st
;
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr
->cur_state
= prev_st
;
363 pstr
->valid_len
= byte_idx
;
364 pstr
->valid_raw_len
= byte_idx
;
368 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
373 remain_len
= end_idx
- byte_idx
;
374 prev_st
= pstr
->cur_state
;
375 if (BE (pstr
->trans
!= NULL
, 0))
379 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
381 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
382 buf
[i
] = pstr
->trans
[ch
];
384 p
= (const char *) buf
;
387 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
388 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
389 if (BE (mbclen
+ 2 > 2, 1))
397 mbcdlen
= wcrtomb ((char *) buf
, wcu
, &prev_st
);
398 if (BE (mbclen
== mbcdlen
, 1))
399 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
400 else if (mbcdlen
!= (size_t) -1)
404 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
406 pstr
->cur_state
= prev_st
;
410 if (pstr
->offsets
== NULL
)
412 pstr
->offsets
= re_malloc (int, pstr
->bufs_len
);
414 if (pstr
->offsets
== NULL
)
417 if (!pstr
->offsets_needed
)
419 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
420 pstr
->offsets
[i
] = i
;
421 pstr
->offsets_needed
= 1;
424 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
425 pstr
->wcs
[byte_idx
] = wcu
;
426 pstr
->offsets
[byte_idx
] = src_idx
;
427 for (i
= 1; i
< mbcdlen
; ++i
)
429 pstr
->offsets
[byte_idx
+ i
]
430 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
431 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
433 pstr
->len
+= mbcdlen
- mbclen
;
434 if (pstr
->raw_stop
> src_idx
)
435 pstr
->stop
+= mbcdlen
- mbclen
;
436 end_idx
= (pstr
->bufs_len
> pstr
->len
)
437 ? pstr
->len
: pstr
->bufs_len
;
443 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
446 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
448 if (BE (pstr
->offsets_needed
!= 0, 0))
451 for (i
= 0; i
< mbclen
; ++i
)
452 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
456 pstr
->wcs
[byte_idx
++] = wcu
;
457 /* Write paddings. */
458 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
459 pstr
->wcs
[byte_idx
++] = WEOF
;
461 else if (mbclen
== (size_t) -1 || mbclen
== 0)
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
466 if (BE (pstr
->trans
!= NULL
, 0))
467 ch
= pstr
->trans
[ch
];
468 pstr
->mbs
[byte_idx
] = ch
;
470 if (BE (pstr
->offsets_needed
!= 0, 0))
471 pstr
->offsets
[byte_idx
] = src_idx
;
474 /* And also cast it to wide char. */
475 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
476 if (BE (mbclen
== (size_t) -1, 0))
477 pstr
->cur_state
= prev_st
;
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr
->cur_state
= prev_st
;
486 pstr
->valid_len
= byte_idx
;
487 pstr
->valid_raw_len
= src_idx
;
491 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
496 re_string_skip_chars (re_string_t
*pstr
, int new_raw_idx
, wint_t *last_wc
)
503 /* Skip the characters which are not necessary to check. */
504 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
505 rawbuf_idx
< new_raw_idx
;)
508 int remain_len
= pstr
->len
- rawbuf_idx
;
509 prev_st
= pstr
->cur_state
;
510 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
511 remain_len
, &pstr
->cur_state
);
512 if (BE (mbclen
== (size_t) -2 || mbclen
== (size_t) -1 || mbclen
== 0, 0))
514 /* We treat these cases as a single byte character. */
515 if (mbclen
== 0 || remain_len
== 0)
518 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
520 pstr
->cur_state
= prev_st
;
524 /* Then proceed the next character. */
525 rawbuf_idx
+= mbclen
;
527 *last_wc
= (wint_t) wc
;
530 #endif /* RE_ENABLE_I18N */
532 /* Build the buffer PSTR->MBS, and apply the translation if we need.
533 This function is used in case of REG_ICASE. */
537 build_upper_buffer (re_string_t
*pstr
)
539 int char_idx
, end_idx
;
540 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
542 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
544 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
545 if (BE (pstr
->trans
!= NULL
, 0))
546 ch
= pstr
->trans
[ch
];
548 pstr
->mbs
[char_idx
] = toupper (ch
);
550 pstr
->mbs
[char_idx
] = ch
;
552 pstr
->valid_len
= char_idx
;
553 pstr
->valid_raw_len
= char_idx
;
556 /* Apply TRANS to the buffer in PSTR. */
560 re_string_translate_buffer (re_string_t
*pstr
)
562 int buf_idx
, end_idx
;
563 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
565 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
567 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
568 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
571 pstr
->valid_len
= buf_idx
;
572 pstr
->valid_raw_len
= buf_idx
;
575 /* This function re-construct the buffers.
576 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
577 convert to upper case in case of REG_ICASE, apply translation. */
581 re_string_reconstruct (re_string_t
*pstr
, int idx
, int eflags
)
583 int offset
= idx
- pstr
->raw_mbs_idx
;
584 if (BE (offset
< 0, 0))
587 #ifdef RE_ENABLE_I18N
588 if (pstr
->mb_cur_max
> 1)
589 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
590 #endif /* RE_ENABLE_I18N */
591 pstr
->len
= pstr
->raw_len
;
592 pstr
->stop
= pstr
->raw_stop
;
594 pstr
->raw_mbs_idx
= 0;
595 pstr
->valid_raw_len
= 0;
596 pstr
->offsets_needed
= 0;
597 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
599 if (!pstr
->mbs_allocated
)
600 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
604 if (BE (offset
!= 0, 1))
606 /* Should the already checked characters be kept? */
607 if (BE (offset
< pstr
->valid_raw_len
, 1))
609 /* Yes, move them to the front of the buffer. */
610 #ifdef RE_ENABLE_I18N
611 if (BE (pstr
->offsets_needed
, 0))
613 int low
= 0, high
= pstr
->valid_len
, mid
;
616 mid
= (high
+ low
) / 2;
617 if (pstr
->offsets
[mid
] > offset
)
619 else if (pstr
->offsets
[mid
] < offset
)
625 if (pstr
->offsets
[mid
] < offset
)
627 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
629 /* This can be quite complicated, so handle specially
630 only the common and easy case where the character with
631 different length representation of lower and upper
632 case is present at or after offset. */
633 if (pstr
->valid_len
> offset
634 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
636 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
637 (pstr
->valid_len
- offset
) * sizeof (wint_t));
638 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
639 pstr
->valid_len
-= offset
;
640 pstr
->valid_raw_len
-= offset
;
641 for (low
= 0; low
< pstr
->valid_len
; low
++)
642 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
646 /* Otherwise, just find out how long the partial multibyte
647 character at offset is and fill it with WEOF/255. */
648 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
649 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
650 pstr
->offsets_needed
= 0;
651 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
653 while (mid
< pstr
->valid_len
)
654 if (pstr
->wcs
[mid
] != WEOF
)
658 if (mid
== pstr
->valid_len
)
662 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
665 for (low
= 0; low
< pstr
->valid_len
; ++low
)
666 pstr
->wcs
[low
] = WEOF
;
667 memset (pstr
->mbs
, 255, pstr
->valid_len
);
670 pstr
->valid_raw_len
= pstr
->valid_len
;
676 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
678 #ifdef RE_ENABLE_I18N
679 if (pstr
->mb_cur_max
> 1)
680 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
681 (pstr
->valid_len
- offset
) * sizeof (wint_t));
682 #endif /* RE_ENABLE_I18N */
683 if (BE (pstr
->mbs_allocated
, 0))
684 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
685 pstr
->valid_len
- offset
);
686 pstr
->valid_len
-= offset
;
687 pstr
->valid_raw_len
-= offset
;
689 assert (pstr
->valid_len
> 0);
695 #ifdef RE_ENABLE_I18N
696 /* No, skip all characters until IDX. */
697 int prev_valid_len
= pstr
->valid_len
;
699 if (BE (pstr
->offsets_needed
, 0))
701 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
702 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
703 pstr
->offsets_needed
= 0;
707 #ifdef RE_ENABLE_I18N
708 if (pstr
->mb_cur_max
> 1)
715 const unsigned char *raw
, *p
, *end
;
717 /* Special case UTF-8. Multi-byte chars start with any
718 byte other than 0x80 - 0xbf. */
719 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
720 end
= raw
+ (offset
- pstr
->mb_cur_max
);
721 if (end
< pstr
->raw_mbs
)
723 p
= raw
+ offset
- 1;
725 /* We know the wchar_t encoding is UCS4, so for the simple
726 case, ASCII characters, skip the conversion step. */
727 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
729 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
730 /* pstr->valid_len = 0; */
735 for (; p
>= end
; --p
)
736 if ((*p
& 0xc0) != 0x80)
740 int mlen
= raw
+ pstr
->len
- p
;
741 unsigned char buf
[6];
744 if (BE (pstr
->trans
!= NULL
, 0))
746 int i
= mlen
< 6 ? mlen
: 6;
748 buf
[i
] = pstr
->trans
[p
[i
]];
750 /* XXX Don't use mbrtowc, we know which conversion
751 to use (UTF-8 -> UCS4). */
752 memset (&cur_state
, 0, sizeof (cur_state
));
753 mbclen
= __mbrtowc (&wc2
, (const char *) p
, mlen
,
755 if (raw
+ offset
- p
<= mbclen
756 && mbclen
< (size_t) -2)
758 memset (&pstr
->cur_state
, '\0',
760 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
768 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
771 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
773 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
774 && IS_WIDE_WORD_CHAR (wc
))
776 : ((IS_WIDE_NEWLINE (wc
)
777 && pstr
->newline_anchor
)
778 ? CONTEXT_NEWLINE
: 0));
779 if (BE (pstr
->valid_len
, 0))
781 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
782 pstr
->wcs
[wcs_idx
] = WEOF
;
783 if (pstr
->mbs_allocated
)
784 memset (pstr
->mbs
, 255, pstr
->valid_len
);
786 pstr
->valid_raw_len
= pstr
->valid_len
;
789 #endif /* RE_ENABLE_I18N */
791 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
792 pstr
->valid_raw_len
= 0;
795 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
797 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
798 ? CONTEXT_NEWLINE
: 0));
801 if (!BE (pstr
->mbs_allocated
, 0))
804 pstr
->raw_mbs_idx
= idx
;
806 pstr
->stop
-= offset
;
808 /* Then build the buffers. */
809 #ifdef RE_ENABLE_I18N
810 if (pstr
->mb_cur_max
> 1)
814 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
815 if (BE (ret
!= REG_NOERROR
, 0))
819 build_wcs_buffer (pstr
);
822 #endif /* RE_ENABLE_I18N */
823 if (BE (pstr
->mbs_allocated
, 0))
826 build_upper_buffer (pstr
);
827 else if (pstr
->trans
!= NULL
)
828 re_string_translate_buffer (pstr
);
831 pstr
->valid_len
= pstr
->len
;
838 internal_function
__attribute ((pure
))
839 re_string_peek_byte_case (const re_string_t
*pstr
, int idx
)
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr
->mbs_allocated
, 1))
845 return re_string_peek_byte (pstr
, idx
);
847 #ifdef RE_ENABLE_I18N
848 if (pstr
->mb_cur_max
> 1
849 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
850 return re_string_peek_byte (pstr
, idx
);
853 off
= pstr
->cur_idx
+ idx
;
854 #ifdef RE_ENABLE_I18N
855 if (pstr
->offsets_needed
)
856 off
= pstr
->offsets
[off
];
859 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
861 #ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr
->offsets_needed
&& !isascii (ch
))
867 return re_string_peek_byte (pstr
, idx
);
874 internal_function
__attribute ((pure
))
875 re_string_fetch_byte_case (re_string_t
*pstr
)
877 if (BE (!pstr
->mbs_allocated
, 1))
878 return re_string_fetch_byte (pstr
);
880 #ifdef RE_ENABLE_I18N
881 if (pstr
->offsets_needed
)
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
892 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
893 return re_string_fetch_byte (pstr
);
895 off
= pstr
->offsets
[pstr
->cur_idx
];
896 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
899 return re_string_fetch_byte (pstr
);
901 re_string_skip_bytes (pstr
,
902 re_string_char_size_at (pstr
, pstr
->cur_idx
));
907 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
912 re_string_destruct (re_string_t
*pstr
)
914 #ifdef RE_ENABLE_I18N
916 re_free (pstr
->offsets
);
917 #endif /* RE_ENABLE_I18N */
918 if (pstr
->mbs_allocated
)
922 /* Return the context at IDX in INPUT. */
926 re_string_context_at (const re_string_t
*input
, int idx
, int eflags
)
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input
->tip_context
;
933 if (BE (idx
== input
->len
, 0))
934 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
936 #ifdef RE_ENABLE_I18N
937 if (input
->mb_cur_max
> 1)
941 while(input
->wcs
[wc_idx
] == WEOF
)
944 /* It must not happen. */
945 assert (wc_idx
>= 0);
949 return input
->tip_context
;
951 wc
= input
->wcs
[wc_idx
];
952 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
954 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
955 ? CONTEXT_NEWLINE
: 0);
960 c
= re_string_byte_at (input
, idx
);
961 if (bitset_contain (input
->word_char
, c
))
963 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
967 /* Functions for set operation. */
971 re_node_set_alloc (re_node_set
*set
, int size
)
974 * ADR: valgrind says size can be 0, which then doesn't
975 * free the block of size 0. Harumph. This seems
976 * to work ok, though.
980 memset(set
, 0, sizeof(*set
));
985 set
->elems
= re_malloc (int, size
);
986 if (BE (set
->elems
== NULL
, 0))
993 re_node_set_init_1 (re_node_set
*set
, int elem
)
997 set
->elems
= re_malloc (int, 1);
998 if (BE (set
->elems
== NULL
, 0))
1000 set
->alloc
= set
->nelem
= 0;
1003 set
->elems
[0] = elem
;
1007 static reg_errcode_t
1009 re_node_set_init_2 (re_node_set
*set
, int elem1
, int elem2
)
1012 set
->elems
= re_malloc (int, 2);
1013 if (BE (set
->elems
== NULL
, 0))
1018 set
->elems
[0] = elem1
;
1025 set
->elems
[0] = elem1
;
1026 set
->elems
[1] = elem2
;
1030 set
->elems
[0] = elem2
;
1031 set
->elems
[1] = elem1
;
1037 static reg_errcode_t
1039 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1041 dest
->nelem
= src
->nelem
;
1044 dest
->alloc
= dest
->nelem
;
1045 dest
->elems
= re_malloc (int, dest
->alloc
);
1046 if (BE (dest
->elems
== NULL
, 0))
1048 dest
->alloc
= dest
->nelem
= 0;
1051 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1054 re_node_set_init_empty (dest
);
1058 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1059 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1060 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1062 static reg_errcode_t
1064 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1065 const re_node_set
*src2
)
1067 int i1
, i2
, is
, id
, delta
, sbase
;
1068 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1071 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1072 conservative estimate. */
1073 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1075 int new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1076 int *new_elems
= re_realloc (dest
->elems
, int, new_alloc
);
1077 if (BE (new_elems
== NULL
, 0))
1079 dest
->elems
= new_elems
;
1080 dest
->alloc
= new_alloc
;
1083 /* Find the items in the intersection of SRC1 and SRC2, and copy
1084 into the top of DEST those that are not already in DEST itself. */
1085 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1086 i1
= src1
->nelem
- 1;
1087 i2
= src2
->nelem
- 1;
1088 id
= dest
->nelem
- 1;
1091 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1093 /* Try to find the item in DEST. Maybe we could binary search? */
1094 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1097 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1098 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1100 if (--i1
< 0 || --i2
< 0)
1104 /* Lower the highest of the two items. */
1105 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1117 id
= dest
->nelem
- 1;
1118 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1119 delta
= is
- sbase
+ 1;
1121 /* Now copy. When DELTA becomes zero, the remaining
1122 DEST elements are already in place; this is more or
1123 less the same loop that is in re_node_set_merge. */
1124 dest
->nelem
+= delta
;
1125 if (delta
> 0 && id
>= 0)
1128 if (dest
->elems
[is
] > dest
->elems
[id
])
1130 /* Copy from the top. */
1131 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1137 /* Slide from the bottom. */
1138 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1144 /* Copy remaining SRC elements. */
1145 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (int));
1150 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1151 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1153 static reg_errcode_t
1155 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1156 const re_node_set
*src2
)
1159 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1161 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1162 dest
->elems
= re_malloc (int, dest
->alloc
);
1163 if (BE (dest
->elems
== NULL
, 0))
1168 if (src1
!= NULL
&& src1
->nelem
> 0)
1169 return re_node_set_init_copy (dest
, src1
);
1170 else if (src2
!= NULL
&& src2
->nelem
> 0)
1171 return re_node_set_init_copy (dest
, src2
);
1173 re_node_set_init_empty (dest
);
1176 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1178 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1180 dest
->elems
[id
++] = src2
->elems
[i2
++];
1183 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1185 dest
->elems
[id
++] = src1
->elems
[i1
++];
1187 if (i1
< src1
->nelem
)
1189 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1190 (src1
->nelem
- i1
) * sizeof (int));
1191 id
+= src1
->nelem
- i1
;
1193 else if (i2
< src2
->nelem
)
1195 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1196 (src2
->nelem
- i2
) * sizeof (int));
1197 id
+= src2
->nelem
- i2
;
1203 /* Calculate the union set of the sets DEST and SRC. And store it to
1204 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1206 static reg_errcode_t
1208 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1210 int is
, id
, sbase
, delta
;
1211 if (src
== NULL
|| src
->nelem
== 0)
1213 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1215 int new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1216 int *new_buffer
= re_realloc (dest
->elems
, int, new_alloc
);
1217 if (BE (new_buffer
== NULL
, 0))
1219 dest
->elems
= new_buffer
;
1220 dest
->alloc
= new_alloc
;
1223 if (BE (dest
->nelem
== 0, 0))
1225 dest
->nelem
= src
->nelem
;
1226 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (int));
1230 /* Copy into the top of DEST the items of SRC that are not
1231 found in DEST. Maybe we could binary search in DEST? */
1232 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1233 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1235 if (dest
->elems
[id
] == src
->elems
[is
])
1237 else if (dest
->elems
[id
] < src
->elems
[is
])
1238 dest
->elems
[--sbase
] = src
->elems
[is
--];
1239 else /* if (dest->elems[id] > src->elems[is]) */
1245 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1247 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (int));
1250 id
= dest
->nelem
- 1;
1251 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1252 delta
= is
- sbase
+ 1;
1256 /* Now copy. When DELTA becomes zero, the remaining
1257 DEST elements are already in place. */
1258 dest
->nelem
+= delta
;
1261 if (dest
->elems
[is
] > dest
->elems
[id
])
1263 /* Copy from the top. */
1264 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1270 /* Slide from the bottom. */
1271 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1274 /* Copy remaining SRC elements. */
1275 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1276 delta
* sizeof (int));
1285 /* Insert the new element ELEM to the re_node_set* SET.
1286 SET should not already have ELEM.
1287 return -1 if an error has occurred, return 1 otherwise. */
1291 re_node_set_insert (re_node_set
*set
, int elem
)
1294 /* In case the set is empty. */
1295 if (set
->alloc
== 0)
1297 if (BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1))
1303 if (BE (set
->nelem
, 0) == 0)
1305 /* We already guaranteed above that set->alloc != 0. */
1306 set
->elems
[0] = elem
;
1311 /* Realloc if we need. */
1312 if (set
->alloc
== set
->nelem
)
1315 set
->alloc
= set
->alloc
* 2;
1316 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1317 if (BE (new_elems
== NULL
, 0))
1319 set
->elems
= new_elems
;
1322 /* Move the elements which follows the new element. Test the
1323 first element separately to skip a check in the inner loop. */
1324 if (elem
< set
->elems
[0])
1327 for (idx
= set
->nelem
; idx
> 0; idx
--)
1328 set
->elems
[idx
] = set
->elems
[idx
- 1];
1332 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1333 set
->elems
[idx
] = set
->elems
[idx
- 1];
1336 /* Insert the new element. */
1337 set
->elems
[idx
] = elem
;
1342 /* Insert the new element ELEM to the re_node_set* SET.
1343 SET should not already have any element greater than or equal to ELEM.
1344 Return -1 if an error has occurred, return 1 otherwise. */
1348 re_node_set_insert_last (re_node_set
*set
, int elem
)
1350 /* Realloc if we need. */
1351 if (set
->alloc
== set
->nelem
)
1354 set
->alloc
= (set
->alloc
+ 1) * 2;
1355 new_elems
= re_realloc (set
->elems
, int, set
->alloc
);
1356 if (BE (new_elems
== NULL
, 0))
1358 set
->elems
= new_elems
;
1361 /* Insert the new element. */
1362 set
->elems
[set
->nelem
++] = elem
;
1366 /* Compare two node sets SET1 and SET2.
1367 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1370 internal_function
__attribute ((pure
))
1371 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1374 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1376 for (i
= set1
->nelem
; --i
>= 0 ; )
1377 if (set1
->elems
[i
] != set2
->elems
[i
])
1382 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1385 internal_function
__attribute ((pure
))
1386 re_node_set_contains (const re_node_set
*set
, int elem
)
1388 unsigned int idx
, right
, mid
;
1389 if (set
->nelem
<= 0)
1392 /* Binary search the element. */
1394 right
= set
->nelem
- 1;
1397 mid
= (idx
+ right
) / 2;
1398 if (set
->elems
[mid
] < elem
)
1403 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1408 re_node_set_remove_at (re_node_set
*set
, int idx
)
1410 if (idx
< 0 || idx
>= set
->nelem
)
1413 for (; idx
< set
->nelem
; idx
++)
1414 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1418 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1419 Or return -1, if an error has occurred. */
1423 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1425 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1427 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1428 int *new_nexts
, *new_indices
;
1429 re_node_set
*new_edests
, *new_eclosures
;
1430 re_token_t
*new_nodes
;
1432 /* Avoid overflows in realloc. */
1433 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1434 MAX (sizeof (re_node_set
),
1436 if (BE (SIZE_MAX
/ max_object_size
< new_nodes_alloc
, 0))
1439 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1440 if (BE (new_nodes
== NULL
, 0))
1442 dfa
->nodes
= new_nodes
;
1443 new_nexts
= re_realloc (dfa
->nexts
, int, new_nodes_alloc
);
1444 new_indices
= re_realloc (dfa
->org_indices
, int, new_nodes_alloc
);
1445 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1446 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1447 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1448 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1450 dfa
->nexts
= new_nexts
;
1451 dfa
->org_indices
= new_indices
;
1452 dfa
->edests
= new_edests
;
1453 dfa
->eclosures
= new_eclosures
;
1454 dfa
->nodes_alloc
= new_nodes_alloc
;
1456 dfa
->nodes
[dfa
->nodes_len
] = token
;
1457 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1458 #ifdef RE_ENABLE_I18N
1459 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1460 (token
.type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1) || token
.type
== COMPLEX_BRACKET
;
1462 dfa
->nexts
[dfa
->nodes_len
] = -1;
1463 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1464 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1465 return dfa
->nodes_len
++;
1468 static inline unsigned int
1470 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1472 unsigned int hash
= nodes
->nelem
+ context
;
1474 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1475 hash
+= nodes
->elems
[i
];
1479 /* Search for the state whose node_set is equivalent to NODES.
1480 Return the pointer to the state, if we found it in the DFA.
1481 Otherwise create the new one and return it. In case of an error
1482 return NULL and set the error code in ERR.
1483 Note: - We assume NULL as the invalid state, then it is possible that
1484 return value is NULL and ERR is REG_NOERROR.
1485 - We never return non-NULL value in case of any errors, it is for
1488 static re_dfastate_t
*
1490 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1491 const re_node_set
*nodes
)
1494 re_dfastate_t
*new_state
;
1495 struct re_state_table_entry
*spot
;
1497 if (BE (nodes
->nelem
== 0, 0))
1502 hash
= calc_state_hash (nodes
, 0);
1503 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1505 for (i
= 0 ; i
< spot
->num
; i
++)
1507 re_dfastate_t
*state
= spot
->array
[i
];
1508 if (hash
!= state
->hash
)
1510 if (re_node_set_compare (&state
->nodes
, nodes
))
1514 /* There are no appropriate state in the dfa, create the new one. */
1515 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1516 if (BE (new_state
== NULL
, 0))
1522 /* Search for the state whose node_set is equivalent to NODES and
1523 whose context is equivalent to CONTEXT.
1524 Return the pointer to the state, if we found it in the DFA.
1525 Otherwise create the new one and return it. In case of an error
1526 return NULL and set the error code in ERR.
1527 Note: - We assume NULL as the invalid state, then it is possible that
1528 return value is NULL and ERR is REG_NOERROR.
1529 - We never return non-NULL value in case of any errors, it is for
1532 static re_dfastate_t
*
1534 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1535 const re_node_set
*nodes
, unsigned int context
)
1538 re_dfastate_t
*new_state
;
1539 struct re_state_table_entry
*spot
;
1541 if (nodes
->nelem
== 0)
1546 hash
= calc_state_hash (nodes
, context
);
1547 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1549 for (i
= 0 ; i
< spot
->num
; i
++)
1551 re_dfastate_t
*state
= spot
->array
[i
];
1552 if (state
->hash
== hash
1553 && state
->context
== context
1554 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1557 /* There are no appropriate state in `dfa', create the new one. */
1558 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1559 if (BE (new_state
== NULL
, 0))
1565 /* Finish initialization of the new state NEWSTATE, and using its hash value
1566 HASH put in the appropriate bucket of DFA's state table. Return value
1567 indicates the error code if failed. */
1569 static reg_errcode_t
1570 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1573 struct re_state_table_entry
*spot
;
1577 newstate
->hash
= hash
;
1578 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1579 if (BE (err
!= REG_NOERROR
, 0))
1581 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1583 int elem
= newstate
->nodes
.elems
[i
];
1584 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1585 if (re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
) < 0)
1589 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1590 if (BE (spot
->alloc
<= spot
->num
, 0))
1592 int new_alloc
= 2 * spot
->num
+ 2;
1593 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1595 if (BE (new_array
== NULL
, 0))
1597 spot
->array
= new_array
;
1598 spot
->alloc
= new_alloc
;
1600 spot
->array
[spot
->num
++] = newstate
;
1605 free_state (re_dfastate_t
*state
)
1607 re_node_set_free (&state
->non_eps_nodes
);
1608 re_node_set_free (&state
->inveclosure
);
1609 if (state
->entrance_nodes
!= &state
->nodes
)
1611 re_node_set_free (state
->entrance_nodes
);
1612 re_free (state
->entrance_nodes
);
1614 re_node_set_free (&state
->nodes
);
1615 re_free (state
->word_trtable
);
1616 re_free (state
->trtable
);
1620 /* Create the new state which is independ of contexts.
1621 Return the new state if succeeded, otherwise return NULL. */
1623 static re_dfastate_t
*
1625 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1630 re_dfastate_t
*newstate
;
1632 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1633 if (BE (newstate
== NULL
, 0))
1635 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1636 if (BE (err
!= REG_NOERROR
, 0))
1642 newstate
->entrance_nodes
= &newstate
->nodes
;
1643 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1645 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1646 re_token_type_t type
= node
->type
;
1647 if (type
== CHARACTER
&& !node
->constraint
)
1649 #ifdef RE_ENABLE_I18N
1650 newstate
->accept_mb
|= node
->accept_mb
;
1651 #endif /* RE_ENABLE_I18N */
1653 /* If the state has the halt node, the state is a halt state. */
1654 if (type
== END_OF_RE
)
1656 else if (type
== OP_BACK_REF
)
1657 newstate
->has_backref
= 1;
1658 else if (type
== ANCHOR
|| node
->constraint
)
1659 newstate
->has_constraint
= 1;
1661 err
= register_state (dfa
, newstate
, hash
);
1662 if (BE (err
!= REG_NOERROR
, 0))
1664 free_state (newstate
);
1670 /* Create the new state which is depend on the context CONTEXT.
1671 Return the new state if succeeded, otherwise return NULL. */
1673 static re_dfastate_t
*
1675 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1676 unsigned int context
, unsigned int hash
)
1678 int i
, nctx_nodes
= 0;
1680 re_dfastate_t
*newstate
;
1682 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1683 if (BE (newstate
== NULL
, 0))
1685 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1686 if (BE (err
!= REG_NOERROR
, 0))
1692 newstate
->context
= context
;
1693 newstate
->entrance_nodes
= &newstate
->nodes
;
1695 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1697 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1698 re_token_type_t type
= node
->type
;
1699 unsigned int constraint
= node
->constraint
;
1701 if (type
== CHARACTER
&& !constraint
)
1703 #ifdef RE_ENABLE_I18N
1704 newstate
->accept_mb
|= node
->accept_mb
;
1705 #endif /* RE_ENABLE_I18N */
1707 /* If the state has the halt node, the state is a halt state. */
1708 if (type
== END_OF_RE
)
1710 else if (type
== OP_BACK_REF
)
1711 newstate
->has_backref
= 1;
1715 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1717 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1718 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1720 free_state (newstate
);
1723 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1727 newstate
->has_constraint
= 1;
1730 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1732 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1737 err
= register_state (dfa
, newstate
, hash
);
1738 if (BE (err
!= REG_NOERROR
, 0))
1740 free_state (newstate
);