1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str
, Idx len
,
22 RE_TRANSLATE_TYPE trans
, bool icase
,
24 static re_dfastate_t
*create_ci_newstate (const re_dfa_t
*dfa
,
25 const re_node_set
*nodes
,
27 static re_dfastate_t
*create_cd_newstate (const re_dfa_t
*dfa
,
28 const re_node_set
*nodes
,
31 static reg_errcode_t
re_string_realloc_buffers (re_string_t
*pstr
,
34 static void build_wcs_buffer (re_string_t
*pstr
);
35 static reg_errcode_t
build_wcs_upper_buffer (re_string_t
*pstr
);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t
*pstr
);
38 static void re_string_translate_buffer (re_string_t
*pstr
);
39 static unsigned int re_string_context_at (const re_string_t
*input
, Idx idx
,
40 int eflags
) __attribute__ ((pure
));
42 /* Functions for string operation. */
44 /* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
48 __attribute_warn_unused_result__
49 re_string_allocate (re_string_t
*pstr
, const char *str
, Idx len
, Idx init_len
,
50 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
55 /* Ensure at least one character fits into the buffers. */
56 if (init_len
< dfa
->mb_cur_max
)
57 init_len
= dfa
->mb_cur_max
;
58 init_buf_len
= (len
+ 1 < init_len
) ? len
+ 1: init_len
;
59 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
61 ret
= re_string_realloc_buffers (pstr
, init_buf_len
);
62 if (BE (ret
!= REG_NOERROR
, 0))
65 pstr
->word_char
= dfa
->word_char
;
66 pstr
->word_ops_used
= dfa
->word_ops_used
;
67 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
68 pstr
->valid_len
= (pstr
->mbs_allocated
|| dfa
->mb_cur_max
> 1) ? 0 : len
;
69 pstr
->valid_raw_len
= pstr
->valid_len
;
73 /* This function allocate the buffers, and initialize them. */
76 __attribute_warn_unused_result__
77 re_string_construct (re_string_t
*pstr
, const char *str
, Idx len
,
78 RE_TRANSLATE_TYPE trans
, bool icase
, const re_dfa_t
*dfa
)
81 memset (pstr
, '\0', sizeof (re_string_t
));
82 re_string_construct_common (str
, len
, pstr
, trans
, icase
, dfa
);
86 ret
= re_string_realloc_buffers (pstr
, len
+ 1);
87 if (BE (ret
!= REG_NOERROR
, 0))
90 pstr
->mbs
= pstr
->mbs_allocated
? pstr
->mbs
: (unsigned char *) str
;
95 if (dfa
->mb_cur_max
> 1)
99 ret
= build_wcs_upper_buffer (pstr
);
100 if (BE (ret
!= REG_NOERROR
, 0))
102 if (pstr
->valid_raw_len
>= len
)
104 if (pstr
->bufs_len
> pstr
->valid_len
+ dfa
->mb_cur_max
)
106 ret
= re_string_realloc_buffers (pstr
, pstr
->bufs_len
* 2);
107 if (BE (ret
!= REG_NOERROR
, 0))
112 #endif /* RE_ENABLE_I18N */
113 build_upper_buffer (pstr
);
117 #ifdef RE_ENABLE_I18N
118 if (dfa
->mb_cur_max
> 1)
119 build_wcs_buffer (pstr
);
121 #endif /* RE_ENABLE_I18N */
124 re_string_translate_buffer (pstr
);
127 pstr
->valid_len
= pstr
->bufs_len
;
128 pstr
->valid_raw_len
= pstr
->bufs_len
;
136 /* Helper functions for re_string_allocate, and re_string_construct. */
139 __attribute_warn_unused_result__
140 re_string_realloc_buffers (re_string_t
*pstr
, Idx new_buf_len
)
142 #ifdef RE_ENABLE_I18N
143 if (pstr
->mb_cur_max
> 1)
147 /* Avoid overflow in realloc. */
148 const size_t max_object_size
= MAX (sizeof (wint_t), sizeof (Idx
));
149 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) < new_buf_len
, 0))
152 new_wcs
= re_realloc (pstr
->wcs
, wint_t, new_buf_len
);
153 if (BE (new_wcs
== NULL
, 0))
156 if (pstr
->offsets
!= NULL
)
158 Idx
*new_offsets
= re_realloc (pstr
->offsets
, Idx
, new_buf_len
);
159 if (BE (new_offsets
== NULL
, 0))
161 pstr
->offsets
= new_offsets
;
164 #endif /* RE_ENABLE_I18N */
165 if (pstr
->mbs_allocated
)
167 unsigned char *new_mbs
= re_realloc (pstr
->mbs
, unsigned char,
169 if (BE (new_mbs
== NULL
, 0))
173 pstr
->bufs_len
= new_buf_len
;
179 re_string_construct_common (const char *str
, Idx len
, re_string_t
*pstr
,
180 RE_TRANSLATE_TYPE trans
, bool icase
,
183 pstr
->raw_mbs
= (const unsigned char *) str
;
188 pstr
->mbs_allocated
= (trans
!= NULL
|| icase
);
189 pstr
->mb_cur_max
= dfa
->mb_cur_max
;
190 pstr
->is_utf8
= dfa
->is_utf8
;
191 pstr
->map_notascii
= dfa
->map_notascii
;
192 pstr
->stop
= pstr
->len
;
193 pstr
->raw_stop
= pstr
->stop
;
196 #ifdef RE_ENABLE_I18N
198 /* Build wide character buffer PSTR->WCS.
199 If the byte sequence of the string are:
200 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201 Then wide character buffer will be:
202 <wc1> , WEOF , <wc2> , WEOF , <wc3>
203 We use WEOF for padding, they indicate that the position isn't
204 a first byte of a multibyte character.
206 Note that this function assumes PSTR->VALID_LEN elements are already
207 built and starts from PSTR->VALID_LEN. */
210 build_wcs_buffer (re_string_t
*pstr
)
213 unsigned char buf
[MB_LEN_MAX
];
214 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
216 unsigned char buf
[64];
219 Idx byte_idx
, end_idx
, remain_len
;
222 /* Build the buffers from pstr->valid_len to either pstr->len or
224 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
225 for (byte_idx
= pstr
->valid_len
; byte_idx
< end_idx
;)
230 remain_len
= end_idx
- byte_idx
;
231 prev_st
= pstr
->cur_state
;
232 /* Apply the translation if we need. */
233 if (BE (pstr
->trans
!= NULL
, 0))
237 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
239 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
+ i
];
240 buf
[i
] = pstr
->mbs
[byte_idx
+ i
] = pstr
->trans
[ch
];
242 p
= (const char *) buf
;
245 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
;
246 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
247 if (BE (mbclen
== (size_t) -1 || mbclen
== 0
248 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
), 0))
250 /* We treat these cases as a singlebyte character. */
252 wc
= (wchar_t) pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
253 if (BE (pstr
->trans
!= NULL
, 0))
254 wc
= pstr
->trans
[wc
];
255 pstr
->cur_state
= prev_st
;
257 else if (BE (mbclen
== (size_t) -2, 0))
259 /* The buffer doesn't have enough space, finish to build. */
260 pstr
->cur_state
= prev_st
;
264 /* Write wide character and padding. */
265 pstr
->wcs
[byte_idx
++] = wc
;
266 /* Write paddings. */
267 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
268 pstr
->wcs
[byte_idx
++] = WEOF
;
270 pstr
->valid_len
= byte_idx
;
271 pstr
->valid_raw_len
= byte_idx
;
274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275 but for REG_ICASE. */
278 __attribute_warn_unused_result__
279 build_wcs_upper_buffer (re_string_t
*pstr
)
282 Idx src_idx
, byte_idx
, end_idx
, remain_len
;
285 char buf
[MB_LEN_MAX
];
286 assert (MB_LEN_MAX
>= pstr
->mb_cur_max
);
291 byte_idx
= pstr
->valid_len
;
292 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr
->map_notascii
&& pstr
->trans
== NULL
&& !pstr
->offsets_needed
)
298 while (byte_idx
< end_idx
)
302 if (isascii (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
])
303 && mbsinit (&pstr
->cur_state
))
305 /* In case of a singlebyte character. */
307 = toupper (pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr
->wcs
[byte_idx
] = (wchar_t) pstr
->mbs
[byte_idx
];
315 remain_len
= end_idx
- byte_idx
;
316 prev_st
= pstr
->cur_state
;
317 mbclen
= __mbrtowc (&wc
,
318 ((const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
319 + byte_idx
), remain_len
, &pstr
->cur_state
);
320 if (BE (0 < mbclen
&& mbclen
< (size_t) -2, 1))
322 wchar_t wcu
= __towupper (wc
);
327 mbcdlen
= __wcrtomb (buf
, wcu
, &prev_st
);
328 if (BE (mbclen
== mbcdlen
, 1))
329 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
337 memcpy (pstr
->mbs
+ byte_idx
,
338 pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ byte_idx
, mbclen
);
339 pstr
->wcs
[byte_idx
++] = wcu
;
340 /* Write paddings. */
341 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
342 pstr
->wcs
[byte_idx
++] = WEOF
;
344 else if (mbclen
== (size_t) -1 || mbclen
== 0
345 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
347 /* It is an invalid character, an incomplete character
348 at the end of the string, or '\0'. Just use the byte. */
349 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ byte_idx
];
350 pstr
->mbs
[byte_idx
] = ch
;
351 /* And also cast it to wide char. */
352 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
353 if (BE (mbclen
== (size_t) -1, 0))
354 pstr
->cur_state
= prev_st
;
358 /* The buffer doesn't have enough space, finish to build. */
359 pstr
->cur_state
= prev_st
;
363 pstr
->valid_len
= byte_idx
;
364 pstr
->valid_raw_len
= byte_idx
;
368 for (src_idx
= pstr
->valid_raw_len
; byte_idx
< end_idx
;)
373 remain_len
= end_idx
- byte_idx
;
374 prev_st
= pstr
->cur_state
;
375 if (BE (pstr
->trans
!= NULL
, 0))
379 for (i
= 0; i
< pstr
->mb_cur_max
&& i
< remain_len
; ++i
)
381 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
+ i
];
382 buf
[i
] = pstr
->trans
[ch
];
384 p
= (const char *) buf
;
387 p
= (const char *) pstr
->raw_mbs
+ pstr
->raw_mbs_idx
+ src_idx
;
388 mbclen
= __mbrtowc (&wc
, p
, remain_len
, &pstr
->cur_state
);
389 if (BE (0 < mbclen
&& mbclen
< (size_t) -2, 1))
391 wchar_t wcu
= __towupper (wc
);
396 mbcdlen
= __wcrtomb ((char *) buf
, wcu
, &prev_st
);
397 if (BE (mbclen
== mbcdlen
, 1))
398 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbclen
);
399 else if (mbcdlen
!= (size_t) -1)
403 if (byte_idx
+ mbcdlen
> pstr
->bufs_len
)
405 pstr
->cur_state
= prev_st
;
409 if (pstr
->offsets
== NULL
)
411 pstr
->offsets
= re_malloc (Idx
, pstr
->bufs_len
);
413 if (pstr
->offsets
== NULL
)
416 if (!pstr
->offsets_needed
)
418 for (i
= 0; i
< (size_t) byte_idx
; ++i
)
419 pstr
->offsets
[i
] = i
;
420 pstr
->offsets_needed
= 1;
423 memcpy (pstr
->mbs
+ byte_idx
, buf
, mbcdlen
);
424 pstr
->wcs
[byte_idx
] = wcu
;
425 pstr
->offsets
[byte_idx
] = src_idx
;
426 for (i
= 1; i
< mbcdlen
; ++i
)
428 pstr
->offsets
[byte_idx
+ i
]
429 = src_idx
+ (i
< mbclen
? i
: mbclen
- 1);
430 pstr
->wcs
[byte_idx
+ i
] = WEOF
;
432 pstr
->len
+= mbcdlen
- mbclen
;
433 if (pstr
->raw_stop
> src_idx
)
434 pstr
->stop
+= mbcdlen
- mbclen
;
435 end_idx
= (pstr
->bufs_len
> pstr
->len
)
436 ? pstr
->len
: pstr
->bufs_len
;
442 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
445 memcpy (pstr
->mbs
+ byte_idx
, p
, mbclen
);
447 if (BE (pstr
->offsets_needed
!= 0, 0))
450 for (i
= 0; i
< mbclen
; ++i
)
451 pstr
->offsets
[byte_idx
+ i
] = src_idx
+ i
;
455 pstr
->wcs
[byte_idx
++] = wcu
;
456 /* Write paddings. */
457 for (remain_len
= byte_idx
+ mbclen
- 1; byte_idx
< remain_len
;)
458 pstr
->wcs
[byte_idx
++] = WEOF
;
460 else if (mbclen
== (size_t) -1 || mbclen
== 0
461 || (mbclen
== (size_t) -2 && pstr
->bufs_len
>= pstr
->len
))
463 /* It is an invalid character or '\0'. Just use the byte. */
464 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ src_idx
];
466 if (BE (pstr
->trans
!= NULL
, 0))
467 ch
= pstr
->trans
[ch
];
468 pstr
->mbs
[byte_idx
] = ch
;
470 if (BE (pstr
->offsets_needed
!= 0, 0))
471 pstr
->offsets
[byte_idx
] = src_idx
;
474 /* And also cast it to wide char. */
475 pstr
->wcs
[byte_idx
++] = (wchar_t) ch
;
476 if (BE (mbclen
== (size_t) -1, 0))
477 pstr
->cur_state
= prev_st
;
481 /* The buffer doesn't have enough space, finish to build. */
482 pstr
->cur_state
= prev_st
;
486 pstr
->valid_len
= byte_idx
;
487 pstr
->valid_raw_len
= src_idx
;
491 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 re_string_skip_chars (re_string_t
*pstr
, Idx new_raw_idx
, wint_t *last_wc
)
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx
= pstr
->raw_mbs_idx
+ pstr
->valid_raw_len
;
504 rawbuf_idx
< new_raw_idx
;)
507 Idx remain_len
= pstr
->raw_len
- rawbuf_idx
;
508 prev_st
= pstr
->cur_state
;
509 mbclen
= __mbrtowc (&wc2
, (const char *) pstr
->raw_mbs
+ rawbuf_idx
,
510 remain_len
, &pstr
->cur_state
);
511 if (BE (mbclen
== (size_t) -2 || mbclen
== (size_t) -1 || mbclen
== 0, 0))
513 /* We treat these cases as a single byte character. */
514 if (mbclen
== 0 || remain_len
== 0)
517 wc
= *(unsigned char *) (pstr
->raw_mbs
+ rawbuf_idx
);
519 pstr
->cur_state
= prev_st
;
523 /* Then proceed the next character. */
524 rawbuf_idx
+= mbclen
;
529 #endif /* RE_ENABLE_I18N */
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
535 build_upper_buffer (re_string_t
*pstr
)
537 Idx char_idx
, end_idx
;
538 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
540 for (char_idx
= pstr
->valid_len
; char_idx
< end_idx
; ++char_idx
)
542 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ char_idx
];
543 if (BE (pstr
->trans
!= NULL
, 0))
544 ch
= pstr
->trans
[ch
];
545 pstr
->mbs
[char_idx
] = toupper (ch
);
547 pstr
->valid_len
= char_idx
;
548 pstr
->valid_raw_len
= char_idx
;
551 /* Apply TRANS to the buffer in PSTR. */
554 re_string_translate_buffer (re_string_t
*pstr
)
556 Idx buf_idx
, end_idx
;
557 end_idx
= (pstr
->bufs_len
> pstr
->len
) ? pstr
->len
: pstr
->bufs_len
;
559 for (buf_idx
= pstr
->valid_len
; buf_idx
< end_idx
; ++buf_idx
)
561 int ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ buf_idx
];
562 pstr
->mbs
[buf_idx
] = pstr
->trans
[ch
];
565 pstr
->valid_len
= buf_idx
;
566 pstr
->valid_raw_len
= buf_idx
;
569 /* This function re-construct the buffers.
570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571 convert to upper case in case of REG_ICASE, apply translation. */
574 __attribute_warn_unused_result__
575 re_string_reconstruct (re_string_t
*pstr
, Idx idx
, int eflags
)
579 if (BE (pstr
->raw_mbs_idx
<= idx
, 0))
580 offset
= idx
- pstr
->raw_mbs_idx
;
584 #ifdef RE_ENABLE_I18N
585 if (pstr
->mb_cur_max
> 1)
586 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
587 #endif /* RE_ENABLE_I18N */
588 pstr
->len
= pstr
->raw_len
;
589 pstr
->stop
= pstr
->raw_stop
;
591 pstr
->raw_mbs_idx
= 0;
592 pstr
->valid_raw_len
= 0;
593 pstr
->offsets_needed
= 0;
594 pstr
->tip_context
= ((eflags
& REG_NOTBOL
) ? CONTEXT_BEGBUF
595 : CONTEXT_NEWLINE
| CONTEXT_BEGBUF
);
596 if (!pstr
->mbs_allocated
)
597 pstr
->mbs
= (unsigned char *) pstr
->raw_mbs
;
601 if (BE (offset
!= 0, 1))
603 /* Should the already checked characters be kept? */
604 if (BE (offset
< pstr
->valid_raw_len
, 1))
606 /* Yes, move them to the front of the buffer. */
607 #ifdef RE_ENABLE_I18N
608 if (BE (pstr
->offsets_needed
, 0))
610 Idx low
= 0, high
= pstr
->valid_len
, mid
;
613 mid
= (high
+ low
) / 2;
614 if (pstr
->offsets
[mid
] > offset
)
616 else if (pstr
->offsets
[mid
] < offset
)
622 if (pstr
->offsets
[mid
] < offset
)
624 pstr
->tip_context
= re_string_context_at (pstr
, mid
- 1,
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr
->valid_len
> offset
631 && mid
== offset
&& pstr
->offsets
[mid
] == offset
)
633 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
634 (pstr
->valid_len
- offset
) * sizeof (wint_t));
635 memmove (pstr
->mbs
, pstr
->mbs
+ offset
, pstr
->valid_len
- offset
);
636 pstr
->valid_len
-= offset
;
637 pstr
->valid_raw_len
-= offset
;
638 for (low
= 0; low
< pstr
->valid_len
; low
++)
639 pstr
->offsets
[low
] = pstr
->offsets
[low
+ offset
] - offset
;
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
646 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
647 pstr
->offsets_needed
= 0;
648 while (mid
> 0 && pstr
->offsets
[mid
- 1] == offset
)
650 while (mid
< pstr
->valid_len
)
651 if (pstr
->wcs
[mid
] != WEOF
)
655 if (mid
== pstr
->valid_len
)
659 pstr
->valid_len
= pstr
->offsets
[mid
] - offset
;
662 for (low
= 0; low
< pstr
->valid_len
; ++low
)
663 pstr
->wcs
[low
] = WEOF
;
664 memset (pstr
->mbs
, 255, pstr
->valid_len
);
667 pstr
->valid_raw_len
= pstr
->valid_len
;
673 pstr
->tip_context
= re_string_context_at (pstr
, offset
- 1,
675 #ifdef RE_ENABLE_I18N
676 if (pstr
->mb_cur_max
> 1)
677 memmove (pstr
->wcs
, pstr
->wcs
+ offset
,
678 (pstr
->valid_len
- offset
) * sizeof (wint_t));
679 #endif /* RE_ENABLE_I18N */
680 if (BE (pstr
->mbs_allocated
, 0))
681 memmove (pstr
->mbs
, pstr
->mbs
+ offset
,
682 pstr
->valid_len
- offset
);
683 pstr
->valid_len
-= offset
;
684 pstr
->valid_raw_len
-= offset
;
685 #if defined DEBUG && DEBUG
686 assert (pstr
->valid_len
> 0);
692 #ifdef RE_ENABLE_I18N
693 /* No, skip all characters until IDX. */
694 Idx prev_valid_len
= pstr
->valid_len
;
696 if (BE (pstr
->offsets_needed
, 0))
698 pstr
->len
= pstr
->raw_len
- idx
+ offset
;
699 pstr
->stop
= pstr
->raw_stop
- idx
+ offset
;
700 pstr
->offsets_needed
= 0;
704 #ifdef RE_ENABLE_I18N
705 if (pstr
->mb_cur_max
> 1)
712 const unsigned char *raw
, *p
, *end
;
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf. */
716 raw
= pstr
->raw_mbs
+ pstr
->raw_mbs_idx
;
717 end
= raw
+ (offset
- pstr
->mb_cur_max
);
718 if (end
< pstr
->raw_mbs
)
720 p
= raw
+ offset
- 1;
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p
) && BE (pstr
->trans
== NULL
, 1))
726 memset (&pstr
->cur_state
, '\0', sizeof (mbstate_t));
727 /* pstr->valid_len = 0; */
732 for (; p
>= end
; --p
)
733 if ((*p
& 0xc0) != 0x80)
737 Idx mlen
= raw
+ pstr
->len
- p
;
738 unsigned char buf
[6];
741 const unsigned char *pp
= p
;
742 if (BE (pstr
->trans
!= NULL
, 0))
744 int i
= mlen
< 6 ? mlen
: 6;
746 buf
[i
] = pstr
->trans
[p
[i
]];
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state
, 0, sizeof (cur_state
));
752 mbclen
= __mbrtowc (&wc2
, (const char *) pp
, mlen
,
754 if (raw
+ offset
- p
<= mbclen
755 && mbclen
< (size_t) -2)
757 memset (&pstr
->cur_state
, '\0',
759 pstr
->valid_len
= mbclen
- (raw
+ offset
- p
);
767 pstr
->valid_len
= re_string_skip_chars (pstr
, idx
, &wc
) - idx
;
770 = re_string_context_at (pstr
, prev_valid_len
- 1, eflags
);
772 pstr
->tip_context
= ((BE (pstr
->word_ops_used
!= 0, 0)
773 && IS_WIDE_WORD_CHAR (wc
))
775 : ((IS_WIDE_NEWLINE (wc
)
776 && pstr
->newline_anchor
)
777 ? CONTEXT_NEWLINE
: 0));
778 if (BE (pstr
->valid_len
, 0))
780 for (wcs_idx
= 0; wcs_idx
< pstr
->valid_len
; ++wcs_idx
)
781 pstr
->wcs
[wcs_idx
] = WEOF
;
782 if (pstr
->mbs_allocated
)
783 memset (pstr
->mbs
, 255, pstr
->valid_len
);
785 pstr
->valid_raw_len
= pstr
->valid_len
;
788 #endif /* RE_ENABLE_I18N */
790 int c
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ offset
- 1];
791 pstr
->valid_raw_len
= 0;
794 pstr
->tip_context
= (bitset_contain (pstr
->word_char
, c
)
796 : ((IS_NEWLINE (c
) && pstr
->newline_anchor
)
797 ? CONTEXT_NEWLINE
: 0));
800 if (!BE (pstr
->mbs_allocated
, 0))
803 pstr
->raw_mbs_idx
= idx
;
805 pstr
->stop
-= offset
;
807 /* Then build the buffers. */
808 #ifdef RE_ENABLE_I18N
809 if (pstr
->mb_cur_max
> 1)
813 reg_errcode_t ret
= build_wcs_upper_buffer (pstr
);
814 if (BE (ret
!= REG_NOERROR
, 0))
818 build_wcs_buffer (pstr
);
821 #endif /* RE_ENABLE_I18N */
822 if (BE (pstr
->mbs_allocated
, 0))
825 build_upper_buffer (pstr
);
826 else if (pstr
->trans
!= NULL
)
827 re_string_translate_buffer (pstr
);
830 pstr
->valid_len
= pstr
->len
;
837 __attribute__ ((pure
))
838 re_string_peek_byte_case (const re_string_t
*pstr
, Idx idx
)
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr
->mbs_allocated
, 1))
845 return re_string_peek_byte (pstr
, idx
);
847 #ifdef RE_ENABLE_I18N
848 if (pstr
->mb_cur_max
> 1
849 && ! re_string_is_single_byte_char (pstr
, pstr
->cur_idx
+ idx
))
850 return re_string_peek_byte (pstr
, idx
);
853 off
= pstr
->cur_idx
+ idx
;
854 #ifdef RE_ENABLE_I18N
855 if (pstr
->offsets_needed
)
856 off
= pstr
->offsets
[off
];
859 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
861 #ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr
->offsets_needed
&& !isascii (ch
))
867 return re_string_peek_byte (pstr
, idx
);
874 re_string_fetch_byte_case (re_string_t
*pstr
)
876 if (BE (!pstr
->mbs_allocated
, 1))
877 return re_string_fetch_byte (pstr
);
879 #ifdef RE_ENABLE_I18N
880 if (pstr
->offsets_needed
)
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
892 if (!re_string_first_byte (pstr
, pstr
->cur_idx
))
893 return re_string_fetch_byte (pstr
);
895 off
= pstr
->offsets
[pstr
->cur_idx
];
896 ch
= pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ off
];
899 return re_string_fetch_byte (pstr
);
901 re_string_skip_bytes (pstr
,
902 re_string_char_size_at (pstr
, pstr
->cur_idx
));
907 return pstr
->raw_mbs
[pstr
->raw_mbs_idx
+ pstr
->cur_idx
++];
911 re_string_destruct (re_string_t
*pstr
)
913 #ifdef RE_ENABLE_I18N
915 re_free (pstr
->offsets
);
916 #endif /* RE_ENABLE_I18N */
917 if (pstr
->mbs_allocated
)
921 /* Return the context at IDX in INPUT. */
924 re_string_context_at (const re_string_t
*input
, Idx idx
, int eflags
)
928 /* In this case, we use the value stored in input->tip_context,
929 since we can't know the character in input->mbs[-1] here. */
930 return input
->tip_context
;
931 if (BE (idx
== input
->len
, 0))
932 return ((eflags
& REG_NOTEOL
) ? CONTEXT_ENDBUF
933 : CONTEXT_NEWLINE
| CONTEXT_ENDBUF
);
934 #ifdef RE_ENABLE_I18N
935 if (input
->mb_cur_max
> 1)
939 while(input
->wcs
[wc_idx
] == WEOF
)
941 #if defined DEBUG && DEBUG
942 /* It must not happen. */
943 assert (wc_idx
>= 0);
947 return input
->tip_context
;
949 wc
= input
->wcs
[wc_idx
];
950 if (BE (input
->word_ops_used
!= 0, 0) && IS_WIDE_WORD_CHAR (wc
))
952 return (IS_WIDE_NEWLINE (wc
) && input
->newline_anchor
953 ? CONTEXT_NEWLINE
: 0);
958 c
= re_string_byte_at (input
, idx
);
959 if (bitset_contain (input
->word_char
, c
))
961 return IS_NEWLINE (c
) && input
->newline_anchor
? CONTEXT_NEWLINE
: 0;
965 /* Functions for set operation. */
968 __attribute_warn_unused_result__
969 re_node_set_alloc (re_node_set
*set
, Idx size
)
973 set
->elems
= re_malloc (Idx
, size
);
974 if (BE (set
->elems
== NULL
, 0) && (MALLOC_0_IS_NONNULL
|| size
!= 0))
980 __attribute_warn_unused_result__
981 re_node_set_init_1 (re_node_set
*set
, Idx elem
)
985 set
->elems
= re_malloc (Idx
, 1);
986 if (BE (set
->elems
== NULL
, 0))
988 set
->alloc
= set
->nelem
= 0;
991 set
->elems
[0] = elem
;
996 __attribute_warn_unused_result__
997 re_node_set_init_2 (re_node_set
*set
, Idx elem1
, Idx elem2
)
1000 set
->elems
= re_malloc (Idx
, 2);
1001 if (BE (set
->elems
== NULL
, 0))
1006 set
->elems
[0] = elem1
;
1013 set
->elems
[0] = elem1
;
1014 set
->elems
[1] = elem2
;
1018 set
->elems
[0] = elem2
;
1019 set
->elems
[1] = elem1
;
1025 static reg_errcode_t
1026 __attribute_warn_unused_result__
1027 re_node_set_init_copy (re_node_set
*dest
, const re_node_set
*src
)
1029 dest
->nelem
= src
->nelem
;
1032 dest
->alloc
= dest
->nelem
;
1033 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1034 if (BE (dest
->elems
== NULL
, 0))
1036 dest
->alloc
= dest
->nelem
= 0;
1039 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1042 re_node_set_init_empty (dest
);
1046 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1050 static reg_errcode_t
1051 __attribute_warn_unused_result__
1052 re_node_set_add_intersect (re_node_set
*dest
, const re_node_set
*src1
,
1053 const re_node_set
*src2
)
1055 Idx i1
, i2
, is
, id
, delta
, sbase
;
1056 if (src1
->nelem
== 0 || src2
->nelem
== 0)
1059 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060 conservative estimate. */
1061 if (src1
->nelem
+ src2
->nelem
+ dest
->nelem
> dest
->alloc
)
1063 Idx new_alloc
= src1
->nelem
+ src2
->nelem
+ dest
->alloc
;
1064 Idx
*new_elems
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1065 if (BE (new_elems
== NULL
, 0))
1067 dest
->elems
= new_elems
;
1068 dest
->alloc
= new_alloc
;
1071 /* Find the items in the intersection of SRC1 and SRC2, and copy
1072 into the top of DEST those that are not already in DEST itself. */
1073 sbase
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
;
1074 i1
= src1
->nelem
- 1;
1075 i2
= src2
->nelem
- 1;
1076 id
= dest
->nelem
- 1;
1079 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1081 /* Try to find the item in DEST. Maybe we could binary search? */
1082 while (id
>= 0 && dest
->elems
[id
] > src1
->elems
[i1
])
1085 if (id
< 0 || dest
->elems
[id
] != src1
->elems
[i1
])
1086 dest
->elems
[--sbase
] = src1
->elems
[i1
];
1088 if (--i1
< 0 || --i2
< 0)
1092 /* Lower the highest of the two items. */
1093 else if (src1
->elems
[i1
] < src2
->elems
[i2
])
1105 id
= dest
->nelem
- 1;
1106 is
= dest
->nelem
+ src1
->nelem
+ src2
->nelem
- 1;
1107 delta
= is
- sbase
+ 1;
1109 /* Now copy. When DELTA becomes zero, the remaining
1110 DEST elements are already in place; this is more or
1111 less the same loop that is in re_node_set_merge. */
1112 dest
->nelem
+= delta
;
1113 if (delta
> 0 && id
>= 0)
1116 if (dest
->elems
[is
] > dest
->elems
[id
])
1118 /* Copy from the top. */
1119 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1125 /* Slide from the bottom. */
1126 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1132 /* Copy remaining SRC elements. */
1133 memcpy (dest
->elems
, dest
->elems
+ sbase
, delta
* sizeof (Idx
));
1138 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1141 static reg_errcode_t
1142 __attribute_warn_unused_result__
1143 re_node_set_init_union (re_node_set
*dest
, const re_node_set
*src1
,
1144 const re_node_set
*src2
)
1147 if (src1
!= NULL
&& src1
->nelem
> 0 && src2
!= NULL
&& src2
->nelem
> 0)
1149 dest
->alloc
= src1
->nelem
+ src2
->nelem
;
1150 dest
->elems
= re_malloc (Idx
, dest
->alloc
);
1151 if (BE (dest
->elems
== NULL
, 0))
1156 if (src1
!= NULL
&& src1
->nelem
> 0)
1157 return re_node_set_init_copy (dest
, src1
);
1158 else if (src2
!= NULL
&& src2
->nelem
> 0)
1159 return re_node_set_init_copy (dest
, src2
);
1161 re_node_set_init_empty (dest
);
1164 for (i1
= i2
= id
= 0 ; i1
< src1
->nelem
&& i2
< src2
->nelem
;)
1166 if (src1
->elems
[i1
] > src2
->elems
[i2
])
1168 dest
->elems
[id
++] = src2
->elems
[i2
++];
1171 if (src1
->elems
[i1
] == src2
->elems
[i2
])
1173 dest
->elems
[id
++] = src1
->elems
[i1
++];
1175 if (i1
< src1
->nelem
)
1177 memcpy (dest
->elems
+ id
, src1
->elems
+ i1
,
1178 (src1
->nelem
- i1
) * sizeof (Idx
));
1179 id
+= src1
->nelem
- i1
;
1181 else if (i2
< src2
->nelem
)
1183 memcpy (dest
->elems
+ id
, src2
->elems
+ i2
,
1184 (src2
->nelem
- i2
) * sizeof (Idx
));
1185 id
+= src2
->nelem
- i2
;
1191 /* Calculate the union set of the sets DEST and SRC. And store it to
1192 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1194 static reg_errcode_t
1195 __attribute_warn_unused_result__
1196 re_node_set_merge (re_node_set
*dest
, const re_node_set
*src
)
1198 Idx is
, id
, sbase
, delta
;
1199 if (src
== NULL
|| src
->nelem
== 0)
1201 if (dest
->alloc
< 2 * src
->nelem
+ dest
->nelem
)
1203 Idx new_alloc
= 2 * (src
->nelem
+ dest
->alloc
);
1204 Idx
*new_buffer
= re_realloc (dest
->elems
, Idx
, new_alloc
);
1205 if (BE (new_buffer
== NULL
, 0))
1207 dest
->elems
= new_buffer
;
1208 dest
->alloc
= new_alloc
;
1211 if (BE (dest
->nelem
== 0, 0))
1213 dest
->nelem
= src
->nelem
;
1214 memcpy (dest
->elems
, src
->elems
, src
->nelem
* sizeof (Idx
));
1218 /* Copy into the top of DEST the items of SRC that are not
1219 found in DEST. Maybe we could binary search in DEST? */
1220 for (sbase
= dest
->nelem
+ 2 * src
->nelem
,
1221 is
= src
->nelem
- 1, id
= dest
->nelem
- 1; is
>= 0 && id
>= 0; )
1223 if (dest
->elems
[id
] == src
->elems
[is
])
1225 else if (dest
->elems
[id
] < src
->elems
[is
])
1226 dest
->elems
[--sbase
] = src
->elems
[is
--];
1227 else /* if (dest->elems[id] > src->elems[is]) */
1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1235 memcpy (dest
->elems
+ sbase
, src
->elems
, (is
+ 1) * sizeof (Idx
));
1238 id
= dest
->nelem
- 1;
1239 is
= dest
->nelem
+ 2 * src
->nelem
- 1;
1240 delta
= is
- sbase
+ 1;
1244 /* Now copy. When DELTA becomes zero, the remaining
1245 DEST elements are already in place. */
1246 dest
->nelem
+= delta
;
1249 if (dest
->elems
[is
] > dest
->elems
[id
])
1251 /* Copy from the top. */
1252 dest
->elems
[id
+ delta
--] = dest
->elems
[is
--];
1258 /* Slide from the bottom. */
1259 dest
->elems
[id
+ delta
] = dest
->elems
[id
];
1262 /* Copy remaining SRC elements. */
1263 memcpy (dest
->elems
, dest
->elems
+ sbase
,
1264 delta
* sizeof (Idx
));
1273 /* Insert the new element ELEM to the re_node_set* SET.
1274 SET should not already have ELEM.
1275 Return true if successful. */
1278 __attribute_warn_unused_result__
1279 re_node_set_insert (re_node_set
*set
, Idx elem
)
1282 /* In case the set is empty. */
1283 if (set
->alloc
== 0)
1284 return BE (re_node_set_init_1 (set
, elem
) == REG_NOERROR
, 1);
1286 if (BE (set
->nelem
, 0) == 0)
1288 /* We already guaranteed above that set->alloc != 0. */
1289 set
->elems
[0] = elem
;
1294 /* Realloc if we need. */
1295 if (set
->alloc
== set
->nelem
)
1298 set
->alloc
= set
->alloc
* 2;
1299 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1300 if (BE (new_elems
== NULL
, 0))
1302 set
->elems
= new_elems
;
1305 /* Move the elements which follows the new element. Test the
1306 first element separately to skip a check in the inner loop. */
1307 if (elem
< set
->elems
[0])
1310 for (idx
= set
->nelem
; idx
> 0; idx
--)
1311 set
->elems
[idx
] = set
->elems
[idx
- 1];
1315 for (idx
= set
->nelem
; set
->elems
[idx
- 1] > elem
; idx
--)
1316 set
->elems
[idx
] = set
->elems
[idx
- 1];
1319 /* Insert the new element. */
1320 set
->elems
[idx
] = elem
;
1325 /* Insert the new element ELEM to the re_node_set* SET.
1326 SET should not already have any element greater than or equal to ELEM.
1327 Return true if successful. */
1330 __attribute_warn_unused_result__
1331 re_node_set_insert_last (re_node_set
*set
, Idx elem
)
1333 /* Realloc if we need. */
1334 if (set
->alloc
== set
->nelem
)
1337 set
->alloc
= (set
->alloc
+ 1) * 2;
1338 new_elems
= re_realloc (set
->elems
, Idx
, set
->alloc
);
1339 if (BE (new_elems
== NULL
, 0))
1341 set
->elems
= new_elems
;
1344 /* Insert the new element. */
1345 set
->elems
[set
->nelem
++] = elem
;
1349 /* Compare two node sets SET1 and SET2.
1350 Return true if SET1 and SET2 are equivalent. */
1353 __attribute__ ((pure
))
1354 re_node_set_compare (const re_node_set
*set1
, const re_node_set
*set2
)
1357 if (set1
== NULL
|| set2
== NULL
|| set1
->nelem
!= set2
->nelem
)
1359 for (i
= set1
->nelem
; --i
>= 0 ; )
1360 if (set1
->elems
[i
] != set2
->elems
[i
])
1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1368 __attribute__ ((pure
))
1369 re_node_set_contains (const re_node_set
*set
, Idx elem
)
1371 __re_size_t idx
, right
, mid
;
1372 if (set
->nelem
<= 0)
1375 /* Binary search the element. */
1377 right
= set
->nelem
- 1;
1380 mid
= (idx
+ right
) / 2;
1381 if (set
->elems
[mid
] < elem
)
1386 return set
->elems
[idx
] == elem
? idx
+ 1 : 0;
1390 re_node_set_remove_at (re_node_set
*set
, Idx idx
)
1392 if (idx
< 0 || idx
>= set
->nelem
)
1395 for (; idx
< set
->nelem
; idx
++)
1396 set
->elems
[idx
] = set
->elems
[idx
+ 1];
1400 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401 Or return -1 if an error occurred. */
1404 re_dfa_add_node (re_dfa_t
*dfa
, re_token_t token
)
1406 if (BE (dfa
->nodes_len
>= dfa
->nodes_alloc
, 0))
1408 size_t new_nodes_alloc
= dfa
->nodes_alloc
* 2;
1409 Idx
*new_nexts
, *new_indices
;
1410 re_node_set
*new_edests
, *new_eclosures
;
1411 re_token_t
*new_nodes
;
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size
= MAX (sizeof (re_token_t
),
1415 MAX (sizeof (re_node_set
),
1417 if (BE (MIN (IDX_MAX
, SIZE_MAX
/ max_object_size
) < new_nodes_alloc
, 0))
1420 new_nodes
= re_realloc (dfa
->nodes
, re_token_t
, new_nodes_alloc
);
1421 if (BE (new_nodes
== NULL
, 0))
1423 dfa
->nodes
= new_nodes
;
1424 new_nexts
= re_realloc (dfa
->nexts
, Idx
, new_nodes_alloc
);
1425 new_indices
= re_realloc (dfa
->org_indices
, Idx
, new_nodes_alloc
);
1426 new_edests
= re_realloc (dfa
->edests
, re_node_set
, new_nodes_alloc
);
1427 new_eclosures
= re_realloc (dfa
->eclosures
, re_node_set
, new_nodes_alloc
);
1428 if (BE (new_nexts
== NULL
|| new_indices
== NULL
1429 || new_edests
== NULL
|| new_eclosures
== NULL
, 0))
1431 re_free (new_nexts
);
1432 re_free (new_indices
);
1433 re_free (new_edests
);
1434 re_free (new_eclosures
);
1437 dfa
->nexts
= new_nexts
;
1438 dfa
->org_indices
= new_indices
;
1439 dfa
->edests
= new_edests
;
1440 dfa
->eclosures
= new_eclosures
;
1441 dfa
->nodes_alloc
= new_nodes_alloc
;
1443 dfa
->nodes
[dfa
->nodes_len
] = token
;
1444 dfa
->nodes
[dfa
->nodes_len
].constraint
= 0;
1445 #ifdef RE_ENABLE_I18N
1446 dfa
->nodes
[dfa
->nodes_len
].accept_mb
=
1447 ((token
.type
== OP_PERIOD
&& dfa
->mb_cur_max
> 1)
1448 || token
.type
== COMPLEX_BRACKET
);
1450 dfa
->nexts
[dfa
->nodes_len
] = -1;
1451 re_node_set_init_empty (dfa
->edests
+ dfa
->nodes_len
);
1452 re_node_set_init_empty (dfa
->eclosures
+ dfa
->nodes_len
);
1453 return dfa
->nodes_len
++;
1457 calc_state_hash (const re_node_set
*nodes
, unsigned int context
)
1459 re_hashval_t hash
= nodes
->nelem
+ context
;
1461 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1462 hash
+= nodes
->elems
[i
];
1466 /* Search for the state whose node_set is equivalent to NODES.
1467 Return the pointer to the state, if we found it in the DFA.
1468 Otherwise create the new one and return it. In case of an error
1469 return NULL and set the error code in ERR.
1470 Note: - We assume NULL as the invalid state, then it is possible that
1471 return value is NULL and ERR is REG_NOERROR.
1472 - We never return non-NULL value in case of any errors, it is for
1475 static re_dfastate_t
*
1476 __attribute_warn_unused_result__
1477 re_acquire_state (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1478 const re_node_set
*nodes
)
1481 re_dfastate_t
*new_state
;
1482 struct re_state_table_entry
*spot
;
1484 #if defined GCC_LINT || defined lint
1485 /* Suppress bogus uninitialized-variable warnings. */
1488 if (BE (nodes
->nelem
== 0, 0))
1493 hash
= calc_state_hash (nodes
, 0);
1494 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1496 for (i
= 0 ; i
< spot
->num
; i
++)
1498 re_dfastate_t
*state
= spot
->array
[i
];
1499 if (hash
!= state
->hash
)
1501 if (re_node_set_compare (&state
->nodes
, nodes
))
1505 /* There are no appropriate state in the dfa, create the new one. */
1506 new_state
= create_ci_newstate (dfa
, nodes
, hash
);
1507 if (BE (new_state
== NULL
, 0))
1513 /* Search for the state whose node_set is equivalent to NODES and
1514 whose context is equivalent to CONTEXT.
1515 Return the pointer to the state, if we found it in the DFA.
1516 Otherwise create the new one and return it. In case of an error
1517 return NULL and set the error code in ERR.
1518 Note: - We assume NULL as the invalid state, then it is possible that
1519 return value is NULL and ERR is REG_NOERROR.
1520 - We never return non-NULL value in case of any errors, it is for
1523 static re_dfastate_t
*
1524 __attribute_warn_unused_result__
1525 re_acquire_state_context (reg_errcode_t
*err
, const re_dfa_t
*dfa
,
1526 const re_node_set
*nodes
, unsigned int context
)
1529 re_dfastate_t
*new_state
;
1530 struct re_state_table_entry
*spot
;
1532 #if defined GCC_LINT || defined lint
1533 /* Suppress bogus uninitialized-variable warnings. */
1536 if (nodes
->nelem
== 0)
1541 hash
= calc_state_hash (nodes
, context
);
1542 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1544 for (i
= 0 ; i
< spot
->num
; i
++)
1546 re_dfastate_t
*state
= spot
->array
[i
];
1547 if (state
->hash
== hash
1548 && state
->context
== context
1549 && re_node_set_compare (state
->entrance_nodes
, nodes
))
1552 /* There are no appropriate state in 'dfa', create the new one. */
1553 new_state
= create_cd_newstate (dfa
, nodes
, context
, hash
);
1554 if (BE (new_state
== NULL
, 0))
1560 /* Finish initialization of the new state NEWSTATE, and using its hash value
1561 HASH put in the appropriate bucket of DFA's state table. Return value
1562 indicates the error code if failed. */
1564 static reg_errcode_t
1565 __attribute_warn_unused_result__
1566 register_state (const re_dfa_t
*dfa
, re_dfastate_t
*newstate
,
1569 struct re_state_table_entry
*spot
;
1573 newstate
->hash
= hash
;
1574 err
= re_node_set_alloc (&newstate
->non_eps_nodes
, newstate
->nodes
.nelem
);
1575 if (BE (err
!= REG_NOERROR
, 0))
1577 for (i
= 0; i
< newstate
->nodes
.nelem
; i
++)
1579 Idx elem
= newstate
->nodes
.elems
[i
];
1580 if (!IS_EPSILON_NODE (dfa
->nodes
[elem
].type
))
1581 if (! re_node_set_insert_last (&newstate
->non_eps_nodes
, elem
))
1585 spot
= dfa
->state_table
+ (hash
& dfa
->state_hash_mask
);
1586 if (BE (spot
->alloc
<= spot
->num
, 0))
1588 Idx new_alloc
= 2 * spot
->num
+ 2;
1589 re_dfastate_t
**new_array
= re_realloc (spot
->array
, re_dfastate_t
*,
1591 if (BE (new_array
== NULL
, 0))
1593 spot
->array
= new_array
;
1594 spot
->alloc
= new_alloc
;
1596 spot
->array
[spot
->num
++] = newstate
;
1601 free_state (re_dfastate_t
*state
)
1603 re_node_set_free (&state
->non_eps_nodes
);
1604 re_node_set_free (&state
->inveclosure
);
1605 if (state
->entrance_nodes
!= &state
->nodes
)
1607 re_node_set_free (state
->entrance_nodes
);
1608 re_free (state
->entrance_nodes
);
1610 re_node_set_free (&state
->nodes
);
1611 re_free (state
->word_trtable
);
1612 re_free (state
->trtable
);
1616 /* Create the new state which is independent of contexts.
1617 Return the new state if succeeded, otherwise return NULL. */
1619 static re_dfastate_t
*
1620 __attribute_warn_unused_result__
1621 create_ci_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1626 re_dfastate_t
*newstate
;
1628 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1629 if (BE (newstate
== NULL
, 0))
1631 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1632 if (BE (err
!= REG_NOERROR
, 0))
1638 newstate
->entrance_nodes
= &newstate
->nodes
;
1639 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1641 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1642 re_token_type_t type
= node
->type
;
1643 if (type
== CHARACTER
&& !node
->constraint
)
1645 #ifdef RE_ENABLE_I18N
1646 newstate
->accept_mb
|= node
->accept_mb
;
1647 #endif /* RE_ENABLE_I18N */
1649 /* If the state has the halt node, the state is a halt state. */
1650 if (type
== END_OF_RE
)
1652 else if (type
== OP_BACK_REF
)
1653 newstate
->has_backref
= 1;
1654 else if (type
== ANCHOR
|| node
->constraint
)
1655 newstate
->has_constraint
= 1;
1657 err
= register_state (dfa
, newstate
, hash
);
1658 if (BE (err
!= REG_NOERROR
, 0))
1660 free_state (newstate
);
1666 /* Create the new state which is depend on the context CONTEXT.
1667 Return the new state if succeeded, otherwise return NULL. */
1669 static re_dfastate_t
*
1670 __attribute_warn_unused_result__
1671 create_cd_newstate (const re_dfa_t
*dfa
, const re_node_set
*nodes
,
1672 unsigned int context
, re_hashval_t hash
)
1674 Idx i
, nctx_nodes
= 0;
1676 re_dfastate_t
*newstate
;
1678 newstate
= (re_dfastate_t
*) calloc (sizeof (re_dfastate_t
), 1);
1679 if (BE (newstate
== NULL
, 0))
1681 err
= re_node_set_init_copy (&newstate
->nodes
, nodes
);
1682 if (BE (err
!= REG_NOERROR
, 0))
1688 newstate
->context
= context
;
1689 newstate
->entrance_nodes
= &newstate
->nodes
;
1691 for (i
= 0 ; i
< nodes
->nelem
; i
++)
1693 re_token_t
*node
= dfa
->nodes
+ nodes
->elems
[i
];
1694 re_token_type_t type
= node
->type
;
1695 unsigned int constraint
= node
->constraint
;
1697 if (type
== CHARACTER
&& !constraint
)
1699 #ifdef RE_ENABLE_I18N
1700 newstate
->accept_mb
|= node
->accept_mb
;
1701 #endif /* RE_ENABLE_I18N */
1703 /* If the state has the halt node, the state is a halt state. */
1704 if (type
== END_OF_RE
)
1706 else if (type
== OP_BACK_REF
)
1707 newstate
->has_backref
= 1;
1711 if (newstate
->entrance_nodes
== &newstate
->nodes
)
1713 newstate
->entrance_nodes
= re_malloc (re_node_set
, 1);
1714 if (BE (newstate
->entrance_nodes
== NULL
, 0))
1716 free_state (newstate
);
1719 if (re_node_set_init_copy (newstate
->entrance_nodes
, nodes
)
1723 newstate
->has_constraint
= 1;
1726 if (NOT_SATISFY_PREV_CONSTRAINT (constraint
,context
))
1728 re_node_set_remove_at (&newstate
->nodes
, i
- nctx_nodes
);
1733 err
= register_state (dfa
, newstate
, hash
);
1734 if (BE (err
!= REG_NOERROR
, 0))
1736 free_state (newstate
);