1 // class template regex -*- C++ -*-
3 // Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 * @file bits/regex_compiler.tcc
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
31 // FIXME make comments doxygen format.
34 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
35 // (http://swtch.com/~rsc/regexp/regexp1.html),
36 // but doesn't strictly follow it.
38 // When compiling, states are *chained* instead of tree- or graph-constructed.
39 // It's more like structured programs: there's if statement and loop statement.
41 // For alternative structure (say "a|b"), aka "if statement", two branches
42 // should be constructed. However, these two shall merge to an "end_tag" at
43 // the end of this operator:
47 // => begin_tag end_tag =>
51 // This is the difference between this implementation and that in Russ's
54 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
55 // All dummy nodes will be eliminated at the end of compilation.
58 namespace std _GLIBCXX_VISIBILITY(default)
60 _GLIBCXX_BEGIN_NAMESPACE_VERSION
64 template<typename _TraitsT>
66 _Compiler(const _CharT* __b, const _CharT* __e,
67 const typename _TraitsT::locale_type& __loc, _FlagT __flags)
68 : _M_flags(_S_validate(__flags)),
69 _M_scanner(__b, __e, _M_flags, __loc),
70 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
71 _M_traits(_M_nfa->_M_traits),
72 _M_ctype(std::use_facet<_CtypeT>(__loc))
74 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
75 __r._M_append(_M_nfa->_M_insert_subexpr_begin());
76 this->_M_disjunction();
77 if (!_M_match_token(_ScannerT::_S_token_eof))
78 __throw_regex_error(regex_constants::error_paren);
79 __r._M_append(_M_pop());
80 __glibcxx_assert(_M_stack.empty());
81 __r._M_append(_M_nfa->_M_insert_subexpr_end());
82 __r._M_append(_M_nfa->_M_insert_accept());
83 _M_nfa->_M_eliminate_dummy();
86 template<typename _TraitsT>
91 this->_M_alternative();
92 while (_M_match_token(_ScannerT::_S_token_or))
94 _StateSeqT __alt1 = _M_pop();
95 this->_M_alternative();
96 _StateSeqT __alt2 = _M_pop();
97 auto __end = _M_nfa->_M_insert_dummy();
98 __alt1._M_append(__end);
99 __alt2._M_append(__end);
100 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
101 // executes _M_alt before _M_next, as well as executing left
102 // alternative before right one.
103 _M_stack.push(_StateSeqT(*_M_nfa,
104 _M_nfa->_M_insert_alt(
105 __alt2._M_start, __alt1._M_start, false),
110 template<typename _TraitsT>
112 _Compiler<_TraitsT>::
117 _StateSeqT __re = _M_pop();
118 this->_M_alternative();
119 __re._M_append(_M_pop());
123 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
126 template<typename _TraitsT>
128 _Compiler<_TraitsT>::
131 if (this->_M_assertion())
135 while (this->_M_quantifier())
142 template<typename _TraitsT>
144 _Compiler<_TraitsT>::
147 if (_M_match_token(_ScannerT::_S_token_line_begin))
148 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
149 else if (_M_match_token(_ScannerT::_S_token_line_end))
150 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
151 else if (_M_match_token(_ScannerT::_S_token_word_bound))
152 // _M_value[0] == 'n' means it's negative, say "not word boundary".
153 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
154 _M_insert_word_bound(_M_value[0] == 'n')));
155 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
157 auto __neg = _M_value[0] == 'n';
158 this->_M_disjunction();
159 if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
160 __throw_regex_error(regex_constants::error_paren);
161 auto __tmp = _M_pop();
162 __tmp._M_append(_M_nfa->_M_insert_accept());
166 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
173 template<typename _TraitsT>
175 _Compiler<_TraitsT>::
178 bool __neg = (_M_flags & regex_constants::ECMAScript);
179 auto __init = [this, &__neg]()
181 if (_M_stack.empty())
182 __throw_regex_error(regex_constants::error_badrepeat);
183 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
185 if (_M_match_token(_ScannerT::_S_token_closure0))
189 _StateSeqT __r(*_M_nfa,
190 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
191 __e._M_start, __neg));
195 else if (_M_match_token(_ScannerT::_S_token_closure1))
199 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
200 __e._M_start, __neg));
203 else if (_M_match_token(_ScannerT::_S_token_opt))
207 auto __end = _M_nfa->_M_insert_dummy();
208 _StateSeqT __r(*_M_nfa,
209 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
210 __e._M_start, __neg));
211 __e._M_append(__end);
212 __r._M_append(__end);
215 else if (_M_match_token(_ScannerT::_S_token_interval_begin))
217 if (_M_stack.empty())
218 __throw_regex_error(regex_constants::error_badrepeat);
219 if (!_M_match_token(_ScannerT::_S_token_dup_count))
220 __throw_regex_error(regex_constants::error_badbrace);
221 _StateSeqT __r(_M_pop());
222 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
223 long __min_rep = _M_cur_int_value(10);
228 if (_M_match_token(_ScannerT::_S_token_comma))
230 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
231 __n = _M_cur_int_value(10) - __min_rep;
235 if (!_M_match_token(_ScannerT::_S_token_interval_end))
236 __throw_regex_error(regex_constants::error_brace);
238 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
240 for (long __i = 0; __i < __min_rep; ++__i)
241 __e._M_append(__r._M_clone());
245 auto __tmp = __r._M_clone();
246 _StateSeqT __s(*_M_nfa,
247 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
248 __tmp._M_start, __neg));
249 __tmp._M_append(__s);
255 __throw_regex_error(regex_constants::error_badbrace);
256 auto __end = _M_nfa->_M_insert_dummy();
257 // _M_alt is the "match more" branch, and _M_next is the
258 // "match less" one. Switch _M_alt and _M_next of all created
259 // nodes. This is a hack but IMO works well.
260 std::stack<_StateIdT> __stack;
261 for (long __i = 0; __i < __n; ++__i)
263 auto __tmp = __r._M_clone();
264 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
267 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
269 __e._M_append(__end);
270 while (!__stack.empty())
272 auto& __tmp = (*_M_nfa)[__stack.top()];
274 std::swap(__tmp._M_next, __tmp._M_alt);
284 #define __INSERT_REGEX_MATCHER(__func, ...)\
286 if (!(_M_flags & regex_constants::icase))\
287 if (!(_M_flags & regex_constants::collate))\
288 __func<false, false>(__VA_ARGS__);\
290 __func<false, true>(__VA_ARGS__);\
292 if (!(_M_flags & regex_constants::collate))\
293 __func<true, false>(__VA_ARGS__);\
295 __func<true, true>(__VA_ARGS__);\
298 template<typename _TraitsT>
300 _Compiler<_TraitsT>::
303 if (_M_match_token(_ScannerT::_S_token_anychar))
305 if (!(_M_flags & regex_constants::ECMAScript))
306 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
308 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
310 else if (_M_try_char())
311 __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
312 else if (_M_match_token(_ScannerT::_S_token_backref))
313 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
314 _M_insert_backref(_M_cur_int_value(10))));
315 else if (_M_match_token(_ScannerT::_S_token_quoted_class))
316 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
317 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
319 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
320 this->_M_disjunction();
321 if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
322 __throw_regex_error(regex_constants::error_paren);
323 __r._M_append(_M_pop());
326 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
328 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
329 this->_M_disjunction();
330 if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
331 __throw_regex_error(regex_constants::error_paren);
332 __r._M_append(_M_pop());
333 __r._M_append(_M_nfa->_M_insert_subexpr_end());
336 else if (!_M_bracket_expression())
341 template<typename _TraitsT>
343 _Compiler<_TraitsT>::
344 _M_bracket_expression()
347 _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
348 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
350 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
353 #undef __INSERT_REGEX_MATCHER
355 template<typename _TraitsT>
356 template<bool __icase, bool __collate>
358 _Compiler<_TraitsT>::
359 _M_insert_any_matcher_ecma()
361 _M_stack.push(_StateSeqT(*_M_nfa,
362 _M_nfa->_M_insert_matcher
363 (_AnyMatcher<_TraitsT, true, __icase, __collate>
367 template<typename _TraitsT>
368 template<bool __icase, bool __collate>
370 _Compiler<_TraitsT>::
371 _M_insert_any_matcher_posix()
373 _M_stack.push(_StateSeqT(*_M_nfa,
374 _M_nfa->_M_insert_matcher
375 (_AnyMatcher<_TraitsT, false, __icase, __collate>
379 template<typename _TraitsT>
380 template<bool __icase, bool __collate>
382 _Compiler<_TraitsT>::
383 _M_insert_char_matcher()
385 _M_stack.push(_StateSeqT(*_M_nfa,
386 _M_nfa->_M_insert_matcher
387 (_CharMatcher<_TraitsT, __icase, __collate>
388 (_M_value[0], _M_traits))));
391 template<typename _TraitsT>
392 template<bool __icase, bool __collate>
394 _Compiler<_TraitsT>::
395 _M_insert_character_class_matcher()
397 __glibcxx_assert(_M_value.size() == 1);
398 _BracketMatcher<__icase, __collate> __matcher
399 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
400 __matcher._M_add_character_class(_M_value, false);
401 __matcher._M_ready();
402 _M_stack.push(_StateSeqT(*_M_nfa,
403 _M_nfa->_M_insert_matcher(std::move(__matcher))));
406 template<typename _TraitsT>
407 template<bool __icase, bool __collate>
409 _Compiler<_TraitsT>::
410 _M_insert_bracket_matcher(bool __neg)
412 _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits);
413 _BracketState __last_char;
415 __last_char.set(_M_value[0]);
416 else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
417 // Dash as first character is a normal character.
418 __last_char.set('-');
419 while (_M_expression_term(__last_char, __matcher))
421 if (__last_char._M_is_char())
422 __matcher._M_add_char(__last_char.get());
423 __matcher._M_ready();
424 _M_stack.push(_StateSeqT(
426 _M_nfa->_M_insert_matcher(std::move(__matcher))));
429 template<typename _TraitsT>
430 template<bool __icase, bool __collate>
432 _Compiler<_TraitsT>::
433 _M_expression_term(_BracketState& __last_char,
434 _BracketMatcher<__icase, __collate>& __matcher)
436 if (_M_match_token(_ScannerT::_S_token_bracket_end))
439 // Add any previously cached char into the matcher and update cache.
440 const auto __push_char = [&](_CharT __ch)
442 if (__last_char._M_is_char())
443 __matcher._M_add_char(__last_char.get());
444 __last_char.set(__ch);
446 // Add any previously cached char into the matcher and update cache.
447 const auto __push_class = [&]
449 if (__last_char._M_is_char())
450 __matcher._M_add_char(__last_char.get());
451 // We don't cache anything here, just record that the last thing
452 // processed was a character class (or similar).
453 __last_char.reset(_BracketState::_Type::_Class);
456 if (_M_match_token(_ScannerT::_S_token_collsymbol))
458 auto __symbol = __matcher._M_add_collate_element(_M_value);
459 if (__symbol.size() == 1)
460 __push_char(__symbol[0]);
464 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
467 __matcher._M_add_equivalence_class(_M_value);
469 else if (_M_match_token(_ScannerT::_S_token_char_class_name))
472 __matcher._M_add_character_class(_M_value, false);
474 else if (_M_try_char())
475 __push_char(_M_value[0]);
476 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
477 // except when the '-' is the first or last character in the bracket
478 // expression ([--0]). ECMAScript treats all '-' after a range as a
479 // normal character. Also see above, where _M_expression_term gets called.
481 // As a result, POSIX rejects [-----], but ECMAScript doesn't.
482 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
483 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
485 // It turns out that no one reads BNFs ;)
486 else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
488 if (_M_match_token(_ScannerT::_S_token_bracket_end))
490 // For "-]" the dash is a literal character.
494 else if (__last_char._M_is_class())
496 // "\\w-" is invalid, start of range must be a single char.
497 __throw_regex_error(regex_constants::error_range,
498 "Invalid start of '[x-x]' range in "
499 "regular expression");
501 else if (__last_char._M_is_char())
506 __matcher._M_make_range(__last_char.get(), _M_value[0]);
509 else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
512 __matcher._M_make_range(__last_char.get(), '-');
516 __throw_regex_error(regex_constants::error_range,
517 "Invalid end of '[x-x]' range in "
518 "regular expression");
520 else if (_M_flags & regex_constants::ECMAScript)
522 // A dash that is not part of an existing range. Might be the
523 // start of a new range, or might just be a literal '-' char.
524 // Only ECMAScript allows that in the middle of a bracket expr.
528 __throw_regex_error(regex_constants::error_range,
529 "Invalid location of '-' within '[...]' in "
530 "POSIX regular expression");
532 else if (_M_match_token(_ScannerT::_S_token_quoted_class))
535 __matcher._M_add_character_class(_M_value,
536 _M_ctype.is(_CtypeT::upper,
540 __throw_regex_error(regex_constants::error_brack,
541 "Unexpected character within '[...]' in "
542 "regular expression");
546 template<typename _TraitsT>
548 _Compiler<_TraitsT>::
551 bool __is_char = false;
552 if (_M_match_token(_ScannerT::_S_token_oct_num))
555 _M_value.assign(1, _M_cur_int_value(8));
557 else if (_M_match_token(_ScannerT::_S_token_hex_num))
560 _M_value.assign(1, _M_cur_int_value(16));
562 else if (_M_match_token(_ScannerT::_S_token_ord_char))
567 template<typename _TraitsT>
569 _Compiler<_TraitsT>::
570 _M_match_token(_TokenT __token)
572 if (__token == _M_scanner._M_get_token())
574 _M_value = _M_scanner._M_get_value();
575 _M_scanner._M_advance();
581 template<typename _TraitsT>
583 _Compiler<_TraitsT>::
584 _M_cur_int_value(int __radix)
587 for (_CharT __c : _M_value)
588 if (__builtin_mul_overflow(__v, __radix, &__v)
589 || __builtin_add_overflow(__v, _M_traits.value(__c, __radix), &__v))
590 std::__throw_regex_error(regex_constants::error_backref,
591 "invalid back reference");
595 template<typename _TraitsT, bool __icase, bool __collate>
597 _BracketMatcher<_TraitsT, __icase, __collate>::
598 _M_apply(_CharT __ch, false_type) const
602 if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
603 _M_translator._M_translate(__ch)))
605 auto __s = _M_translator._M_transform(__ch);
606 for (auto& __it : _M_range_set)
607 if (_M_translator._M_match_range(__it.first, __it.second, __s))
609 if (_M_traits.isctype(__ch, _M_class_set))
611 if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
612 _M_traits.transform_primary(&__ch, &__ch+1))
613 != _M_equiv_set.end())
615 for (auto& __it : _M_neg_class_set)
616 if (!_M_traits.isctype(__ch, __it))
619 }() ^ _M_is_non_matching;
621 } // namespace __detail
623 _GLIBCXX_END_NAMESPACE_VERSION