1 // class template regex -*- C++ -*-
3 // Copyright (C) 2013 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 * @file bits/regex_compiler.tcc
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
31 namespace std _GLIBCXX_VISIBILITY(default)
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
37 template<typename _BiIter>
42 if (_M_current == _M_end)
44 _M_curToken = _S_token_eof;
48 _CharT __c = *_M_current;
49 if (_M_state & _S_state_in_bracket)
54 if (_M_state & _S_state_in_brace)
60 // TODO: re-enable line anchors when _M_assertion is implemented.
61 // See PR libstdc++/47724
62 else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^'))
64 _M_curToken = _S_token_line_begin;
68 else if (__c == _M_ctype.widen('$'))
70 _M_curToken = _S_token_line_end;
75 else if (__c == _M_ctype.widen('.'))
77 _M_curToken = _S_token_anychar;
81 else if (__c == _M_ctype.widen('*'))
83 _M_curToken = _S_token_closure0;
87 else if (__c == _M_ctype.widen('+'))
89 _M_curToken = _S_token_closure1;
93 else if (__c == _M_ctype.widen('|'))
95 _M_curToken = _S_token_or;
99 else if (__c == _M_ctype.widen('['))
101 if (*++_M_current == _M_ctype.widen('^'))
103 _M_curToken = _S_token_bracket_inverse_begin;
107 _M_curToken = _S_token_bracket_begin;
108 _M_state |= _S_state_in_bracket;
111 else if (__c == _M_ctype.widen('\\'))
116 else if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
118 if (__c == _M_ctype.widen('('))
120 _M_curToken = _S_token_subexpr_begin;
124 else if (__c == _M_ctype.widen(')'))
126 _M_curToken = _S_token_subexpr_end;
130 else if (__c == _M_ctype.widen('{'))
132 _M_curToken = _S_token_interval_begin;
133 _M_state |= _S_state_in_brace;
139 _M_curToken = _S_token_ord_char;
140 _M_curValue.assign(1, __c);
144 template<typename _BiIter>
149 if (_M_ctype.is(_CtypeT::digit, *_M_current))
151 _M_curToken = _S_token_dup_count;
152 _M_curValue.assign(1, *_M_current);
154 while (_M_current != _M_end
155 && _M_ctype.is(_CtypeT::digit, *_M_current))
157 _M_curValue += *_M_current;
162 else if (*_M_current == _M_ctype.widen(','))
164 _M_curToken = _S_token_comma;
168 if (_M_flags & (regex_constants::basic | regex_constants::grep))
170 if (*_M_current == _M_ctype.widen('\\'))
175 if (*_M_current == _M_ctype.widen('}'))
177 _M_curToken = _S_token_interval_end;
178 _M_state &= ~_S_state_in_brace;
185 template<typename _BiIter>
190 if (*_M_current == _M_ctype.widen('['))
193 if (_M_current == _M_end)
195 _M_curToken = _S_token_eof;
199 if (*_M_current == _M_ctype.widen('.'))
201 _M_curToken = _S_token_collsymbol;
205 else if (*_M_current == _M_ctype.widen(':'))
207 _M_curToken = _S_token_char_class_name;
211 else if (*_M_current == _M_ctype.widen('='))
213 _M_curToken = _S_token_equiv_class_name;
218 else if (*_M_current == _M_ctype.widen('-'))
220 _M_curToken = _S_token_dash;
224 else if (*_M_current == _M_ctype.widen(']'))
226 _M_curToken = _S_token_bracket_end;
227 _M_state &= ~_S_state_in_bracket;
231 else if (*_M_current == _M_ctype.widen('\\'))
236 _M_curToken = _S_token_collelem_single;
237 _M_curValue.assign(1, *_M_current);
242 template<typename _BiIter>
248 if (_M_current == _M_end)
250 _M_curToken = _S_token_eof;
253 _CharT __c = *_M_current;
256 if (__c == _M_ctype.widen('('))
258 if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
260 _M_curToken = _S_token_ord_char;
261 _M_curValue.assign(1, __c);
264 _M_curToken = _S_token_subexpr_begin;
266 else if (__c == _M_ctype.widen(')'))
268 if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
270 _M_curToken = _S_token_ord_char;
271 _M_curValue.assign(1, __c);
274 _M_curToken = _S_token_subexpr_end;
276 else if (__c == _M_ctype.widen('{'))
278 if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
280 _M_curToken = _S_token_ord_char;
281 _M_curValue.assign(1, __c);
285 _M_curToken = _S_token_interval_begin;
286 _M_state |= _S_state_in_brace;
289 else if (__c == _M_ctype.widen('}'))
291 if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
293 _M_curToken = _S_token_ord_char;
294 _M_curValue.assign(1, __c);
298 if (!(_M_state && _S_state_in_brace))
299 __throw_regex_error(regex_constants::error_badbrace);
300 _M_state &= ~_S_state_in_brace;
301 _M_curToken = _S_token_interval_end;
304 else if (__c == _M_ctype.widen('x'))
307 if (_M_current == _M_end)
309 _M_curToken = _S_token_eof;
312 if (_M_ctype.is(_CtypeT::digit, *_M_current))
314 _M_curValue.assign(1, *_M_current);
316 if (_M_current == _M_end)
318 _M_curToken = _S_token_eof;
321 if (_M_ctype.is(_CtypeT::digit, *_M_current))
323 _M_curValue += *_M_current;
329 else if (__c == _M_ctype.widen('^')
330 || __c == _M_ctype.widen('.')
331 || __c == _M_ctype.widen('*')
332 || __c == _M_ctype.widen('$')
333 || __c == _M_ctype.widen('\\'))
335 _M_curToken = _S_token_ord_char;
336 _M_curValue.assign(1, __c);
338 else if (_M_ctype.is(_CtypeT::digit, __c))
340 _M_curToken = _S_token_backref;
341 _M_curValue.assign(1, __c);
343 else if (_M_state & _S_state_in_bracket)
345 if (__c == _M_ctype.widen('-')
346 || __c == _M_ctype.widen('[')
347 || __c == _M_ctype.widen(']'))
349 _M_curToken = _S_token_ord_char;
350 _M_curValue.assign(1, __c);
352 else if ((_M_flags & regex_constants::ECMAScript)
353 && __c == _M_ctype.widen('b'))
355 _M_curToken = _S_token_ord_char;
356 _M_curValue.assign(1, _M_ctype.widen(' '));
359 __throw_regex_error(regex_constants::error_escape);
362 __throw_regex_error(regex_constants::error_escape);
365 // Eats a character class or throwns an exception.
366 // current point to ':' delimiter on entry, char after ']' on return
367 template<typename _BiIter>
372 ++_M_current; // skip ':'
373 if (_M_current == _M_end)
374 __throw_regex_error(regex_constants::error_ctype);
375 for (_M_curValue.clear();
376 _M_current != _M_end && *_M_current != _M_ctype.widen(':');
378 _M_curValue += *_M_current;
379 if (_M_current == _M_end)
380 __throw_regex_error(regex_constants::error_ctype);
381 ++_M_current; // skip ':'
382 if (*_M_current != _M_ctype.widen(']'))
383 __throw_regex_error(regex_constants::error_ctype);
384 ++_M_current; // skip ']'
388 template<typename _BiIter>
393 ++_M_current; // skip '='
394 if (_M_current == _M_end)
395 __throw_regex_error(regex_constants::error_collate);
396 for (_M_curValue.clear();
397 _M_current != _M_end && *_M_current != _M_ctype.widen('=');
399 _M_curValue += *_M_current;
400 if (_M_current == _M_end)
401 __throw_regex_error(regex_constants::error_collate);
402 ++_M_current; // skip '='
403 if (*_M_current != _M_ctype.widen(']'))
404 __throw_regex_error(regex_constants::error_collate);
405 ++_M_current; // skip ']'
409 template<typename _BiIter>
414 ++_M_current; // skip '.'
415 if (_M_current == _M_end)
416 __throw_regex_error(regex_constants::error_collate);
417 for (_M_curValue.clear();
418 _M_current != _M_end && *_M_current != _M_ctype.widen('.');
420 _M_curValue += *_M_current;
421 if (_M_current == _M_end)
422 __throw_regex_error(regex_constants::error_collate);
423 ++_M_current; // skip '.'
424 if (*_M_current != _M_ctype.widen(']'))
425 __throw_regex_error(regex_constants::error_collate);
426 ++_M_current; // skip ']'
429 #ifdef _GLIBCXX_DEBUG
430 template<typename _BiIter>
433 _M_print(std::ostream& ostr)
437 case _S_token_anychar:
438 ostr << "any-character\n";
440 case _S_token_backref:
443 case _S_token_bracket_begin:
444 ostr << "bracket-begin\n";
446 case _S_token_bracket_inverse_begin:
447 ostr << "bracket-inverse-begin\n";
449 case _S_token_bracket_end:
450 ostr << "bracket-end\n";
452 case _S_token_char_class_name:
453 ostr << "char-class-name \"" << _M_curValue << "\"\n";
455 case _S_token_closure0:
456 ostr << "closure0\n";
458 case _S_token_closure1:
459 ostr << "closure1\n";
461 case _S_token_collelem_multi:
462 ostr << "coll-elem-multi \"" << _M_curValue << "\"\n";
464 case _S_token_collelem_single:
465 ostr << "coll-elem-single \"" << _M_curValue << "\"\n";
467 case _S_token_collsymbol:
468 ostr << "collsymbol \"" << _M_curValue << "\"\n";
476 case _S_token_dup_count:
477 ostr << "dup count: " << _M_curValue << "\n";
482 case _S_token_equiv_class_name:
483 ostr << "equiv-class-name \"" << _M_curValue << "\"\n";
485 case _S_token_interval_begin:
486 ostr << "interval begin\n";
488 case _S_token_interval_end:
489 ostr << "interval end\n";
491 case _S_token_line_begin:
492 ostr << "line begin\n";
494 case _S_token_line_end:
495 ostr << "line end\n";
503 case _S_token_ord_char:
504 ostr << "ordinary character: \"" << _M_value() << "\"\n";
506 case _S_token_subexpr_begin:
507 ostr << "subexpr begin\n";
509 case _S_token_subexpr_end:
510 ostr << "subexpr end\n";
512 case _S_token_word_begin:
513 ostr << "word begin\n";
515 case _S_token_word_end:
516 ostr << "word end\n";
518 case _S_token_unknown:
519 ostr << "-- unknown token --\n";
522 _GLIBCXX_DEBUG_ASSERT(false);
528 template<typename _InputIter, typename _CharT, typename _TraitsT>
529 _Compiler<_InputIter, _CharT, _TraitsT>::
530 _Compiler(_InputIter __b, _InputIter __e,
531 const _TraitsT& __traits, _FlagT __flags)
532 : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()),
533 _M_state_store(__flags), _M_flags(__flags)
535 _StateSeqT __r(_M_state_store,
536 _M_state_store._M_insert_subexpr_begin());
538 if (!_M_stack.empty())
540 __r._M_append(_M_stack.top());
543 __r._M_append(_M_state_store._M_insert_subexpr_end());
544 __r._M_append(_M_state_store._M_insert_accept());
547 template<typename _InputIter, typename _CharT, typename _TraitsT>
549 _Compiler<_InputIter, _CharT, _TraitsT>::
550 _M_match_token(_Compiler<_InputIter, _CharT, _TraitsT>::_TokenT token)
552 if (token == _M_scanner._M_token())
554 _M_cur_value = _M_scanner._M_value();
555 _M_scanner._M_advance();
561 template<typename _InputIter, typename _CharT, typename _TraitsT>
563 _Compiler<_InputIter, _CharT, _TraitsT>::
566 this->_M_alternative();
567 if (_M_match_token(_ScannerT::_S_token_or))
569 _StateSeqT __alt1 = _M_stack.top(); _M_stack.pop();
570 this->_M_disjunction();
571 _StateSeqT __alt2 = _M_stack.top(); _M_stack.pop();
572 _M_stack.push(_StateSeqT(__alt1, __alt2));
576 template<typename _InputIter, typename _CharT, typename _TraitsT>
578 _Compiler<_InputIter, _CharT, _TraitsT>::
583 _StateSeqT __re = _M_stack.top(); _M_stack.pop();
584 this->_M_alternative();
585 if (!_M_stack.empty())
587 __re._M_append(_M_stack.top());
594 template<typename _InputIter, typename _CharT, typename _TraitsT>
596 _Compiler<_InputIter, _CharT, _TraitsT>::
599 if (this->_M_assertion())
603 this->_M_quantifier();
609 template<typename _InputIter, typename _CharT, typename _TraitsT>
611 _Compiler<_InputIter, _CharT, _TraitsT>::
614 if (_M_match_token(_ScannerT::_S_token_line_begin))
616 // __m.push(_Matcher::_S_opcode_line_begin);
619 if (_M_match_token(_ScannerT::_S_token_line_end))
621 // __m.push(_Matcher::_S_opcode_line_end);
624 if (_M_match_token(_ScannerT::_S_token_word_begin))
626 // __m.push(_Matcher::_S_opcode_word_begin);
629 if (_M_match_token(_ScannerT::_S_token_word_end))
631 // __m.push(_Matcher::_S_opcode_word_end);
637 template<typename _InputIter, typename _CharT, typename _TraitsT>
639 _Compiler<_InputIter, _CharT, _TraitsT>::
642 if (_M_match_token(_ScannerT::_S_token_closure0))
644 if (_M_stack.empty())
645 __throw_regex_error(regex_constants::error_badrepeat);
646 _StateSeqT __r(_M_stack.top(), -1);
647 __r._M_append(__r._M_front());
652 if (_M_match_token(_ScannerT::_S_token_closure1))
654 if (_M_stack.empty())
655 __throw_regex_error(regex_constants::error_badrepeat);
656 _StateSeqT __r(_M_state_store,
658 _M_insert_alt(_S_invalid_state_id,
659 _M_stack.top()._M_front()));
660 _M_stack.top()._M_append(__r);
663 if (_M_match_token(_ScannerT::_S_token_opt))
665 if (_M_stack.empty())
666 __throw_regex_error(regex_constants::error_badrepeat);
667 _StateSeqT __r(_M_stack.top(), -1);
672 if (_M_match_token(_ScannerT::_S_token_interval_begin))
674 if (_M_stack.empty())
675 __throw_regex_error(regex_constants::error_badrepeat);
676 if (!_M_match_token(_ScannerT::_S_token_dup_count))
677 __throw_regex_error(regex_constants::error_badbrace);
678 _StateSeqT __r(_M_stack.top());
679 int __min_rep = _M_cur_int_value(10);
680 for (int __i = 1; __i < __min_rep; ++__i)
681 _M_stack.top()._M_append(__r._M_clone());
682 if (_M_match_token(_ScannerT::_S_token_comma))
683 if (_M_match_token(_ScannerT::_S_token_dup_count))
685 int __n = _M_cur_int_value(10) - __min_rep;
687 __throw_regex_error(regex_constants::error_badbrace);
688 for (int __i = 0; __i < __n; ++__i)
690 _StateSeqT __r(_M_state_store,
692 _M_insert_alt(_S_invalid_state_id,
693 _M_stack.top()._M_front()));
694 _M_stack.top()._M_append(__r);
699 _StateSeqT __r(_M_stack.top(), -1);
700 __r._M_push_back(__r._M_front());
704 if (!_M_match_token(_ScannerT::_S_token_interval_end))
705 __throw_regex_error(regex_constants::error_brace);
710 template<typename _InputIter, typename _CharT, typename _TraitsT>
712 _Compiler<_InputIter, _CharT, _TraitsT>::
715 if (_M_match_token(_ScannerT::_S_token_anychar))
718 __any_matcher = [](_CharT) -> bool
721 _M_stack.push(_StateSeqT(_M_state_store,
722 _M_state_store._M_insert_matcher
726 if (_M_match_token(_ScannerT::_S_token_ord_char))
728 auto __c = _M_cur_value[0];
729 __detail::_Matcher<_CharT> f;
730 if (_M_flags & regex_constants::icase)
732 auto __traits = this->_M_traits;
733 __c = __traits.translate_nocase(__c);
734 f = [__traits, __c](_CharT __ch) -> bool
735 { return __traits.translate_nocase(__ch) == __c; };
738 f = [__c](_CharT __ch) -> bool
739 { return __ch == __c; };
741 _M_stack.push(_StateSeqT(_M_state_store,
742 _M_state_store._M_insert_matcher(f)));
745 if (_M_match_token(_ScannerT::_S_token_backref))
747 // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
748 _M_state_store._M_set_backref(true);
751 if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
753 int __mark = _M_state_store._M_sub_count();
754 _StateSeqT __r(_M_state_store,
756 _M_insert_subexpr_begin());
757 this->_M_disjunction();
758 if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
759 __throw_regex_error(regex_constants::error_paren);
760 if (!_M_stack.empty())
762 __r._M_append(_M_stack.top());
765 __r._M_append(_M_state_store._M_insert_subexpr_end());
769 return _M_bracket_expression();
772 template<typename _InputIter, typename _CharT, typename _TraitsT>
774 _Compiler<_InputIter, _CharT, _TraitsT>::
775 _M_bracket_expression()
778 _M_match_token(_ScannerT::_S_token_bracket_inverse_begin);
779 if (!(__inverse || _M_match_token(_ScannerT::_S_token_bracket_begin)))
781 _BMatcherT __matcher( __inverse, _M_traits, _M_flags);
782 // special case: only if _not_ chr first after
783 // '[' or '[^' or if ECMAscript
784 if (!_M_bracket_list(__matcher) // list is empty
785 && !(_M_flags & regex_constants::ECMAScript))
786 __throw_regex_error(regex_constants::error_brack);
787 _M_stack.push(_StateSeqT(_M_state_store,
788 _M_state_store._M_insert_matcher(__matcher)));
792 template<typename _InputIter, typename _CharT, typename _TraitsT>
793 bool // list is non-empty
794 _Compiler<_InputIter, _CharT, _TraitsT>::
795 _M_bracket_list(_BMatcherT& __matcher)
797 if (_M_match_token(_ScannerT::_S_token_bracket_end))
799 _M_expression_term(__matcher);
800 _M_bracket_list(__matcher);
804 template<typename _InputIter, typename _CharT, typename _TraitsT>
806 _Compiler<_InputIter, _CharT, _TraitsT>::
807 _M_expression_term(_BMatcherT& __matcher)
809 if (_M_match_token(_ScannerT::_S_token_collsymbol))
811 __matcher._M_add_collating_element(_M_cur_value);
814 if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
816 __matcher._M_add_equivalence_class(_M_cur_value);
819 if (_M_match_token(_ScannerT::_S_token_char_class_name))
821 __matcher._M_add_character_class(_M_cur_value);
824 if (_M_match_token(_ScannerT::_S_token_collelem_single)) // [a
826 auto __ch = _M_cur_value[0];
827 if (_M_match_token(_ScannerT::_S_token_dash)) // [a-
829 // If the dash is the last character in the bracket expression,
830 // it is not special.
831 if (_M_scanner._M_token() == _ScannerT::_S_token_bracket_end)
832 __matcher._M_add_char(_M_cur_value[0]); // [a-] <=> [a\-]
835 if (!_M_match_token(_ScannerT::_S_token_collelem_single))
836 __throw_regex_error(regex_constants::error_range);
837 __matcher._M_make_range(__ch, _M_cur_value[0]);
841 __matcher._M_add_char(__ch);
844 __throw_regex_error(regex_constants::error_brack);
847 template<typename _InputIter, typename _CharT, typename _TraitsT>
849 _Compiler<_InputIter, _CharT, _TraitsT>::
850 _M_cur_int_value(int __radix)
853 for (typename _StringT::size_type __i = 0;
854 __i < _M_cur_value.length(); ++__i)
855 __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix);
859 template<typename _CharT, typename _TraitsT>
860 bool _BracketMatcher<_CharT, _TraitsT>::
861 operator()(_CharT __ch) const
864 if (_M_flags & regex_constants::collate)
866 __ch = _M_traits.translate_nocase(__ch);
868 __ch = _M_traits.translate(__ch);
871 for (auto __c : _M_char_set)
877 if (!__ret && _M_traits.isctype(__oldch, _M_class_set))
881 _StringT __s = _M_get_str(__ch);
882 for (auto& __it : _M_range_set)
883 if (__it.first <= __s && __s <= __it.second)
889 if (_M_is_non_matching)
894 _GLIBCXX_END_NAMESPACE_VERSION
895 } // namespace __detail