1 ///////////////////////////////////////////////////////////////////////////////
2 /// \file regex_compiler.hpp
3 /// Contains the definition of regex_compiler, a factory for building regex objects
6 // Copyright 2008 Eric Niebler. Distributed under the Boost
7 // Software License, Version 1.0. (See accompanying file
8 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
13 // MS compatible compilers support #pragma once
19 #include <boost/config.hpp>
20 #include <boost/assert.hpp>
21 #include <boost/next_prior.hpp>
22 #include <boost/range/begin.hpp>
23 #include <boost/range/end.hpp>
24 #include <boost/mpl/assert.hpp>
25 #include <boost/throw_exception.hpp>
26 #include <boost/type_traits/is_same.hpp>
27 #include <boost/type_traits/is_pointer.hpp>
28 #include <boost/utility/enable_if.hpp>
29 #include <boost/iterator/iterator_traits.hpp>
30 #include <boost/xpressive/basic_regex.hpp>
31 #include <boost/xpressive/detail/dynamic/parser.hpp>
32 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
33 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
34 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
35 #include <boost/xpressive/detail/core/linker.hpp>
36 #include <boost/xpressive/detail/core/optimize.hpp>
38 namespace boost
{ namespace xpressive
41 ///////////////////////////////////////////////////////////////////////////////
44 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
46 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string
47 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
48 /// after which all basic_regex objects created with that regex_compiler object will use that locale.
49 /// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the
50 /// compile() method to construct a basic_regex object, passing it the string representing the regular
51 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
52 /// objects compiled from the same string will have different regex_id's.
53 template<typename BidiIter
, typename RegexTraits
, typename CompilerTraits
>
56 typedef BidiIter iterator_type
;
57 typedef typename iterator_value
<BidiIter
>::type char_type
;
58 typedef regex_constants::syntax_option_type flag_type
;
59 typedef RegexTraits traits_type
;
60 typedef typename
traits_type::string_type string_type
;
61 typedef typename
traits_type::locale_type locale_type
;
62 typedef typename
traits_type::char_class_type char_class_type
;
64 explicit regex_compiler(RegexTraits
const &traits
= RegexTraits())
66 , hidden_mark_count_(0)
72 this->upper_
= lookup_classname(this->rxtraits(), "upper");
75 ///////////////////////////////////////////////////////////////////////////
77 /// Specify the locale to be used by a regex_compiler.
79 /// \param loc The locale that this regex_compiler should use.
80 /// \return The previous locale.
81 locale_type
imbue(locale_type loc
)
83 locale_type oldloc
= this->traits_
.imbue(loc
);
84 this->upper_
= lookup_classname(this->rxtraits(), "upper");
88 ///////////////////////////////////////////////////////////////////////////
90 /// Get the locale used by a regex_compiler.
92 /// \return The locale used by this regex_compiler.
93 locale_type
getloc() const
95 return this->traits_
.getloc();
98 ///////////////////////////////////////////////////////////////////////////
100 /// Builds a basic_regex object from a range of characters.
102 /// \param begin The beginning of a range of characters representing the
103 /// regular expression to compile.
104 /// \param end The end of a range of characters representing the
105 /// regular expression to compile.
106 /// \param flags Optional bitmask that determines how the pat string is
107 /// interpreted. (See syntax_option_type.)
108 /// \return A basic_regex object corresponding to the regular expression
109 /// represented by the character range.
110 /// \pre InputIter is a model of the InputIterator concept.
111 /// \pre [begin,end) is a valid range.
112 /// \pre The range of characters specified by [begin,end) contains a
113 /// valid string-based representation of a regular expression.
114 /// \throw regex_error when the range of characters has invalid regular
115 /// expression syntax.
116 template<typename InputIter
>
117 basic_regex
<BidiIter
>
118 compile(InputIter begin
, InputIter end
, flag_type flags
= regex_constants::ECMAScript
)
120 typedef typename iterator_category
<InputIter
>::type category
;
121 return this->compile_(begin
, end
, flags
, category());
126 template<typename InputRange
>
127 typename disable_if
<is_pointer
<InputRange
>, basic_regex
<BidiIter
> >::type
128 compile(InputRange
const &pat
, flag_type flags
= regex_constants::ECMAScript
)
130 return this->compile(boost::begin(pat
), boost::end(pat
), flags
);
135 basic_regex
<BidiIter
>
136 compile(char_type
const *begin
, flag_type flags
= regex_constants::ECMAScript
)
138 BOOST_ASSERT(0 != begin
);
139 char_type
const *end
= begin
+ std::char_traits
<char_type
>::length(begin
);
140 return this->compile(begin
, end
, flags
);
145 basic_regex
<BidiIter
> compile(char_type
const *begin
, std::size_t size
, flag_type flags
)
147 BOOST_ASSERT(0 != begin
);
148 char_type
const *end
= begin
+ size
;
149 return this->compile(begin
, end
, flags
);
152 ///////////////////////////////////////////////////////////////////////////
154 /// Return a reference to the named regular expression. If no such named
155 /// regular expression exists, create a new regular expression and return
156 /// a reference to it.
158 /// \param name A std::string containing the name of the regular expression.
159 /// \pre The string is not empty.
160 /// \throw bad_alloc on allocation failure.
161 basic_regex
<BidiIter
> &operator [](string_type
const &name
)
163 BOOST_ASSERT(!name
.empty());
164 return this->rules_
[name
];
169 basic_regex
<BidiIter
> const &operator [](string_type
const &name
) const
171 BOOST_ASSERT(!name
.empty());
172 return this->rules_
[name
];
177 typedef detail::escape_value
<char_type
, char_class_type
> escape_value
;
178 typedef detail::alternate_matcher
<detail::alternates_vector
<BidiIter
>, RegexTraits
> alternate_matcher
;
180 ///////////////////////////////////////////////////////////////////////////
183 template<typename FwdIter
>
184 basic_regex
<BidiIter
> compile_(FwdIter begin
, FwdIter end
, flag_type flags
, std::forward_iterator_tag
)
186 BOOST_MPL_ASSERT((is_same
<char_type
, typename iterator_value
<FwdIter
>::type
>));
187 using namespace regex_constants
;
189 this->traits_
.flags(flags
);
191 basic_regex
<BidiIter
> rextmp
, *prex
= &rextmp
;
194 // Check if this regex is a named rule:
196 if(token_group_begin
== this->traits_
.get_token(tmp
, end
) &&
197 BOOST_XPR_ENSURE_(tmp
!= end
, error_paren
, "mismatched parenthesis") &&
198 token_rule_assign
== this->traits_
.get_group_type(tmp
, end
, name
))
203 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
205 , "mismatched parenthesis"
207 prex
= &this->rules_
[name
];
210 this->self_
= detail::core_access
<BidiIter
>::get_regex_impl(*prex
);
212 // at the top level, a regex is a sequence of alternates
213 detail::sequence
<BidiIter
> seq
= this->parse_alternates(begin
, end
);
214 BOOST_XPR_ENSURE_(begin
== end
, error_paren
, "mismatched parenthesis");
216 // terminate the sequence
217 seq
+= detail::make_dynamic
<BidiIter
>(detail::end_matcher());
219 // bundle the regex information into a regex_impl object
220 detail::common_compile(seq
.xpr().matchable(), *this->self_
, this->rxtraits());
222 this->self_
->traits_
= new detail::traits_holder
<RegexTraits
>(this->rxtraits());
223 this->self_
->mark_count_
= this->mark_count_
;
224 this->self_
->hidden_mark_count_
= this->hidden_mark_count_
;
226 // References changed, update dependencies.
227 this->self_
->tracking_update();
232 ///////////////////////////////////////////////////////////////////////////
235 template<typename InputIter
>
236 basic_regex
<BidiIter
> compile_(InputIter begin
, InputIter end
, flag_type flags
, std::input_iterator_tag
)
238 string_type
pat(begin
, end
);
239 return this->compile_(boost::begin(pat
), boost::end(pat
), flags
, std::forward_iterator_tag());
242 ///////////////////////////////////////////////////////////////////////////
247 this->mark_count_
= 0;
248 this->hidden_mark_count_
= 0;
249 this->traits_
.flags(regex_constants::ECMAScript
);
252 ///////////////////////////////////////////////////////////////////////////
255 traits_type
&rxtraits()
257 return this->traits_
.traits();
260 ///////////////////////////////////////////////////////////////////////////
263 traits_type
const &rxtraits() const
265 return this->traits_
.traits();
268 ///////////////////////////////////////////////////////////////////////////
271 template<typename FwdIter
>
272 detail::sequence
<BidiIter
> parse_alternates(FwdIter
&begin
, FwdIter end
)
274 using namespace regex_constants
;
277 detail::sequence
<BidiIter
> seq
;
282 seq
= this->parse_sequence(tmp
, end
);
285 seq
= detail::make_dynamic
<BidiIter
>(alternate_matcher()) | seq
;
288 seq
|= this->parse_sequence(tmp
, end
);
290 while((begin
= tmp
) != end
&& token_alternate
== this->traits_
.get_token(tmp
, end
));
295 ///////////////////////////////////////////////////////////////////////////
298 template<typename FwdIter
>
299 detail::sequence
<BidiIter
> parse_group(FwdIter
&begin
, FwdIter end
)
301 using namespace regex_constants
;
304 bool lookahead
= false;
305 bool lookbehind
= false;
306 bool negative
= false;
309 detail::sequence
<BidiIter
> seq
, seq_end
;
310 FwdIter tmp
= FwdIter();
312 syntax_option_type old_flags
= this->traits_
.flags();
314 switch(this->traits_
.get_group_type(begin
, end
, name
))
317 // Don't process empty groups like (?:) or (?i)
318 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
319 if(token_group_end
== this->traits_
.get_token(tmp
= begin
, end
))
321 return this->parse_atom(begin
= tmp
, end
);
325 case token_negative_lookahead
:
328 case token_positive_lookahead
:
332 case token_negative_lookbehind
:
335 case token_positive_lookbehind
:
339 case token_independent_sub_expression
:
344 while(BOOST_XPR_ENSURE_(begin
!= end
, error_paren
, "mismatched parenthesis"))
346 switch(this->traits_
.get_token(begin
, end
))
348 case token_group_end
:
349 return this->parse_atom(begin
, end
);
351 BOOST_XPR_ENSURE_(begin
!= end
, error_escape
, "incomplete escape sequence");
365 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
367 , "mismatched parenthesis"
369 return detail::make_dynamic
<BidiIter
>(detail::regex_byref_matcher
<BidiIter
>(this->self_
));
371 case token_rule_assign
:
372 BOOST_THROW_EXCEPTION(
373 regex_error(error_badrule
, "rule assignments must be at the front of the regex")
379 typedef detail::core_access
<BidiIter
> access
;
382 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
384 , "mismatched parenthesis"
386 basic_regex
<BidiIter
> &rex
= this->rules_
[name
];
387 shared_ptr
<detail::regex_impl
<BidiIter
> > impl
= access::get_regex_impl(rex
);
388 this->self_
->track_reference(*impl
);
389 return detail::make_dynamic
<BidiIter
>(detail::regex_byref_matcher
<BidiIter
>(impl
));
392 case token_named_mark
:
393 mark_nbr
= static_cast<int>(++this->mark_count_
);
394 for(std::size_t i
= 0; i
< this->self_
->named_marks_
.size(); ++i
)
396 BOOST_XPR_ENSURE_(this->self_
->named_marks_
[i
].name_
!= name
, error_badmark
, "named mark already exists");
398 this->self_
->named_marks_
.push_back(detail::named_mark
<char_type
>(name
, this->mark_count_
));
399 seq
= detail::make_dynamic
<BidiIter
>(detail::mark_begin_matcher(mark_nbr
));
400 seq_end
= detail::make_dynamic
<BidiIter
>(detail::mark_end_matcher(mark_nbr
));
403 case token_named_mark_ref
:
406 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
408 , "mismatched parenthesis"
410 for(std::size_t i
= 0; i
< this->self_
->named_marks_
.size(); ++i
)
412 if(this->self_
->named_marks_
[i
].name_
== name
)
414 mark_nbr
= static_cast<int>(this->self_
->named_marks_
[i
].mark_nbr_
);
415 return detail::make_backref_xpression
<BidiIter
>
417 mark_nbr
, this->traits_
.flags(), this->rxtraits()
421 BOOST_THROW_EXCEPTION(regex_error(error_badmark
, "invalid named back-reference"));
425 mark_nbr
= static_cast<int>(++this->mark_count_
);
426 seq
= detail::make_dynamic
<BidiIter
>(detail::mark_begin_matcher(mark_nbr
));
427 seq_end
= detail::make_dynamic
<BidiIter
>(detail::mark_end_matcher(mark_nbr
));
432 seq
+= this->parse_alternates(begin
, end
);
436 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
438 , "mismatched parenthesis"
441 typedef detail::shared_matchable
<BidiIter
> xpr_type
;
444 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
445 detail::lookahead_matcher
<xpr_type
> lam(seq
.xpr(), negative
, seq
.pure());
446 seq
= detail::make_dynamic
<BidiIter
>(lam
);
450 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
451 detail::lookbehind_matcher
<xpr_type
> lbm(seq
.xpr(), seq
.width().value(), negative
, seq
.pure());
452 seq
= detail::make_dynamic
<BidiIter
>(lbm
);
454 else if(keeper
) // independent sub-expression
456 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
457 detail::keeper_matcher
<xpr_type
> km(seq
.xpr(), seq
.pure());
458 seq
= detail::make_dynamic
<BidiIter
>(km
);
461 // restore the modifiers
462 this->traits_
.flags(old_flags
);
466 ///////////////////////////////////////////////////////////////////////////
469 template<typename FwdIter
>
470 detail::sequence
<BidiIter
> parse_charset(FwdIter
&begin
, FwdIter end
)
472 detail::compound_charset
<traits_type
> chset
;
474 // call out to a helper to actually parse the character set
475 detail::parse_charset(begin
, end
, chset
, this->traits_
);
477 return detail::make_charset_xpression
<BidiIter
>
481 , this->traits_
.flags()
485 ///////////////////////////////////////////////////////////////////////////
488 template<typename FwdIter
>
489 detail::sequence
<BidiIter
> parse_atom(FwdIter
&begin
, FwdIter end
)
491 using namespace regex_constants
;
492 escape_value esc
= { 0, 0, 0, detail::escape_char
};
493 FwdIter old_begin
= begin
;
495 switch(this->traits_
.get_token(begin
, end
))
498 return detail::make_literal_xpression
<BidiIter
>
500 this->parse_literal(begin
, end
), this->traits_
.flags(), this->rxtraits()
504 return detail::make_any_xpression
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
506 case token_assert_begin_sequence
:
507 return detail::make_dynamic
<BidiIter
>(detail::assert_bos_matcher());
509 case token_assert_end_sequence
:
510 return detail::make_dynamic
<BidiIter
>(detail::assert_eos_matcher());
512 case token_assert_begin_line
:
513 return detail::make_assert_begin_line
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
515 case token_assert_end_line
:
516 return detail::make_assert_end_line
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
518 case token_assert_word_boundary
:
519 return detail::make_assert_word
<BidiIter
>(detail::word_boundary
<mpl::true_
>(), this->rxtraits());
521 case token_assert_not_word_boundary
:
522 return detail::make_assert_word
<BidiIter
>(detail::word_boundary
<mpl::false_
>(), this->rxtraits());
524 case token_assert_word_begin
:
525 return detail::make_assert_word
<BidiIter
>(detail::word_begin(), this->rxtraits());
527 case token_assert_word_end
:
528 return detail::make_assert_word
<BidiIter
>(detail::word_end(), this->rxtraits());
531 esc
= this->parse_escape(begin
, end
);
534 case detail::escape_mark
:
535 return detail::make_backref_xpression
<BidiIter
>
537 esc
.mark_nbr_
, this->traits_
.flags(), this->rxtraits()
539 case detail::escape_char
:
540 return detail::make_char_xpression
<BidiIter
>
542 esc
.ch_
, this->traits_
.flags(), this->rxtraits()
544 case detail::escape_class
:
545 return detail::make_posix_charset_xpression
<BidiIter
>
548 , this->is_upper_(*begin
++)
549 , this->traits_
.flags()
554 case token_group_begin
:
555 return this->parse_group(begin
, end
);
557 case token_charset_begin
:
558 return this->parse_charset(begin
, end
);
560 case token_invalid_quantifier
:
561 BOOST_THROW_EXCEPTION(regex_error(error_badrepeat
, "quantifier not expected"));
564 case token_quote_meta_begin
:
565 return detail::make_literal_xpression
<BidiIter
>
567 this->parse_quote_meta(begin
, end
), this->traits_
.flags(), this->rxtraits()
570 case token_quote_meta_end
:
571 BOOST_THROW_EXCEPTION(
574 , "found quote-meta end without corresponding quote-meta begin"
579 case token_end_of_pattern
:
587 return detail::sequence
<BidiIter
>();
590 ///////////////////////////////////////////////////////////////////////////
593 template<typename FwdIter
>
594 detail::sequence
<BidiIter
> parse_quant(FwdIter
&begin
, FwdIter end
)
596 BOOST_ASSERT(begin
!= end
);
597 detail::quant_spec spec
= { 0, 0, false, &this->hidden_mark_count_
};
598 detail::sequence
<BidiIter
> seq
= this->parse_atom(begin
, end
);
600 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
601 if(!seq
.empty() && begin
!= end
&& detail::quant_none
!= seq
.quant())
603 if(this->traits_
.get_quant_spec(begin
, end
, spec
))
605 BOOST_ASSERT(spec
.min_
<= spec
.max_
);
607 if(0 == spec
.max_
) // quant {0,0} is degenerate -- matches nothing.
609 seq
= this->parse_quant(begin
, end
);
621 ///////////////////////////////////////////////////////////////////////////
624 template<typename FwdIter
>
625 detail::sequence
<BidiIter
> parse_sequence(FwdIter
&begin
, FwdIter end
)
627 detail::sequence
<BidiIter
> seq
;
631 detail::sequence
<BidiIter
> seq_quant
= this->parse_quant(begin
, end
);
633 // did we find a quantified atom?
634 if(seq_quant
.empty())
637 // chain it to the end of the xpression sequence
644 ///////////////////////////////////////////////////////////////////////////
646 // scan ahead looking for char literals to be globbed together into a string literal
648 template<typename FwdIter
>
649 string_type
parse_literal(FwdIter
&begin
, FwdIter end
)
651 using namespace regex_constants
;
652 BOOST_ASSERT(begin
!= end
);
653 BOOST_ASSERT(token_literal
== this->traits_
.get_token(begin
, end
));
654 escape_value esc
= { 0, 0, 0, detail::escape_char
};
655 string_type
literal(1, *begin
);
657 for(FwdIter prev
= begin
, tmp
= ++begin
; begin
!= end
; prev
= begin
, begin
= tmp
)
659 detail::quant_spec spec
= { 0, 0, false, &this->hidden_mark_count_
};
660 if(this->traits_
.get_quant_spec(tmp
, end
, spec
))
662 if(literal
.size() != 1)
665 literal
.erase(boost::prior(literal
.end()));
669 else switch(this->traits_
.get_token(tmp
, end
))
672 esc
= this->parse_escape(tmp
, end
);
673 if(detail::escape_char
!= esc
.type_
) return literal
;
674 literal
.insert(literal
.end(), esc
.ch_
);
677 literal
.insert(literal
.end(), *tmp
++);
687 ///////////////////////////////////////////////////////////////////////////
689 // scan ahead looking for char literals to be globbed together into a string literal
691 template<typename FwdIter
>
692 string_type
parse_quote_meta(FwdIter
&begin
, FwdIter end
)
694 using namespace regex_constants
;
695 FwdIter old_begin
= begin
, old_end
;
696 while(end
!= (old_end
= begin
))
698 switch(this->traits_
.get_token(begin
, end
))
700 case token_quote_meta_end
:
701 return string_type(old_begin
, old_end
);
703 BOOST_XPR_ENSURE_(begin
!= end
, error_escape
, "incomplete escape sequence");
705 case token_invalid_quantifier
:
713 return string_type(old_begin
, begin
);
716 ///////////////////////////////////////////////////////////////////////////////
719 template<typename FwdIter
>
720 escape_value
parse_escape(FwdIter
&begin
, FwdIter end
)
722 BOOST_XPR_ENSURE_(begin
!= end
, regex_constants::error_escape
, "incomplete escape sequence");
724 // first, check to see if this can be a backreference
725 if(0 < this->rxtraits().value(*begin
, 10))
727 // Parse at most 3 decimal digits.
729 int mark_nbr
= detail::toi(tmp
, end
, this->rxtraits(), 10, 999);
731 // If the resulting number could conceivably be a backref, then it is.
732 if(10 > mark_nbr
|| mark_nbr
<= static_cast<int>(this->mark_count_
))
735 escape_value esc
= {0, mark_nbr
, 0, detail::escape_mark
};
740 // Not a backreference, defer to the parse_escape helper
741 return detail::parse_escape(begin
, end
, this->traits_
);
744 bool is_upper_(char_type ch
) const
746 return 0 != this->upper_
&& this->rxtraits().isctype(ch
, this->upper_
);
749 std::size_t mark_count_
;
750 std::size_t hidden_mark_count_
;
751 CompilerTraits traits_
;
752 typename
RegexTraits::char_class_type upper_
;
753 shared_ptr
<detail::regex_impl
<BidiIter
> > self_
;
754 std::map
<string_type
, basic_regex
<BidiIter
> > rules_
;
757 }} // namespace boost::xpressive