1 ///////////////////////////////////////////////////////////////////////////////
2 /// \file regex_compiler.hpp
3 /// Contains the definition of regex_compiler, a factory for building regex objects
6 // Copyright 2008 Eric Niebler. Distributed under the Boost
7 // Software License, Version 1.0. (See accompanying file
8 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
13 // MS compatible compilers support #pragma once
14 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
19 #include <boost/assert.hpp>
20 #include <boost/next_prior.hpp>
21 #include <boost/range/begin.hpp>
22 #include <boost/range/end.hpp>
23 #include <boost/mpl/assert.hpp>
24 #include <boost/throw_exception.hpp>
25 #include <boost/type_traits/is_same.hpp>
26 #include <boost/type_traits/is_pointer.hpp>
27 #include <boost/utility/enable_if.hpp>
28 #include <boost/iterator/iterator_traits.hpp>
29 #include <boost/xpressive/basic_regex.hpp>
30 #include <boost/xpressive/detail/dynamic/parser.hpp>
31 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
32 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
33 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
34 #include <boost/xpressive/detail/core/linker.hpp>
35 #include <boost/xpressive/detail/core/optimize.hpp>
37 namespace boost
{ namespace xpressive
40 ///////////////////////////////////////////////////////////////////////////////
43 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
45 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string
46 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
47 /// after which all basic_regex objects created with that regex_compiler object will use that locale.
48 /// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the
49 /// compile() method to construct a basic_regex object, passing it the string representing the regular
50 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
51 /// objects compiled from the same string will have different regex_id's.
52 template<typename BidiIter
, typename RegexTraits
, typename CompilerTraits
>
55 typedef BidiIter iterator_type
;
56 typedef typename iterator_value
<BidiIter
>::type char_type
;
57 typedef regex_constants::syntax_option_type flag_type
;
58 typedef RegexTraits traits_type
;
59 typedef typename
traits_type::string_type string_type
;
60 typedef typename
traits_type::locale_type locale_type
;
61 typedef typename
traits_type::char_class_type char_class_type
;
63 explicit regex_compiler(RegexTraits
const &traits
= RegexTraits())
65 , hidden_mark_count_(0)
71 this->upper_
= lookup_classname(this->rxtraits(), "upper");
74 ///////////////////////////////////////////////////////////////////////////
76 /// Specify the locale to be used by a regex_compiler.
78 /// \param loc The locale that this regex_compiler should use.
79 /// \return The previous locale.
80 locale_type
imbue(locale_type loc
)
82 locale_type oldloc
= this->traits_
.imbue(loc
);
83 this->upper_
= lookup_classname(this->rxtraits(), "upper");
87 ///////////////////////////////////////////////////////////////////////////
89 /// Get the locale used by a regex_compiler.
91 /// \return The locale used by this regex_compiler.
92 locale_type
getloc() const
94 return this->traits_
.getloc();
97 ///////////////////////////////////////////////////////////////////////////
99 /// Builds a basic_regex object from a range of characters.
101 /// \param begin The beginning of a range of characters representing the
102 /// regular expression to compile.
103 /// \param end The end of a range of characters representing the
104 /// regular expression to compile.
105 /// \param flags Optional bitmask that determines how the pat string is
106 /// interpreted. (See syntax_option_type.)
107 /// \return A basic_regex object corresponding to the regular expression
108 /// represented by the character range.
109 /// \pre InputIter is a model of the InputIterator concept.
110 /// \pre [begin,end) is a valid range.
111 /// \pre The range of characters specified by [begin,end) contains a
112 /// valid string-based representation of a regular expression.
113 /// \throw regex_error when the range of characters has invalid regular
114 /// expression syntax.
115 template<typename InputIter
>
116 basic_regex
<BidiIter
>
117 compile(InputIter begin
, InputIter end
, flag_type flags
= regex_constants::ECMAScript
)
119 typedef typename iterator_category
<InputIter
>::type category
;
120 return this->compile_(begin
, end
, flags
, category());
125 template<typename InputRange
>
126 typename disable_if
<is_pointer
<InputRange
>, basic_regex
<BidiIter
> >::type
127 compile(InputRange
const &pat
, flag_type flags
= regex_constants::ECMAScript
)
129 return this->compile(boost::begin(pat
), boost::end(pat
), flags
);
134 basic_regex
<BidiIter
>
135 compile(char_type
const *begin
, flag_type flags
= regex_constants::ECMAScript
)
137 BOOST_ASSERT(0 != begin
);
138 char_type
const *end
= begin
+ std::char_traits
<char_type
>::length(begin
);
139 return this->compile(begin
, end
, flags
);
144 basic_regex
<BidiIter
> compile(char_type
const *begin
, std::size_t size
, flag_type flags
)
146 BOOST_ASSERT(0 != begin
);
147 char_type
const *end
= begin
+ size
;
148 return this->compile(begin
, end
, flags
);
151 ///////////////////////////////////////////////////////////////////////////
153 /// Return a reference to the named regular expression. If no such named
154 /// regular expression exists, create a new regular expression and return
155 /// a reference to it.
157 /// \param name A std::string containing the name of the regular expression.
158 /// \pre The string is not empty.
159 /// \throw bad_alloc on allocation failure.
160 basic_regex
<BidiIter
> &operator [](string_type
const &name
)
162 BOOST_ASSERT(!name
.empty());
163 return this->rules_
[name
];
168 basic_regex
<BidiIter
> const &operator [](string_type
const &name
) const
170 BOOST_ASSERT(!name
.empty());
171 return this->rules_
[name
];
176 typedef detail::escape_value
<char_type
, char_class_type
> escape_value
;
177 typedef detail::alternate_matcher
<detail::alternates_vector
<BidiIter
>, RegexTraits
> alternate_matcher
;
179 ///////////////////////////////////////////////////////////////////////////
182 template<typename FwdIter
>
183 basic_regex
<BidiIter
> compile_(FwdIter begin
, FwdIter end
, flag_type flags
, std::forward_iterator_tag
)
185 BOOST_MPL_ASSERT((is_same
<char_type
, typename iterator_value
<FwdIter
>::type
>));
186 using namespace regex_constants
;
188 this->traits_
.flags(flags
);
190 basic_regex
<BidiIter
> rextmp
, *prex
= &rextmp
;
193 // Check if this regex is a named rule:
195 if(token_group_begin
== this->traits_
.get_token(tmp
, end
) &&
196 BOOST_XPR_ENSURE_(tmp
!= end
, error_paren
, "mismatched parenthesis") &&
197 token_rule_assign
== this->traits_
.get_group_type(tmp
, end
, name
))
202 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
204 , "mismatched parenthesis"
206 prex
= &this->rules_
[name
];
209 this->self_
= detail::core_access
<BidiIter
>::get_regex_impl(*prex
);
211 // at the top level, a regex is a sequence of alternates
212 detail::sequence
<BidiIter
> seq
= this->parse_alternates(begin
, end
);
213 BOOST_XPR_ENSURE_(begin
== end
, error_paren
, "mismatched parenthesis");
215 // terminate the sequence
216 seq
+= detail::make_dynamic
<BidiIter
>(detail::end_matcher());
218 // bundle the regex information into a regex_impl object
219 detail::common_compile(seq
.xpr().matchable(), *this->self_
, this->rxtraits());
221 this->self_
->traits_
= new detail::traits_holder
<RegexTraits
>(this->rxtraits());
222 this->self_
->mark_count_
= this->mark_count_
;
223 this->self_
->hidden_mark_count_
= this->hidden_mark_count_
;
225 // References changed, update dependencies.
226 this->self_
->tracking_update();
231 ///////////////////////////////////////////////////////////////////////////
234 template<typename InputIter
>
235 basic_regex
<BidiIter
> compile_(InputIter begin
, InputIter end
, flag_type flags
, std::input_iterator_tag
)
237 string_type
pat(begin
, end
);
238 return this->compile_(boost::begin(pat
), boost::end(pat
), flags
, std::forward_iterator_tag());
241 ///////////////////////////////////////////////////////////////////////////
246 this->mark_count_
= 0;
247 this->hidden_mark_count_
= 0;
248 this->traits_
.flags(regex_constants::ECMAScript
);
251 ///////////////////////////////////////////////////////////////////////////
254 traits_type
&rxtraits()
256 return this->traits_
.traits();
259 ///////////////////////////////////////////////////////////////////////////
262 traits_type
const &rxtraits() const
264 return this->traits_
.traits();
267 ///////////////////////////////////////////////////////////////////////////
270 template<typename FwdIter
>
271 detail::sequence
<BidiIter
> parse_alternates(FwdIter
&begin
, FwdIter end
)
273 using namespace regex_constants
;
276 detail::sequence
<BidiIter
> seq
;
281 seq
= this->parse_sequence(tmp
, end
);
284 seq
= detail::make_dynamic
<BidiIter
>(alternate_matcher()) | seq
;
287 seq
|= this->parse_sequence(tmp
, end
);
289 while((begin
= tmp
) != end
&& token_alternate
== this->traits_
.get_token(tmp
, end
));
294 ///////////////////////////////////////////////////////////////////////////
297 template<typename FwdIter
>
298 detail::sequence
<BidiIter
> parse_group(FwdIter
&begin
, FwdIter end
)
300 using namespace regex_constants
;
303 bool lookahead
= false;
304 bool lookbehind
= false;
305 bool negative
= false;
308 detail::sequence
<BidiIter
> seq
, seq_end
;
309 FwdIter tmp
= FwdIter();
311 syntax_option_type old_flags
= this->traits_
.flags();
313 switch(this->traits_
.get_group_type(begin
, end
, name
))
316 // Don't process empty groups like (?:) or (?i)
317 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
318 if(token_group_end
== this->traits_
.get_token(tmp
= begin
, end
))
320 return this->parse_atom(begin
= tmp
, end
);
324 case token_negative_lookahead
:
325 negative
= true; // fall-through
326 case token_positive_lookahead
:
330 case token_negative_lookbehind
:
331 negative
= true; // fall-through
332 case token_positive_lookbehind
:
336 case token_independent_sub_expression
:
341 while(BOOST_XPR_ENSURE_(begin
!= end
, error_paren
, "mismatched parenthesis"))
343 switch(this->traits_
.get_token(begin
, end
))
345 case token_group_end
: return this->parse_atom(begin
, end
);
346 case token_escape
: BOOST_XPR_ENSURE_(begin
!= end
, error_escape
, "incomplete escape sequence");
347 case token_literal
: ++begin
;
356 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
358 , "mismatched parenthesis"
360 return detail::make_dynamic
<BidiIter
>(detail::regex_byref_matcher
<BidiIter
>(this->self_
));
362 case token_rule_assign
:
363 BOOST_THROW_EXCEPTION(
364 regex_error(error_badrule
, "rule assignments must be at the front of the regex")
370 typedef detail::core_access
<BidiIter
> access
;
373 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
375 , "mismatched parenthesis"
377 basic_regex
<BidiIter
> &rex
= this->rules_
[name
];
378 shared_ptr
<detail::regex_impl
<BidiIter
> > impl
= access::get_regex_impl(rex
);
379 this->self_
->track_reference(*impl
);
380 return detail::make_dynamic
<BidiIter
>(detail::regex_byref_matcher
<BidiIter
>(impl
));
383 case token_named_mark
:
384 mark_nbr
= static_cast<int>(++this->mark_count_
);
385 for(std::size_t i
= 0; i
< this->self_
->named_marks_
.size(); ++i
)
387 BOOST_XPR_ENSURE_(this->self_
->named_marks_
[i
].name_
!= name
, error_badmark
, "named mark already exists");
389 this->self_
->named_marks_
.push_back(detail::named_mark
<char_type
>(name
, this->mark_count_
));
390 seq
= detail::make_dynamic
<BidiIter
>(detail::mark_begin_matcher(mark_nbr
));
391 seq_end
= detail::make_dynamic
<BidiIter
>(detail::mark_end_matcher(mark_nbr
));
394 case token_named_mark_ref
:
397 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
399 , "mismatched parenthesis"
401 for(std::size_t i
= 0; i
< this->self_
->named_marks_
.size(); ++i
)
403 if(this->self_
->named_marks_
[i
].name_
== name
)
405 mark_nbr
= static_cast<int>(this->self_
->named_marks_
[i
].mark_nbr_
);
406 return detail::make_backref_xpression
<BidiIter
>
408 mark_nbr
, this->traits_
.flags(), this->rxtraits()
412 BOOST_THROW_EXCEPTION(regex_error(error_badmark
, "invalid named back-reference"));
416 mark_nbr
= static_cast<int>(++this->mark_count_
);
417 seq
= detail::make_dynamic
<BidiIter
>(detail::mark_begin_matcher(mark_nbr
));
418 seq_end
= detail::make_dynamic
<BidiIter
>(detail::mark_end_matcher(mark_nbr
));
423 seq
+= this->parse_alternates(begin
, end
);
427 begin
!= end
&& token_group_end
== this->traits_
.get_token(begin
, end
)
429 , "mismatched parenthesis"
432 typedef detail::shared_matchable
<BidiIter
> xpr_type
;
435 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
436 detail::lookahead_matcher
<xpr_type
> lookahead(seq
.xpr(), negative
, seq
.pure());
437 seq
= detail::make_dynamic
<BidiIter
>(lookahead
);
441 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
442 detail::lookbehind_matcher
<xpr_type
> lookbehind(seq
.xpr(), seq
.width().value(), negative
, seq
.pure());
443 seq
= detail::make_dynamic
<BidiIter
>(lookbehind
);
445 else if(keeper
) // independent sub-expression
447 seq
+= detail::make_independent_end_xpression
<BidiIter
>(seq
.pure());
448 detail::keeper_matcher
<xpr_type
> keeper(seq
.xpr(), seq
.pure());
449 seq
= detail::make_dynamic
<BidiIter
>(keeper
);
452 // restore the modifiers
453 this->traits_
.flags(old_flags
);
457 ///////////////////////////////////////////////////////////////////////////
460 template<typename FwdIter
>
461 detail::sequence
<BidiIter
> parse_charset(FwdIter
&begin
, FwdIter end
)
463 detail::compound_charset
<traits_type
> chset
;
465 // call out to a helper to actually parse the character set
466 detail::parse_charset(begin
, end
, chset
, this->traits_
);
468 return detail::make_charset_xpression
<BidiIter
>
472 , this->traits_
.flags()
476 ///////////////////////////////////////////////////////////////////////////
479 template<typename FwdIter
>
480 detail::sequence
<BidiIter
> parse_atom(FwdIter
&begin
, FwdIter end
)
482 using namespace regex_constants
;
483 escape_value esc
= { 0, 0, 0, detail::escape_char
};
484 FwdIter old_begin
= begin
;
486 switch(this->traits_
.get_token(begin
, end
))
489 return detail::make_literal_xpression
<BidiIter
>
491 this->parse_literal(begin
, end
), this->traits_
.flags(), this->rxtraits()
495 return detail::make_any_xpression
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
497 case token_assert_begin_sequence
:
498 return detail::make_dynamic
<BidiIter
>(detail::assert_bos_matcher());
500 case token_assert_end_sequence
:
501 return detail::make_dynamic
<BidiIter
>(detail::assert_eos_matcher());
503 case token_assert_begin_line
:
504 return detail::make_assert_begin_line
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
506 case token_assert_end_line
:
507 return detail::make_assert_end_line
<BidiIter
>(this->traits_
.flags(), this->rxtraits());
509 case token_assert_word_boundary
:
510 return detail::make_assert_word
<BidiIter
>(detail::word_boundary
<mpl::true_
>(), this->rxtraits());
512 case token_assert_not_word_boundary
:
513 return detail::make_assert_word
<BidiIter
>(detail::word_boundary
<mpl::false_
>(), this->rxtraits());
515 case token_assert_word_begin
:
516 return detail::make_assert_word
<BidiIter
>(detail::word_begin(), this->rxtraits());
518 case token_assert_word_end
:
519 return detail::make_assert_word
<BidiIter
>(detail::word_end(), this->rxtraits());
522 esc
= this->parse_escape(begin
, end
);
525 case detail::escape_mark
:
526 return detail::make_backref_xpression
<BidiIter
>
528 esc
.mark_nbr_
, this->traits_
.flags(), this->rxtraits()
530 case detail::escape_char
:
531 return detail::make_char_xpression
<BidiIter
>
533 esc
.ch_
, this->traits_
.flags(), this->rxtraits()
535 case detail::escape_class
:
536 return detail::make_posix_charset_xpression
<BidiIter
>
539 , this->is_upper_(*begin
++)
540 , this->traits_
.flags()
545 case token_group_begin
:
546 return this->parse_group(begin
, end
);
548 case token_charset_begin
:
549 return this->parse_charset(begin
, end
);
551 case token_invalid_quantifier
:
552 BOOST_THROW_EXCEPTION(regex_error(error_badrepeat
, "quantifier not expected"));
555 case token_quote_meta_begin
:
556 return detail::make_literal_xpression
<BidiIter
>
558 this->parse_quote_meta(begin
, end
), this->traits_
.flags(), this->rxtraits()
561 case token_quote_meta_end
:
562 BOOST_THROW_EXCEPTION(
565 , "found quote-meta end without corresponding quote-meta begin"
570 case token_end_of_pattern
:
578 return detail::sequence
<BidiIter
>();
581 ///////////////////////////////////////////////////////////////////////////
584 template<typename FwdIter
>
585 detail::sequence
<BidiIter
> parse_quant(FwdIter
&begin
, FwdIter end
)
587 BOOST_ASSERT(begin
!= end
);
588 detail::quant_spec spec
= { 0, 0, false, &this->hidden_mark_count_
};
589 detail::sequence
<BidiIter
> seq
= this->parse_atom(begin
, end
);
591 // BUGBUG this doesn't handle the degenerate (?:)+ correctly
592 if(!seq
.empty() && begin
!= end
&& detail::quant_none
!= seq
.quant())
594 if(this->traits_
.get_quant_spec(begin
, end
, spec
))
596 BOOST_ASSERT(spec
.min_
<= spec
.max_
);
598 if(0 == spec
.max_
) // quant {0,0} is degenerate -- matches nothing.
600 seq
= this->parse_quant(begin
, end
);
612 ///////////////////////////////////////////////////////////////////////////
615 template<typename FwdIter
>
616 detail::sequence
<BidiIter
> parse_sequence(FwdIter
&begin
, FwdIter end
)
618 detail::sequence
<BidiIter
> seq
;
622 detail::sequence
<BidiIter
> seq_quant
= this->parse_quant(begin
, end
);
624 // did we find a quantified atom?
625 if(seq_quant
.empty())
628 // chain it to the end of the xpression sequence
635 ///////////////////////////////////////////////////////////////////////////
637 // scan ahead looking for char literals to be globbed together into a string literal
639 template<typename FwdIter
>
640 string_type
parse_literal(FwdIter
&begin
, FwdIter end
)
642 using namespace regex_constants
;
643 BOOST_ASSERT(begin
!= end
);
644 BOOST_ASSERT(token_literal
== this->traits_
.get_token(begin
, end
));
645 escape_value esc
= { 0, 0, 0, detail::escape_char
};
646 string_type
literal(1, *begin
);
648 for(FwdIter prev
= begin
, tmp
= ++begin
; begin
!= end
; prev
= begin
, begin
= tmp
)
650 detail::quant_spec spec
= { 0, 0, false, &this->hidden_mark_count_
};
651 if(this->traits_
.get_quant_spec(tmp
, end
, spec
))
653 if(literal
.size() != 1)
656 literal
.erase(boost::prior(literal
.end()));
660 else switch(this->traits_
.get_token(tmp
, end
))
663 esc
= this->parse_escape(tmp
, end
);
664 if(detail::escape_char
!= esc
.type_
) return literal
;
665 literal
.insert(literal
.end(), esc
.ch_
);
668 literal
.insert(literal
.end(), *tmp
++);
678 ///////////////////////////////////////////////////////////////////////////
680 // scan ahead looking for char literals to be globbed together into a string literal
682 template<typename FwdIter
>
683 string_type
parse_quote_meta(FwdIter
&begin
, FwdIter end
)
685 using namespace regex_constants
;
686 FwdIter old_begin
= begin
, old_end
;
687 while(end
!= (old_end
= begin
))
689 switch(this->traits_
.get_token(begin
, end
))
691 case token_quote_meta_end
: return string_type(old_begin
, old_end
);
692 case token_escape
: BOOST_XPR_ENSURE_(begin
!= end
, error_escape
, "incomplete escape sequence");
693 case token_literal
: ++begin
;
697 return string_type(old_begin
, begin
);
700 ///////////////////////////////////////////////////////////////////////////////
703 template<typename FwdIter
>
704 escape_value
parse_escape(FwdIter
&begin
, FwdIter end
)
706 BOOST_XPR_ENSURE_(begin
!= end
, regex_constants::error_escape
, "incomplete escape sequence");
708 // first, check to see if this can be a backreference
709 if(0 < this->rxtraits().value(*begin
, 10))
711 // Parse at most 3 decimal digits.
713 int mark_nbr
= detail::toi(tmp
, end
, this->rxtraits(), 10, 999);
715 // If the resulting number could conceivably be a backref, then it is.
716 if(10 > mark_nbr
|| mark_nbr
<= static_cast<int>(this->mark_count_
))
719 escape_value esc
= {0, mark_nbr
, 0, detail::escape_mark
};
724 // Not a backreference, defer to the parse_escape helper
725 return detail::parse_escape(begin
, end
, this->traits_
);
728 bool is_upper_(char_type ch
) const
730 return 0 != this->upper_
&& this->rxtraits().isctype(ch
, this->upper_
);
733 std::size_t mark_count_
;
734 std::size_t hidden_mark_count_
;
735 CompilerTraits traits_
;
736 typename
RegexTraits::char_class_type upper_
;
737 shared_ptr
<detail::regex_impl
<BidiIter
> > self_
;
738 std::map
<string_type
, basic_regex
<BidiIter
> > rules_
;
741 }} // namespace boost::xpressive