1 // Boost token_functions.hpp ------------------------------------------------//
3 // Copyright John R. Bandela 2001.
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE_1_0.txt or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
9 // See http://www.boost.org/libs/tokenizer/ for documentation.
12 // 01 Oct 2004 Joaquin M Lopez Munoz
13 // Workaround for a problem with string::assign in msvc-stlport
14 // 06 Apr 2004 John Bandela
15 // Fixed a bug involving using char_delimiter with a true input iterator
16 // 28 Nov 2003 Robert Zeh and John Bandela
17 // Converted into "fast" functions that avoid using += when
18 // the supplied iterator isn't an input_iterator; based on
19 // some work done at Archelon and a version that was checked into
20 // the boost CVS for a short period of time.
21 // 20 Feb 2002 John Maddock
22 // Removed using namespace std declarations and added
23 // workaround for BOOST_NO_STDC_NAMESPACE (the library
24 // can be safely mixed with regex).
25 // 06 Feb 2002 Jeremy Siek
26 // Added char_separator.
27 // 02 Feb 2002 Jeremy Siek
28 // Removed tabs and a little cleanup.
31 #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
32 #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
38 #include <algorithm> // for find_if
39 #include <boost/config.hpp>
40 #include <boost/assert.hpp>
41 #include <boost/detail/workaround.hpp>
42 #include <boost/mpl/if.hpp>
43 #if !defined(BOOST_NO_CWCTYPE)
48 // the following must not be macros if we are to prefix them
49 // with std:: (they shouldn't be macros anyway...)
64 // fix namespace problems:
66 #ifdef BOOST_NO_STDC_NAMESPACE
70 #if !defined(BOOST_NO_CWCTYPE)
78 //===========================================================================
79 // The escaped_list_separator class, which is a model of TokenizerFunction.
80 // An escaped list is a super-set of what is commonly known as a comma
81 // separated value (csv) list. It is separated into fields by a comma or
82 // other character. If the delimiting character is inside quotes, then it is
83 // counted as a regular character. To allow for embedded quotes in a field,
84 // there can be escape sequences using the \ much like C.
85 // The role of the comma, the quotation mark, and the escape
86 // character (backslash \), can be assigned to other characters.
// Exception type thrown by escaped_list_separator when it meets a malformed
// escape sequence ("cannot end with escape" / "unknown escape sequence").
struct escaped_list_error : public std::runtime_error
{
  // what_arg: message handed unchanged to std::runtime_error; it is what
  // what() reports afterwards.
  escaped_list_error(const std::string& what_arg)
    : std::runtime_error(what_arg) { }
};
93 // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
94 // MSVC does not like the following typename
96 class Traits
= BOOST_DEDUCED_TYPENAME
std::basic_string
<Char
>::traits_type
>
97 class escaped_list_separator
{
100 typedef std::basic_string
<Char
,Traits
> string_type
;
103 char_eq(Char e
):e_(e
) { }
104 bool operator()(Char c
) {
105 return Traits::eq(e_
,c
);
113 bool is_escape(Char e
) {
115 return std::find_if(escape_
.begin(),escape_
.end(),f
)!=escape_
.end();
119 return std::find_if(c_
.begin(),c_
.end(),f
)!=c_
.end();
121 bool is_quote(Char e
) {
123 return std::find_if(quote_
.begin(),quote_
.end(),f
)!=quote_
.end();
125 template <typename iterator
, typename Token
>
126 void do_escape(iterator
& next
,iterator end
,Token
& tok
) {
128 throw escaped_list_error(std::string("cannot end with escape"));
129 if (Traits::eq(*next
,'n')) {
133 else if (is_quote(*next
)) {
137 else if (is_c(*next
)) {
141 else if (is_escape(*next
)) {
146 throw escaped_list_error(std::string("unknown escape sequence"));
151 explicit escaped_list_separator(Char e
= '\\',
152 Char c
= ',',Char q
= '\"')
153 : escape_(1,e
), c_(1,c
), quote_(1,q
), last_(false) { }
155 escaped_list_separator(string_type e
, string_type c
, string_type q
)
156 : escape_(e
), c_(c
), quote_(q
), last_(false) { }
158 void reset() {last_
=false;}
160 template <typename InputIterator
, typename Token
>
161 bool operator()(InputIterator
& next
,InputIterator end
,Token
& tok
) {
162 bool bInQuote
= false;
174 for (;next
!= end
;++next
) {
175 if (is_escape(*next
)) {
176 do_escape(next
,end
,tok
);
178 else if (is_c(*next
)) {
180 // If we are not in quote, then we are done
182 // The last character was a c, that means there is
183 // 1 more blank field
189 else if (is_quote(*next
)) {
200 //===========================================================================
201 // The classes here are used by offset_separator and char_separator to implement
202 // faster assigning of tokens using assign instead of +=
204 namespace tokenizer_detail
{
205 //===========================================================================
206 // Tokenizer was broken for wide character separators, at least on Windows, since
207 // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts
208 // if higher values are passed in. The traits extension class should take care of this.
209 // Assuming that the conditional will always get optimized out in the function
210 // implementations, argument types are not a problem since both forms of character classifiers
#if !defined(BOOST_NO_CWCTYPE)
// Character classifiers for character types whose size is not 1 (N is the
// sizeof(char_type) supplied by traits_extension). These forward to the
// wide-character classifiers from <cwctype>.
template<typename traits, int N>
struct traits_extension_details : public traits
{
  typedef typename traits::char_type char_type;

  // Returns true when std::iswspace classifies c as whitespace.
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }

  // Returns true when std::iswpunct classifies c as punctuation.
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// Specialization for one-byte character types: uses the narrow <cctype>
// classifiers.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits
{
  typedef typename traits::char_type char_type;

  // Returns true when std::isspace classifies c as whitespace.
  static bool isspace(char_type c)
  {
    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF); a plain char with the high bit set would be
    // negative, so convert through unsigned char first.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }

  // Returns true when std::ispunct classifies c as punctuation.
  static bool ispunct(char_type c)
  {
    // Same unsigned char conversion as in isspace() above.
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
#endif
242 // In case there is no cwctype header, we implement the checks manually.
243 // We make use of the fact that the tested categories should fit in ASCII.
244 template<typename traits
>
245 struct traits_extension
: public traits
{
246 typedef typename
traits::char_type char_type
;
247 static bool isspace(char_type c
)
249 #if !defined(BOOST_NO_CWCTYPE)
250 return traits_extension_details
<traits
, sizeof(char_type
)>::isspace(c
);
252 return static_cast< unsigned >(c
) <= 255 && std::isspace(c
) != 0;
256 static bool ispunct(char_type c
)
258 #if !defined(BOOST_NO_CWCTYPE)
259 return traits_extension_details
<traits
, sizeof(char_type
)>::ispunct(c
);
261 return static_cast< unsigned >(c
) <= 255 && std::ispunct(c
) != 0;
266 // The assign_or_plus_equal struct contains functions that implement
267 // assign, +=, and clearing based on the iterator type. The
268 // generic case does nothing for plus_equal and clearing, while
269 // passing through the call for assign.
271 // When an input iterator is being used, the situation is reversed.
272 // The assign method does nothing, plus_equal invokes operator +=,
273 // and the clearing method sets the supplied token to the default
274 // token constructor's result.
277 template<class IteratorTag
>
278 struct assign_or_plus_equal
{
279 template<class Iterator
, class Token
>
280 static void assign(Iterator b
, Iterator e
, Token
&t
) {
282 #if BOOST_WORKAROUND(BOOST_MSVC, < 1300) &&\
283 BOOST_WORKAROUND(__SGI_STL_PORT, < 0x500) &&\
284 defined(_STLP_DEBUG) &&\
285 (defined(_STLP_USE_DYNAMIC_LIB) || defined(_DLL))
286 // Problem with string::assign for msvc-stlport in debug mode: the
287 // linker tries to import the templatized version of this memfun,
288 // which is obviously not exported.
289 // See http://www.stlport.com/dcforum/DCForumID6/1763.html for details.
292 while(b
!= e
) t
+= *b
++;
// Generic (non input-iterator) case: appending is a no-op because the token
// is produced by assign() instead. Both arguments are ignored.
template<class Token, class Value>
static void plus_equal(Token&, const Value&)
{
}
302 // If we are doing an assign, there is no need for the
// Generic (non input-iterator) case: clearing is a no-op; the token passed
// in is left untouched.
template<class Token>
static void clear(Token&)
{
}
310 struct assign_or_plus_equal
<std::input_iterator_tag
> {
// Input-iterator case: assign is a no-op (here the token is built through
// plus_equal() instead). The parameters are left unnamed because none of
// them is used.
template<class Iterator, class Token>
static void assign(Iterator, Iterator, Token&)
{
}
313 template<class Token
, class Value
>
314 static void plus_equal(Token
&t
, const Value
&v
) {
317 template<class Token
>
318 static void clear(Token
&t
) {
// Category metafunction chosen by get_iterator_category for pointer types:
// ::type is always std::random_access_iterator_tag, whatever Iterator is.
template<class Iterator>
struct pointer_iterator_category
{
  typedef std::random_access_iterator_tag type;
};
// Category metafunction chosen by get_iterator_category for non-pointer
// iterators: ::type is the iterator's own nested iterator_category.
template<class Iterator>
struct class_iterator_category
{
  typedef typename Iterator::iterator_category type;
};
337 // This portably gets the iterator_tag without partial template specialization
338 template<class Iterator
>
339 struct get_iterator_category
{
340 typedef typename
mpl::if_
<is_pointer
<Iterator
>,
341 pointer_iterator_category
<Iterator
>,
342 class_iterator_category
<Iterator
>
345 typedef typename
cat::type iterator_category
;
349 } // namespace tokenizer_detail
352 //===========================================================================
353 // The offset_separator class, which is a model of TokenizerFunction.
354 // Offset breaks a string into tokens based on a range of offsets
356 class offset_separator
{
359 std::vector
<int> offsets_
;
360 unsigned int current_offset_
;
362 bool return_partial_last_
;
365 template <typename Iter
>
366 offset_separator(Iter begin
, Iter end
, bool wrap_offsets
= true,
367 bool return_partial_last
= true)
368 : offsets_(begin
,end
), current_offset_(0),
369 wrap_offsets_(wrap_offsets
),
370 return_partial_last_(return_partial_last
) { }
373 : offsets_(1,1), current_offset_(),
374 wrap_offsets_(true), return_partial_last_(true) { }
380 template <typename InputIterator
, typename Token
>
381 bool operator()(InputIterator
& next
, InputIterator end
, Token
& tok
)
383 typedef tokenizer_detail::assign_or_plus_equal
<
384 BOOST_DEDUCED_TYPENAME
tokenizer_detail::get_iterator_category
<
389 BOOST_ASSERT(!offsets_
.empty());
391 assigner::clear(tok
);
392 InputIterator
start(next
);
397 if (current_offset_
== offsets_
.size())
405 int c
= offsets_
[current_offset_
];
408 if (next
== end
)break;
409 assigner::plus_equal(tok
,*next
++);
411 assigner::assign(start
,next
,tok
);
413 if (!return_partial_last_
)
423 //===========================================================================
424 // The char_separator class breaks a sequence of characters into
425 // tokens based on the character delimiters (very much like bad old
426 // strtok). A delimiter character can either be kept or dropped. A
427 // kept delimiter shows up as an output token, whereas a dropped
428 // delimiter does not.
430 // This class replaces the char_delimiters_separator class. The
431 // constructor for the char_delimiters_separator class was too
432 // confusing and needed to be deprecated. However, because of the
433 // default arguments to the constructor, adding the new constructor
434 // would cause ambiguity, so instead I deprecated the whole class.
435 // The implementation of the class was also simplified considerably.
// Policy selected when constructing a char_separator: empty tokens are
// either dropped (drop_empty_tokens) or kept (keep_empty_tokens).
enum empty_token_policy
{
  drop_empty_tokens,
  keep_empty_tokens
};
439 // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
440 template <typename Char
,
441 typename Tr
= BOOST_DEDUCED_TYPENAME
std::basic_string
<Char
>::traits_type
>
444 typedef tokenizer_detail::traits_extension
<Tr
> Traits
;
445 typedef std::basic_string
<Char
,Tr
> string_type
;
448 char_separator(const Char
* dropped_delims
,
449 const Char
* kept_delims
= 0,
450 empty_token_policy empty_tokens
= drop_empty_tokens
)
451 : m_dropped_delims(dropped_delims
),
452 m_use_ispunct(false),
453 m_use_isspace(false),
454 m_empty_tokens(empty_tokens
),
457 // Borland workaround
459 m_kept_delims
= kept_delims
;
462 // use ispunct() for kept delimiters and isspace for dropped.
465 : m_use_ispunct(true),
467 m_empty_tokens(drop_empty_tokens
) { }
471 template <typename InputIterator
, typename Token
>
472 bool operator()(InputIterator
& next
, InputIterator end
, Token
& tok
)
474 typedef tokenizer_detail::assign_or_plus_equal
<
475 BOOST_DEDUCED_TYPENAME
tokenizer_detail::get_iterator_category
<
480 assigner::clear(tok
);
482 // skip past all dropped_delims
483 if (m_empty_tokens
== drop_empty_tokens
)
484 for (; next
!= end
&& is_dropped(*next
); ++next
)
487 InputIterator
start(next
);
489 if (m_empty_tokens
== drop_empty_tokens
) {
495 // if we are on a kept_delims move past it and stop
496 if (is_kept(*next
)) {
497 assigner::plus_equal(tok
,*next
);
500 // append all the non delim characters
501 for (; next
!= end
&& !is_dropped(*next
) && !is_kept(*next
); ++next
)
502 assigner::plus_equal(tok
,*next
);
504 else { // m_empty_tokens == keep_empty_tokens
506 // Handle empty token at the end
509 if (m_output_done
== false)
511 m_output_done
= true;
512 assigner::assign(start
,next
,tok
);
519 if (is_kept(*next
)) {
520 if (m_output_done
== false)
521 m_output_done
= true;
523 assigner::plus_equal(tok
,*next
);
525 m_output_done
= false;
528 else if (m_output_done
== false && is_dropped(*next
)) {
529 m_output_done
= true;
532 if (is_dropped(*next
))
534 for (; next
!= end
&& !is_dropped(*next
) && !is_kept(*next
); ++next
)
535 assigner::plus_equal(tok
,*next
);
536 m_output_done
= true;
539 assigner::assign(start
,next
,tok
);
544 string_type m_kept_delims
;
545 string_type m_dropped_delims
;
548 empty_token_policy m_empty_tokens
;
551 bool is_kept(Char E
) const
553 if (m_kept_delims
.length())
554 return m_kept_delims
.find(E
) != string_type::npos
;
555 else if (m_use_ispunct
) {
556 return Traits::ispunct(E
) != 0;
560 bool is_dropped(Char E
) const
562 if (m_dropped_delims
.length())
563 return m_dropped_delims
.find(E
) != string_type::npos
;
564 else if (m_use_isspace
) {
565 return Traits::isspace(E
) != 0;
571 //===========================================================================
572 // The following class is DEPRECATED; use class char_separator instead.
574 // The char_delimiters_separator class, which is a model of
575 // TokenizerFunction. char_delimiters_separator breaks a string
576 // into tokens based on character delimiters. There are 2 types of
577 // delimiters. returnable delimiters can be returned as
578 // tokens. These are often punctuation. nonreturnable delimiters
579 // cannot be returned as tokens. These are often whitespace
581 // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
582 template <class Char
,
583 class Tr
= BOOST_DEDUCED_TYPENAME
std::basic_string
<Char
>::traits_type
>
584 class char_delimiters_separator
{
587 typedef tokenizer_detail::traits_extension
<Tr
> Traits
;
588 typedef std::basic_string
<Char
,Tr
> string_type
;
589 string_type returnable_
;
590 string_type nonreturnable_
;
595 bool is_ret(Char E
)const
597 if (returnable_
.length())
598 return returnable_
.find(E
) != string_type::npos
;
600 if (no_ispunct_
) {return false;}
602 int r
= Traits::ispunct(E
);
607 bool is_nonret(Char E
)const
609 if (nonreturnable_
.length())
610 return nonreturnable_
.find(E
) != string_type::npos
;
612 if (no_isspace_
) {return false;}
614 int r
= Traits::isspace(E
);
621 explicit char_delimiters_separator(bool return_delims
= false,
622 const Char
* returnable
= 0,
623 const Char
* nonreturnable
= 0)
624 : returnable_(returnable
? returnable
: string_type().c_str()),
625 nonreturnable_(nonreturnable
? nonreturnable
:string_type().c_str()),
626 return_delims_(return_delims
), no_ispunct_(returnable
!=0),
627 no_isspace_(nonreturnable
!=0) { }
633 template <typename InputIterator
, typename Token
>
634 bool operator()(InputIterator
& next
, InputIterator end
,Token
& tok
) {
637 // skip past all nonreturnable delims
638 // skip past the returnable only if we are not returning delims
639 for (;next
!=end
&& ( is_nonret(*next
) || (is_ret(*next
)
640 && !return_delims_
) );++next
) { }
646 // if we are to return delims and we are on a returnable one
647 // move past it and stop
648 if (is_ret(*next
) && return_delims_
) {
653 // append all the non delim characters
654 for (;next
!=end
&& !is_nonret(*next
) && !is_ret(*next
);++next
)