1 #ifndef BOOST_UTF8_CODECVT_FACET_HPP
2 #define BOOST_UTF8_CODECVT_FACET_HPP
4 #include <boost/iostreams/detail/config/wide_streams.hpp>
5 #ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS
6 # error wide streams not supported on this platform
9 // MS compatible compilers support #pragma once
10 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
14 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
15 // utf8_codecvt_facet.hpp
17 // Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
18 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
19 // Distributed under the Boost Software License, Version 1.0. (See accompany-
20 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
22 // Note:(Robert Ramey). I have made the following alterations in the original
24 // a) Rendered utf8_codecvt<wchar_t, char> with using templates
25 // b) Move longer functions outside class definition to prevent inlining
26 // and make code smaller
27 // c) added on a derived class to permit translation to/from current
30 // See http://www.boost.org for updates, documentation, and revision history.
32 // archives stored as text - note these are templated on the basic
33 // stream templates to accommodate wide (and other?) kind of characters
35 // note the fact that on libraries without wide characters, ostream
36 // is not a specialization of basic_ostream which in fact is not defined
37 // in such cases. So we can't use basic_ostream<OStream::char_type> but rather
38 // use two template parameters
41 // This is an implementation of a std::codecvt facet for translating
42 // from UTF-8 externally to UCS-4. Note that this is not tied to
43 // any specific types in order to allow customization on platforms
44 // where wchar_t is not big enough.
46 // NOTES: The current implementation jumps through some unpleasant hoops in
47 // order to deal with signed character types. As a result,
48 // it is necessary for the ExternType to be convertible to unsigned char.
49 // I chose not to tie the extern_type explicitly to char. But if any combination
50 // of types other than <wchar_t, char> is used, then std::codecvt must be
51 // specialized on those types for this to work.
54 #include <cstddef> // size_t
55 #include <boost/integer_traits.hpp>
56 #include <boost/iostreams/detail/config/wide_streams.hpp>
57 #include <boost/iostreams/detail/codecvt_helper.hpp>
59 // maximum length of a multibyte string
60 #define MB_LENGTH_MAX 8
62 struct utf8_codecvt_facet_wchar_t
63 : public boost::iostreams::detail::codecvt_helper
<wchar_t, char, std::mbstate_t>
66 explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage
= 0)
67 : boost::iostreams::detail::codecvt_helper
<wchar_t, char, std::mbstate_t>
71 virtual std::codecvt_base::result
do_in(
72 std::mbstate_t& state
,
74 const char * from_end
,
75 const char * & from_next
,
81 virtual std::codecvt_base::result
do_out(
82 std::mbstate_t & state
, const wchar_t * from
,
83 const wchar_t * from_end
, const wchar_t* & from_next
,
84 char * to
, char * to_end
, char * & to_next
// Reports whether octet_1 falls outside the closed range 0x80..0xbf:
// returns true when octet_1 < 0x80 or octet_1 > 0xbf, false otherwise.
// 0x80..0xbf is the bit pattern 10xxxxxx, i.e. the values a UTF-8
// continuation octet may take, so "true" means "not a valid continuation".
// The parameter is unsigned char, so the comparisons are never affected by
// sign extension of a plain char.
87 bool invalid_continuing_octet(unsigned char octet_1
) const {
88 return (octet_1
< 0x80|| 0xbf< octet_1
);
91 bool invalid_leading_octet(unsigned char octet_1
) const {
92 return (0x7f < octet_1
&& octet_1
< 0xc0) ||
96 // continuing octets = octets except for the leading octet
// Returns get_octet_count(lead_octet) minus one, i.e. the number of octets
// that follow the leading octet in the sequence it introduces.
// Static: needs no facet instance.
// NOTE(review): the arithmetic is done in unsigned int, so if
// get_octet_count() could ever return 0 the result would wrap to a huge
// value rather than go negative -- confirm against its definition.
97 static unsigned int get_cont_octet_count(unsigned char lead_octet
) {
98 return get_octet_count(lead_octet
) - 1;
// Declaration only; the definition is not part of this header section.
// Static: callable without a facet instance.
// get_cont_octet_count() subtracts one from this value to obtain the number
// of continuing octets, so this presumably yields the total octet count of
// the sequence started by lead_octet -- TODO confirm against the definition,
// including what is returned for an invalid leading octet.
101 static unsigned int get_octet_count(unsigned char lead_octet
);
103 // How many "continuing octets" will be needed for this word
104 // == total octets - 1.
// Declaration only; the definition is not part of this header section.
// Output-side counterpart of get_cont_octet_count(): it is keyed on the wide
// character to be written rather than on a leading octet already read.
// Note the return type is int here, whereas get_cont_octet_count() and
// get_octet_count() return unsigned int.
105 int get_cont_octet_out_count(wchar_t word
) const ;
// Unconditionally returns false. In the std::codecvt protocol a true result
// would promise that do_in/do_out never need to convert anything, so false
// tells callers that conversion must actually be performed.
// Presumably overrides the std::codecvt virtual of the same name via the
// codecvt_helper base -- confirm against codecvt_helper.hpp.
// The empty exception specification means this function must not throw.
107 virtual bool do_always_noconv() const throw() { return false; }
109 // UTF-8 isn't really stateful since we rewind on partial conversions
110 virtual std::codecvt_base::result
do_unshift(
// Always returns 0. The value is routed through a named local constant
// purely for readability: in the std::codecvt protocol, 0 from do_encoding()
// denotes an external encoding that uses a variable number of bytes per
// internal character.
// The empty exception specification means this function must not throw.
120 virtual int do_encoding() const throw() {
121 const int variable_byte_external_encoding
=0;
122 return variable_byte_external_encoding
;
125 // How many char objects can I process to get <= max_limit
127 virtual int do_length(
128 BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
std::mbstate_t &,
130 const char * from_end
,
131 std::size_t max_limit
134 // Largest possible value do_length(state,from,from_end,1) could return.
// Always returns the constant 6.
// NOTE(review): MB_LENGTH_MAX is defined as 8 near the top of this header,
// so the two limits disagree -- confirm which one callers are expected to
// use when sizing buffers.
// The empty exception specification means this function must not throw.
135 virtual int do_max_length() const throw () {
136 return 6; // largest UTF-8 encoding of a UCS-4 character
140 #if 0 // not used - incorrect in any case
141 // Robert Ramey - use the above to make a code converter from multi-byte
142 // char strings to utf8 encoding
143 struct utf8_codecvt_facet_char
: public utf8_codecvt_facet_wchar_t
145 typedef utf8_codecvt_facet_wchar_t base_class
;
147 explicit utf8_codecvt_facet_char(std::size_t no_locale_manage
=0)
148 : base_class(no_locale_manage
)
151 virtual std::codecvt_base::result
do_in(
152 std::mbstate_t & state
,
154 const char * from_end
,
155 const char * & from_next
,
161 virtual std::codecvt_base::result
do_out(
162 std::mbstate_t & state
,
164 const char * from_end
,
165 const char* & from_next
,
171 // How many char objects can I process to get <= max_limit
173 virtual int do_length(
174 const std::mbstate_t&,
176 const char * from_end
,
177 std::size_t max_limit
182 template<class Internal
, class External
>
183 struct utf8_codecvt_facet
187 struct utf8_codecvt_facet
<wchar_t, char>
188 : public utf8_codecvt_facet_wchar_t
193 struct utf8_codecvt_facet
<char, char>
194 : public utf8_codecvt_facet_char
198 #endif // BOOST_UTF8_CODECVT_FACET_HPP