1 // This file is part of the ustl library, an STL implementation.
3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4 // This file is free software, distributed under the MIT License.
8 // This file contains stream iterators that read and write UTF-8 encoded
9 // characters. The encoding is defined as follows:
11 // U-00000000 - U-0000007F: 0xxxxxxx
12 // U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
13 // U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
14 // U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
15 // U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
16 // U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
17 // U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// The last range is not in the UTF-8 standard because Unicode forbids
// characters of those values. However, since ustl::string uses this code
// to write its length, the support is here. The reason it was put here
// in the first place is that extra code would have been necessary to
// flag that range as invalid.
25 #ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
26 #define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
28 #include "uiterator.h"
32 //----------------------------------------------------------------------
/// Type for the encoding subcharacters: a single byte of a UTF-8 sequence.
typedef uint8_t utf8subchar_t;
36 //----------------------------------------------------------------------
/// Returns the number of bytes required to UTF-8 encode \p v.
///
/// \p v is compared as an unsigned 32-bit value against the largest value
/// representable by each sequence length, so the result is between 1 and 7.
inline size_t Utf8Bytes (wchar_t v)
{
    // c_Bounds[i] is the largest value that fits in a sequence of i + 1 bytes.
    // The final entry is the maximum uint32_t, so the scan below always stops.
    static const uint32_t c_Bounds[] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, };
    size_t bi = 0;
    // Post-increment makes bi the 1-based index of the first bound >= v,
    // which is exactly the sequence length.
    while (c_Bounds[bi++] < uint32_t(v)) {}
    return (bi);
}

/// Measures the size of a wchar_t array in UTF-8 encoding.
///
/// Sums Utf8Bytes() over every character in the range [\p first, \p last).
/// Returns 0 for an empty range.
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
{
    size_t bc = 0;
    for (; first < last; ++first)
        bc += Utf8Bytes(*first);
    return (bc);
}
/// Returns the number of bytes in a UTF-8 sequence that starts with \p c.
///
/// Only the low 8 bits of \p c are examined. The result is the number of
/// leading 1 bits in that byte, with a minimum of 1.
inline size_t Utf8SequenceBytes (wchar_t c)	// a wchar_t to keep c in a full register
{
    // Count the leading bits. Header bits are 1 * nBytes followed by a 0.
    //	0 - single byte character. Take 7 bits (0xFF >> 1)
    //	1 - error, in the middle of the character. Take 6 bits (0xFF >> 2)
    //	    so you will keep reading invalid entries until you hit the next character.
    //	>=2 - multibyte character. Take remaining bits, and get the next bytes.
    // All errors are ignored, since the user can not correct them.
    //
    wchar_t mask = 0x80;
    size_t nBytes = 0;
    // The loop ends at the first 0 bit, or when mask has been shifted to 0
    // (all eight bits set, giving 8).
    for (; c & mask; ++nBytes)
        mask >>= 1;
    return (nBytes ? nBytes : 1); // A sequence is always at least 1 byte.
}
73 //----------------------------------------------------------------------
75 /// \class utf8in_iterator utf8.h ustl.h
76 /// \ingroup IteratorAdaptors
78 /// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
80 /// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
81 /// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
82 /// There is no error handling; if the reading frame slips you'll get extra
83 /// characters, one for every misaligned byte. Although it is possible to skip
84 /// to the start of the next character, that would result in omitting the
85 /// misformatted character and the one after it, making it very difficult to
86 /// detect by the user. It is better to write some strange characters and let
87 /// the user know his file is corrupted. Another problem is overflow on bad
88 /// encodings (like a 0xFF on the end of a string). This is checked through
89 /// the end-of-string nul character, which will always be there as long as
90 /// you are using the string class.
92 template <typename Iterator
, typename WChar
= wchar_t>
93 class utf8in_iterator
{
95 typedef typename iterator_traits
<Iterator
>::value_type value_type
;
96 typedef typename iterator_traits
<Iterator
>::difference_type difference_type
;
97 typedef typename iterator_traits
<Iterator
>::pointer pointer
;
98 typedef typename iterator_traits
<Iterator
>::reference reference
;
100 explicit utf8in_iterator (const Iterator
& is
) : m_i (is
), m_v (0) { Read(); }
101 utf8in_iterator (const utf8in_iterator
& i
) : m_i (i
.m_i
), m_v (i
.m_v
) {}
102 inline const utf8in_iterator
& operator= (const utf8in_iterator
& i
) { m_i
= i
.m_i
; m_v
= i
.m_v
; return (*this); }
103 inline Iterator
base (void) const { return (m_i
- (Utf8Bytes(m_v
) - 1)); }
104 /// Reads and returns the next value.
105 inline WChar
operator* (void) const { return (m_v
); }
106 inline utf8in_iterator
& operator++ (void) { ++m_i
; Read(); return (*this); }
107 inline utf8in_iterator
operator++ (int) { utf8in_iterator
old (*this); operator++(); return (old
); }
108 inline utf8in_iterator
& operator+= (uoff_t n
) { while (n
--) operator++(); return (*this); }
109 inline utf8in_iterator
operator+ (uoff_t n
) { utf8in_iterator
v (*this); return (v
+= n
); }
110 inline bool operator== (const utf8in_iterator
& i
) const { return (m_i
== i
.m_i
); }
111 inline bool operator< (const utf8in_iterator
& i
) const { return (m_i
< i
.m_i
); }
112 difference_type
operator- (const utf8in_iterator
& i
) const;
120 /// Steps to the next character and updates current returnable value.
121 template <typename Iterator
, typename WChar
>
122 void utf8in_iterator
<Iterator
,WChar
>::Read (void)
124 const utf8subchar_t c
= *m_i
;
125 size_t nBytes
= Utf8SequenceBytes (c
);
126 m_v
= c
& (0xFF >> nBytes
); // First byte contains bits after the header.
127 while (--nBytes
&& *++m_i
) // Each subsequent byte has 6 bits.
128 m_v
= (m_v
<< 6) | (*m_i
& 0x3F);
131 /// Returns the distance in characters (as opposed to the distance in bytes).
132 template <typename Iterator
, typename WChar
>
133 typename utf8in_iterator
<Iterator
,WChar
>::difference_type
134 utf8in_iterator
<Iterator
,WChar
>::operator- (const utf8in_iterator
<Iterator
,WChar
>& last
) const
136 difference_type dist
= 0;
137 for (Iterator
first (last
.m_i
); first
< m_i
; ++dist
)
138 first
= advance (first
, Utf8SequenceBytes (*first
));
142 //----------------------------------------------------------------------
144 /// \class utf8out_iterator utf8.h ustl.h
145 /// \ingroup IteratorAdaptors
147 /// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
149 template <typename Iterator
, typename WChar
= wchar_t>
150 class utf8out_iterator
{
152 typedef typename iterator_traits
<Iterator
>::value_type value_type
;
153 typedef typename iterator_traits
<Iterator
>::difference_type difference_type
;
154 typedef typename iterator_traits
<Iterator
>::pointer pointer
;
155 typedef typename iterator_traits
<Iterator
>::reference reference
;
157 explicit utf8out_iterator (const Iterator
& os
) : m_i (os
) {}
158 utf8out_iterator (const utf8out_iterator
& i
) : m_i (i
.m_i
) {}
159 inline const Iterator
& base (void) const { return (m_i
); }
160 /// Writes \p v into the stream.
161 utf8out_iterator
& operator= (WChar v
);
162 inline utf8out_iterator
& operator* (void) { return (*this); }
163 inline utf8out_iterator
& operator++ (void) { return (*this); }
164 inline utf8out_iterator
operator++ (int) { return (*this); }
165 inline bool operator== (const utf8out_iterator
& i
) const { return (m_i
== i
.m_i
); }
166 inline bool operator< (const utf8out_iterator
& i
) const { return (m_i
< i
.m_i
); }
171 /// Writes \p v into the stream.
172 template <typename Iterator
, typename WChar
>
173 utf8out_iterator
<Iterator
,WChar
>& utf8out_iterator
<Iterator
,WChar
>::operator= (WChar v
)
175 const size_t nBytes
= Utf8Bytes (v
);
177 // Write the bits 6 bits at a time, except for the first one,
178 // which may be less than 6 bits.
179 register wchar_t shift
= nBytes
* 6;
180 *m_i
++ = ((v
>> (shift
-= 6)) & 0x3F) | (0xFF << (8 - nBytes
));
182 *m_i
++ = ((v
>> (shift
-= 6)) & 0x3F) | 0x80;
183 } else // If only one byte, there is no header.
188 //----------------------------------------------------------------------
190 /// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.
191 template <typename Iterator
>
192 inline utf8out_iterator
<Iterator
> utf8out (Iterator i
)
194 return (utf8out_iterator
<Iterator
> (i
));
197 /// Returns a UTF-8 adaptor reading from \p i.
198 template <typename Iterator
>
199 inline utf8in_iterator
<Iterator
> utf8in (Iterator i
)
201 return (utf8in_iterator
<Iterator
> (i
));
204 //----------------------------------------------------------------------