Added static_assert from Loki
[ustl.git] / utf8.h
blob26c380e08178be38b42a76ca2db17b902695e4ba
1 // This file is part of the ustl library, an STL implementation.
2 //
3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4 // This file is free software, distributed under the MIT License.
5 //
6 // utf8.h
7 //
8 // This file contains stream iterators that read and write UTF-8 encoded
9 // characters. The encoding is defined as follows:
11 // U-00000000 - U-0000007F: 0xxxxxxx
12 // U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
13 // U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
14 // U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
15 // U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
16 // U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
17 // U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
19 // The last range is not in the UTF-8 standard because Unicode forbids
20 // characters of those values. However, since ustl::string uses this code
21 // to write its length, the support is here. The reason it was put here
22 // in the first place, is that extra code would have been necessary to
23 // flag that range as invalid.
25 #ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
26 #define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
28 #include "uiterator.h"
30 namespace ustl {
32 //----------------------------------------------------------------------
34 typedef uint8_t utf8subchar_t; ///< Type for the encoding subcharacters.
36 //----------------------------------------------------------------------
/// Computes how many bytes the UTF-8 encoding of \p v occupies (1 through 7).
inline size_t Utf8Bytes (wchar_t v)
{
    // Largest value that fits into an encoding of 1, 2, ... 7 bytes.
    static const uint32_t c_Bounds[] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, };
    const uint32_t code = uint32_t(v);
    size_t nBytes = 1;
    // The final bound is the largest uint32_t value, so the scan always
    // stops inside the table.
    for (size_t slot = 0; c_Bounds[slot] < code; ++slot)
        ++nBytes;
    return (nBytes);
}
47 /// Measures the size of a wchar_t array in UTF-8 encoding.
48 inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
50 size_t bc = 0;
51 for (; first < last; ++first)
52 bc += Utf8Bytes(*first);
53 return (bc);
/// Determines the length in bytes of the UTF-8 sequence whose first byte is \p c.
inline size_t Utf8SequenceBytes (wchar_t c) // a wchar_t to keep c in a full register
{
    // The header of a lead byte is a run of 1 bits closed by a 0 bit, and
    // the length of that run is the length of the sequence:
    //   run of 0  - single byte character; reported as 1.
    //   run of 1  - a continuation byte, i.e. the middle of a character.
    //               Also reported as 1, so a misaligned reader moves on
    //               one byte at a time until it reaches the next character.
    //   run of 2+ - lead byte of a multibyte character of that many bytes.
    // Malformed input is not reported, since the user can not correct it.
    size_t ones = 0;
    wchar_t probe = 0x80;
    while (c & probe) {
        probe >>= 1;
        ++ones;
    }
    if (!ones)
        return (1); // A sequence is always at least 1 byte.
    return (ones);
}
73 //----------------------------------------------------------------------
75 /// \class utf8in_iterator utf8.h ustl.h
76 /// \ingroup IteratorAdaptors
77 ///
78 /// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
79 ///
80 /// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
81 /// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
82 /// There is no error handling; if the reading frame slips you'll get extra
83 /// characters, one for every misaligned byte. Although it is possible to skip
84 /// to the start of the next character, that would result in omitting the
85 /// misformatted character and the one after it, making it very difficult to
86 /// detect by the user. It is better to write some strange characters and let
87 /// the user know his file is corrupted. Another problem is overflow on bad
88 /// encodings (like a 0xFF on the end of a string). This is checked through
89 /// the end-of-string nul character, which will always be there as long as
90 /// you are using the string class.
91 ///
92 template <typename Iterator, typename WChar = wchar_t>
93 class utf8in_iterator {
94 public:
95 typedef typename iterator_traits<Iterator>::value_type value_type;
96 typedef typename iterator_traits<Iterator>::difference_type difference_type;
97 typedef typename iterator_traits<Iterator>::pointer pointer;
98 typedef typename iterator_traits<Iterator>::reference reference;
99 public:
100 explicit utf8in_iterator (const Iterator& is) : m_i (is), m_v (0) { Read(); }
101 utf8in_iterator (const utf8in_iterator& i) : m_i (i.m_i), m_v (i.m_v) {}
102 inline const utf8in_iterator& operator= (const utf8in_iterator& i) { m_i = i.m_i; m_v = i.m_v; return (*this); }
103 inline Iterator base (void) const { return (m_i - (Utf8Bytes(m_v) - 1)); }
104 /// Reads and returns the next value.
105 inline WChar operator* (void) const { return (m_v); }
106 inline utf8in_iterator& operator++ (void) { ++m_i; Read(); return (*this); }
107 inline utf8in_iterator operator++ (int) { utf8in_iterator old (*this); operator++(); return (old); }
108 inline utf8in_iterator& operator+= (uoff_t n) { while (n--) operator++(); return (*this); }
109 inline utf8in_iterator operator+ (uoff_t n) { utf8in_iterator v (*this); return (v += n); }
110 inline bool operator== (const utf8in_iterator& i) const { return (m_i == i.m_i); }
111 inline bool operator< (const utf8in_iterator& i) const { return (m_i < i.m_i); }
112 difference_type operator- (const utf8in_iterator& i) const;
113 private:
114 void Read (void);
115 private:
116 Iterator m_i;
117 WChar m_v;
120 /// Steps to the next character and updates current returnable value.
121 template <typename Iterator, typename WChar>
122 void utf8in_iterator<Iterator,WChar>::Read (void)
124 const utf8subchar_t c = *m_i;
125 size_t nBytes = Utf8SequenceBytes (c);
126 m_v = c & (0xFF >> nBytes); // First byte contains bits after the header.
127 while (--nBytes && *++m_i) // Each subsequent byte has 6 bits.
128 m_v = (m_v << 6) | (*m_i & 0x3F);
131 /// Returns the distance in characters (as opposed to the distance in bytes).
132 template <typename Iterator, typename WChar>
133 typename utf8in_iterator<Iterator,WChar>::difference_type
134 utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
136 difference_type dist = 0;
137 for (Iterator first (last.m_i); first < m_i; ++dist)
138 first = advance (first, Utf8SequenceBytes (*first));
139 return (dist);
142 //----------------------------------------------------------------------
144 /// \class utf8out_iterator utf8.h ustl.h
145 /// \ingroup IteratorAdaptors
147 /// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
149 template <typename Iterator, typename WChar = wchar_t>
150 class utf8out_iterator {
151 public:
152 typedef typename iterator_traits<Iterator>::value_type value_type;
153 typedef typename iterator_traits<Iterator>::difference_type difference_type;
154 typedef typename iterator_traits<Iterator>::pointer pointer;
155 typedef typename iterator_traits<Iterator>::reference reference;
156 public:
157 explicit utf8out_iterator (const Iterator& os) : m_i (os) {}
158 utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}
159 inline const Iterator& base (void) const { return (m_i); }
160 /// Writes \p v into the stream.
161 utf8out_iterator& operator= (WChar v);
162 inline utf8out_iterator& operator* (void) { return (*this); }
163 inline utf8out_iterator& operator++ (void) { return (*this); }
164 inline utf8out_iterator operator++ (int) { return (*this); }
165 inline bool operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }
166 inline bool operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }
167 private:
168 Iterator m_i;
171 /// Writes \p v into the stream.
172 template <typename Iterator, typename WChar>
173 utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
175 const size_t nBytes = Utf8Bytes (v);
176 if (nBytes > 1) {
177 // Write the bits 6 bits at a time, except for the first one,
178 // which may be less than 6 bits.
179 register wchar_t shift = nBytes * 6;
180 *m_i++ = ((v >> (shift -= 6)) & 0x3F) | (0xFF << (8 - nBytes));
181 while (shift)
182 *m_i++ = ((v >> (shift -= 6)) & 0x3F) | 0x80;
183 } else // If only one byte, there is no header.
184 *m_i++ = v;
185 return (*this);
188 //----------------------------------------------------------------------
190 /// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.
191 template <typename Iterator>
192 inline utf8out_iterator<Iterator> utf8out (Iterator i)
194 return (utf8out_iterator<Iterator> (i));
197 /// Returns a UTF-8 adaptor reading from \p i.
198 template <typename Iterator>
199 inline utf8in_iterator<Iterator> utf8in (Iterator i)
201 return (utf8in_iterator<Iterator> (i));
204 //----------------------------------------------------------------------
206 } // namespace ustl
208 #endif