utf8.h

   1 // This file is part of the ustl library, an STL implementation.
   2 //
   3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
   4 // This file is free software, distributed under the MIT License.
   5 //
   6 // utf8.h
   7 //
   8 // This file contains stream iterators that read and write UTF-8 encoded
   9 // characters. The encoding is defined as follows:
  10 //
  11 // U-00000000 - U-0000007F: 0xxxxxxx
  12 // U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
  13 // U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  14 // U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  15 // U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  16 // U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  17 // U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  18 //
  19 // The last range is not in the UTF-8 standard because Unicode forbids
  20 // characters of those values. However, since ustl::string uses this code
  21 // to write its length, the support is here. The reason it was put here
  22 // in the first place, is that extra code would have been necessary to
  23 // flag that range as invalid.
  24 //
  25 #ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
  26 #define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
  27
  28 #include "uiterator.h"
  29
  30 namespace ustl {
  31
  32 //----------------------------------------------------------------------
  33
   34 typedef uint8_t utf8subchar_t;  ///< Type of a single byte (code unit) within a UTF-8 encoded sequence.
  35
  36 //----------------------------------------------------------------------
  37
  38 /// Returns the number of bytes required to UTF-8 encode \p v.
  39 inline size_t Utf8Bytes (wchar_t v)
  40 {
  41     static const uint32_t c_Bounds[] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, };
  42     size_t bi = 0;
  43     while (c_Bounds[bi++] < uint32_t(v));
  44     return (bi);
  45 }
  46
  47 /// Measures the size of a wchar_t array in UTF-8 encoding.
  48 inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
  49 {
  50     size_t bc = 0;
  51     for (; first < last; ++first)
  52         bc += Utf8Bytes(*first);
  53     return (bc);
  54 }
  55
  56 /// Returns the number of bytes in a UTF-8 sequence that starts with \p c.
  57 inline size_t Utf8SequenceBytes (wchar_t c)     // a wchar_t to keep c in a full register
  58 {
  59     // Count the leading bits. Header bits are 1 * nBytes followed by a 0.
  60     //  0 - single byte character. Take 7 bits (0xFF >> 1)
  61     //  1 - error, in the middle of the character. Take 6 bits (0xFF >> 2)
  62     //      so you will keep reading invalid entries until you hit the next character.
  63     //  >2 - multibyte character. Take remaining bits, and get the next bytes.
  64     // All errors are ignored, since the user can not correct them.
  65     //
  66     wchar_t mask = 0x80;
  67     size_t nBytes = 0;
  68     for (; c & mask; ++nBytes)
  69         mask >>= 1;
  70     return (nBytes ? nBytes : 1); // A sequence is always at least 1 byte.
  71 }
  72
  73 //----------------------------------------------------------------------
  74
  75 /// \class utf8in_iterator utf8.h ustl.h
  76 /// \ingroup IteratorAdaptors
  77 ///
  78 /// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
  79 ///
  80 /// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
  81 /// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
  82 /// There is no error handling; if the reading frame slips you'll get extra
  83 /// characters, one for every misaligned byte. Although it is possible to skip
  84 /// to the start of the next character, that would result in omitting the
  85 /// misformatted character and the one after it, making it very difficult to
  86 /// detect by the user. It is better to write some strange characters and let
  87 /// the user know his file is corrupted. Another problem is overflow on bad
  88 /// encodings (like a 0xFF on the end of a string). This is checked through
  89 /// the end-of-string nul character, which will always be there as long as
  90 /// you are using the string class.
  91 ///
  92 template <typename Iterator, typename WChar = wchar_t>
  93 class utf8in_iterator {
  94 public:
  95     typedef typename iterator_traits<Iterator>::value_type      value_type;
  96     typedef typename iterator_traits<Iterator>::difference_type difference_type;
  97     typedef typename iterator_traits<Iterator>::pointer         pointer;
  98     typedef typename iterator_traits<Iterator>::reference       reference;
  99 public:
 100     explicit                    utf8in_iterator (const Iterator& is)            : m_i (is), m_v (0) { Read(); }
 101                                 utf8in_iterator (const utf8in_iterator& i)      : m_i (i.m_i), m_v (i.m_v) {}
 102     inline const utf8in_iterator& operator= (const utf8in_iterator& i)          { m_i = i.m_i; m_v = i.m_v; return (*this); }
 103     inline Iterator             base (void) const       { return (m_i - (Utf8Bytes(m_v) - 1)); }
 104     /// Reads and returns the next value.
 105     inline WChar                operator* (void) const  { return (m_v); }
 106     inline utf8in_iterator&     operator++ (void)       { ++m_i; Read(); return (*this); }
 107     inline utf8in_iterator      operator++ (int)        { utf8in_iterator old (*this); operator++(); return (old); }
 108     inline utf8in_iterator&     operator+= (uoff_t n)   { while (n--) operator++(); return (*this); }
 109     inline utf8in_iterator      operator+ (uoff_t n)    { utf8in_iterator v (*this); return (v += n); }
 110     inline bool                 operator== (const utf8in_iterator& i) const     { return (m_i == i.m_i); }
 111     inline bool                 operator< (const utf8in_iterator& i) const      { return (m_i < i.m_i); }
 112     difference_type             operator- (const utf8in_iterator& i) const;
 113 private:
 114     void                        Read (void);
 115 private:
 116     Iterator                    m_i;
 117     WChar                       m_v;
 118 };
 119
 120 /// Steps to the next character and updates current returnable value.
 121 template <typename Iterator, typename WChar>
 122 void utf8in_iterator<Iterator,WChar>::Read (void)
 123 {
 124     const utf8subchar_t c = *m_i;
 125     size_t nBytes = Utf8SequenceBytes (c);
 126     m_v = c & (0xFF >> nBytes); // First byte contains bits after the header.
 127     while (--nBytes && *++m_i)  // Each subsequent byte has 6 bits.
 128         m_v = (m_v << 6) | (*m_i & 0x3F);
 129 }
 130
 131 /// Returns the distance in characters (as opposed to the distance in bytes).
 132 template <typename Iterator, typename WChar>
 133 typename utf8in_iterator<Iterator,WChar>::difference_type
 134 utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
 135 {
 136     difference_type dist = 0;
 137     for (Iterator first (last.m_i); first < m_i; ++dist)
 138         first = advance (first, Utf8SequenceBytes (*first));
 139     return (dist);
 140 }
 141
 142 //----------------------------------------------------------------------
 143
 144 /// \class utf8out_iterator utf8.h ustl.h
 145 /// \ingroup IteratorAdaptors
 146 ///
 147 /// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
 148 ///
 149 template <typename Iterator, typename WChar = wchar_t>
 150 class utf8out_iterator {
 151 public:
 152     typedef typename iterator_traits<Iterator>::value_type      value_type;
 153     typedef typename iterator_traits<Iterator>::difference_type difference_type;
 154     typedef typename iterator_traits<Iterator>::pointer         pointer;
 155     typedef typename iterator_traits<Iterator>::reference       reference;
 156 public:
 157     explicit                    utf8out_iterator (const Iterator& os) : m_i (os) {}
 158                                 utf8out_iterator (const utf8out_iterator& i) : m_i (i.m_i) {}
 159     inline const Iterator&      base (void) const { return (m_i); }
 160     /// Writes \p v into the stream.
 161     utf8out_iterator&           operator= (WChar v);
 162     inline utf8out_iterator&    operator* (void) { return (*this); }
 163     inline utf8out_iterator&    operator++ (void) { return (*this); }
 164     inline utf8out_iterator     operator++ (int) { return (*this); }
 165     inline bool                 operator== (const utf8out_iterator& i) const { return (m_i == i.m_i); }
 166     inline bool                 operator< (const utf8out_iterator& i) const { return (m_i < i.m_i); }
 167 private:
 168     Iterator                    m_i;
 169 };
 170
 171 /// Writes \p v into the stream.
 172 template <typename Iterator, typename WChar>
 173 utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
 174 {
 175     const size_t nBytes = Utf8Bytes (v);
 176     if (nBytes > 1) {
 177         // Write the bits 6 bits at a time, except for the first one,
 178         // which may be less than 6 bits.
 179         register wchar_t shift = nBytes * 6;
 180         *m_i++ = ((v >> (shift -= 6)) & 0x3F) | (0xFF << (8 - nBytes));
 181         while (shift)
 182             *m_i++ = ((v >> (shift -= 6)) & 0x3F) | 0x80;
 183     } else      // If only one byte, there is no header.
 184         *m_i++ = v;
 185     return (*this);
 186 }
 187
 188 //----------------------------------------------------------------------
 189
 190 /// Returns a UTF-8 adaptor writing to \p i. Useful in conjuction with back_insert_iterator.
 191 template <typename Iterator>
 192 inline utf8out_iterator<Iterator> utf8out (Iterator i)
 193 {
 194     return (utf8out_iterator<Iterator> (i));
 195 }
 196
 197 /// Returns a UTF-8 adaptor reading from \p i.
 198 template <typename Iterator>
 199 inline utf8in_iterator<Iterator> utf8in (Iterator i)
 200 {
 201     return (utf8in_iterator<Iterator> (i));
 202 }
 203
 204 //----------------------------------------------------------------------
 205
 206 } // namespace ustl
 207
 208 #endif
 209