libbase/utf8.cpp

   1 // utf8.cpp: utilities for converting to and from UTF-8
   2 //
   3 //   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
   4 //
   5 // This program is free software; you can redistribute it and/or modify
   6 // it under the terms of the GNU General Public License as published by
   7 // the Free Software Foundation; either version 3 of the License, or
   8 // (at your option) any later version.
   9 //
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14 //
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software
  17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18 //
  19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
  20 //
  21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
  22
  23
  24 #include "utf8.h"
  25
// This isn't actually an invalid character; it is U+FFFD, the Unicode
// REPLACEMENT CHARACTER, substituted here for input that cannot be decoded.
  28 #define INVALID_CHAR 0x0FFFD
  29
  30 std::wstring
  31 utf8::decodeCanonicalString(const std::string& str, int version)
  32 {
  33
  34     std::wstring wstr;
  35
  36     std::string::const_iterator it = str.begin(), e = str.end();
  37
  38     if (version > 5)
  39     {
  40         while (boost::uint32_t code = decodeNextUnicodeCharacter(it, e))
  41         {
  42             if (code == utf8::invalid)
  43             {
  44                 wstr.push_back(static_cast<wchar_t>(INVALID_CHAR));
  45                 continue;
  46             }
  47             wstr.push_back(static_cast<wchar_t>(code));
  48         }
  49     }
  50     else
  51     {
  52         while (it != str.end())
  53         {
  54             // This mangles UTF-8 (UCS4) strings, but is what is
  55             // wanted for SWF5.
  56             wstr.push_back(static_cast<unsigned char>(*it++));
  57         }
  58     }
  59
  60     return wstr;
  61
  62 }
  63
  64 std::string
  65 utf8::encodeCanonicalString(const std::wstring& wstr, int version)
  66 {
  67
  68     std::string str;
  69
  70     std::wstring::const_iterator it = wstr.begin();
  71     while ( it != wstr.end())
  72     {
  73         if (version > 5) str.append(encodeUnicodeCharacter(*it++));
  74         else str.append(encodeLatin1Character(*it++));
  75     }
  76
  77     return str;
  78
  79 }
  80
  81 std::string
  82 utf8::encodeLatin1Character(boost::uint32_t ucsCharacter)
  83 {
  84     std::string text;
  85     text.push_back(static_cast<unsigned char>(ucsCharacter));
  86     return text;
  87 }
  88
  89
  90 boost::uint32_t
  91 utf8::decodeNextUnicodeCharacter(std::string::const_iterator& it,
  92                                  const std::string::const_iterator& e)
  93 {
  94     boost::uint32_t    uc;
  95
  96     // Security considerations:
  97     //
  98     // If we hit a zero byte, we want to return 0 without stepping
  99     // the buffer pointer past the 0.
 100     //
 101     // If we hit an "overlong sequence"; i.e. a character encoded
 102     // in a longer multibyte string than is necessary, then we
 103     // need to discard the character.  This is so attackers can't
 104     // disguise dangerous characters or character sequences --
 105     // there is only one valid encoding for each character.
 106     //
 107     // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
 108     // 0xFFFF } then we ignore them; they are not valid in UTF-8.
 109
 110 #define FIRST_BYTE(mask, shift)        \
 111     /* Post-increment iterator */ \
 112     uc = (*it++ & (mask)) << (shift);
 113
 114 #define NEXT_BYTE(shift)                        \
 115                     \
 116     if (it == e || *it == 0) return 0; /* end of buffer, do not advance */    \
 117     if ((*it & 0xC0) != 0x80) return utf8::invalid; /* standard check */    \
 118     /* Post-increment iterator: */        \
 119     uc |= (*it++ & 0x3F) << shift;
 120
 121     if (it == e || *it == 0) return 0;    // End of buffer.  Do not advance.
 122
 123     // Conventional 7-bit ASCII; return and increment iterator:
 124     if ((*it & 0x80) == 0) return static_cast<boost::uint32_t>(*it++);
 125
 126     // Multi-byte sequences
 127     if ((*it & 0xE0) == 0xC0)
 128     {
 129         // Two-byte sequence.
 130         FIRST_BYTE(0x1F, 6);
 131         NEXT_BYTE(0);
 132         if (uc < 0x80) return utf8::invalid;    // overlong
 133         return uc;
 134     }
 135     else if ((*it & 0xF0) == 0xE0)
 136     {
 137         // Three-byte sequence.
 138         FIRST_BYTE(0x0F, 12);
 139         NEXT_BYTE(6);
 140         NEXT_BYTE(0);
 141         if (uc < 0x800) return utf8::invalid;    // overlong
 142         if (uc >= 0x0D800 && uc <= 0x0DFFF) return utf8::invalid;    // not valid ISO 10646
 143         if (uc == 0x0FFFE || uc == 0x0FFFF) return utf8::invalid;    // not valid ISO 10646
 144         return uc;
 145     }
 146     else if ((*it & 0xF8) == 0xF0)
 147     {
 148         // Four-byte sequence.
 149         FIRST_BYTE(0x07, 18);
 150         NEXT_BYTE(12);
 151         NEXT_BYTE(6);
 152         NEXT_BYTE(0);
 153         if (uc < 0x010000) return utf8::invalid;    // overlong
 154         return uc;
 155     }
 156     else if ((*it & 0xFC) == 0xF8)
 157     {
 158         // Five-byte sequence.
 159         FIRST_BYTE(0x03, 24);
 160         NEXT_BYTE(18);
 161         NEXT_BYTE(12);
 162         NEXT_BYTE(6);
 163         NEXT_BYTE(0);
 164         if (uc < 0x0200000) return utf8::invalid;    // overlong
 165         return uc;
 166     }
 167     else if ((*it & 0xFE) == 0xFC)
 168     {
 169         // Six-byte sequence.
 170         FIRST_BYTE(0x01, 30);
 171         NEXT_BYTE(24);
 172         NEXT_BYTE(18);
 173         NEXT_BYTE(12);
 174         NEXT_BYTE(6);
 175         NEXT_BYTE(0);
 176         if (uc < 0x04000000) return utf8::invalid;    // overlong
 177         return uc;
 178     }
 179     else
 180     {
 181         // Invalid.
 182         it++;
 183         return utf8::invalid;
 184     }
 185 }
 186
 187 // TODO: buffer as std::string; index (iterator);
 188
 189 std::string
 190 utf8::encodeUnicodeCharacter(boost::uint32_t ucs_character)
 191 {
 192
 193     std::string text;
 194
 195     if (ucs_character <= 0x7F)
 196     {
 197         // Plain single-byte ASCII.
 198         text.push_back(ucs_character);
 199     }
 200     else if (ucs_character <= 0x7FF)
 201     {
 202         // Two bytes.
 203         text.push_back(0xC0 | (ucs_character >> 6));
 204         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 205     }
 206     else if (ucs_character <= 0xFFFF)
 207     {
 208         // Three bytes.
 209         text.push_back(0xE0 | (ucs_character >> 12));
 210         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 211         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 212     }
 213     else if (ucs_character <= 0x1FFFFF)
 214     {
 215         // Four bytes.
 216         text.push_back(0xF0 | (ucs_character >> 18));
 217         text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
 218         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 219         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 220     }
 221     else if (ucs_character <= 0x3FFFFFF)
 222     {
 223         // Five bytes.
 224         text.push_back(0xF8 | (ucs_character >> 24));
 225         text.push_back(0x80 | ((ucs_character >> 18) & 0x3F));
 226         text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
 227         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 228         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 229     }
 230     else if (ucs_character <= 0x7FFFFFFF)
 231     {
 232         // Six bytes.
 233         text.push_back(0xFC | (ucs_character >> 30));
 234         text.push_back(0x80 | ((ucs_character >> 24) & 0x3F));
 235         text.push_back(0x80 | ((ucs_character >> 18) & 0x3F));
 236         text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
 237         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 238         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 239     }
 240     else
 241     {
 242         // Invalid char; don't encode anything.
 243     }
 244
 245     return text;
 246 }
 247
 248
 249 #define ENC_DEFAULT 0
 250 #define ENC_UTF8 1
 251 #define ENC_UTF16BE 2
 252 #define ENC_UTF16LE 3
 253
 254 char*
 255 utf8::stripBOM(char* in, size_t& size, TextEncoding& encoding)
 256 {
 257     encoding = encUNSPECIFIED;
 258     if ( size > 2 )
 259     {
 260         // need *ptr to be unsigned or cast all 0xNN
 261         unsigned char* ptr = reinterpret_cast<unsigned char*>(in);
 262
 263         if ( *ptr == 0xFF && *(ptr+1) == 0xFE )
 264         {
 265             // Text is UTF-16 LE
 266             encoding = encUTF16LE;
 267             in+=2;
 268             size-=2;
 269         }
 270         else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
 271         {
 272             // Text is UTF-16 BE
 273             encoding = encUTF16BE;
 274             in+=2;
 275             size-=2;
 276         }
 277         else if (size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
 278                 *(ptr+2) == 0xBF )
 279         {
 280             // Text is UTF-8
 281             encoding = encUTF8;
 282             in+=3;
 283             size-=3;
 284         }
 285         else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 &&
 286                 *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
 287         {
 288             // Text is UTF-32 BE
 289             encoding = encUTF32BE;
 290             in+=4;
 291             size-=4;
 292         }
 293         else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE &&
 294                 *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
 295         {
 296             // Text is UTF-32 LE
 297             encoding = encUTF32LE;
 298             in+=4;
 299             size-=4;
 300         }
 301
 302         // TODO: check other kinds of boms !
 303         // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
 304     }
 305
 306     return in;
 307 }
 308
 309 const char*
 310 utf8::textEncodingName(TextEncoding enc)
 311 {
 312     switch (enc)
 313     {
 314         case encUNSPECIFIED: return "Unspecified";
 315         case encUTF8: return "UTF8";
 316         case encUTF16BE: return "UTF16BE";
 317         case encUTF16LE: return "UTF16LE";
 318         case encUTF32BE: return "UTF32BE";
 319         case encUTF32LE: return "UTF32LE";
 320         case encSCSU: return "SCSU";
 321         case encUTF7: return "UTF7";
 322         case encUTFEBCDIC: return "UTFEBCDIC";
 323         case encBOCU1: return "BOCU1";
 324         default: return "INVALID";
 325     }
 326 }
 327
 328
 329 // Local Variables:
 330 // mode: C++
 331 // c-basic-offset: 8
 332 // tab-width: 8
 333 // indent-tabs-mode: t
 334 // End: