libbase/utf8.cpp

   1 // utf8.cpp: utilities for converting to and from UTF-8
   2 //
   3 //   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
   4 //
   5 // This program is free software; you can redistribute it and/or modify
   6 // it under the terms of the GNU General Public License as published by
   7 // the Free Software Foundation; either version 3 of the License, or
   8 // (at your option) any later version.
   9 //
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14 //
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software
  17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18 //
  19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
  20 //
  21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
  22
  23 #include "utf8.h"
  24
  25 #include <limits>
  26 #include <boost/cstdint.hpp>
  27 #include <string>
  28 #include <vector>
  29 #include <cstdlib>
  30
  31 namespace gnash {
  32 namespace utf8 {
  33
  34 namespace {
  35     const boost::uint32_t invalid = std::numeric_limits<boost::uint32_t>::max();
  36 }
  37
  38 std::wstring
  39 decodeCanonicalString(const std::string& str, int version)
  40 {
  41
  42     std::wstring wstr;
  43
  44     std::string::const_iterator it = str.begin(), e = str.end();
  45
  46     if (version > 5) {
  47         while (boost::uint32_t code = decodeNextUnicodeCharacter(it, e)) {
  48             if (code == invalid) {
  49                 continue;
  50             }
  51             wstr.push_back(static_cast<wchar_t>(code));
  52         }
  53     }
  54     else {
  55         while (it != str.end()) {
  56             // This mangles UTF-8 (UCS4) strings, but is what is
  57             // wanted for SWF5.
  58             wstr.push_back(static_cast<unsigned char>(*it++));
  59         }
  60     }
  61
  62     return wstr;
  63
  64 }
  65
  66 std::string
  67 encodeCanonicalString(const std::wstring& wstr, int version)
  68 {
  69
  70     std::string str;
  71
  72     std::wstring::const_iterator it = wstr.begin();
  73     while ( it != wstr.end())
  74     {
  75         if (version > 5) str.append(encodeUnicodeCharacter(*it++));
  76         else str.append(encodeLatin1Character(*it++));
  77     }
  78
  79     return str;
  80
  81 }
  82
  83 std::string
  84 encodeLatin1Character(boost::uint32_t ucsCharacter)
  85 {
  86     std::string text;
  87     text.push_back(static_cast<unsigned char>(ucsCharacter));
  88     return text;
  89 }
  90
  91
  92 boost::uint32_t
  93 decodeNextUnicodeCharacter(std::string::const_iterator& it,
  94                              const std::string::const_iterator& e)
  95 {
  96     boost::uint32_t uc;
  97
  98     // Security considerations:
  99     //
 100     // If we hit a zero byte, we want to return 0 without stepping
 101     // the buffer pointer past the 0.
 102     //
 103     // If we hit an "overlong sequence"; i.e. a character encoded
 104     // in a longer multibyte string than is necessary, then we
 105     // need to discard the character.  This is so attackers can't
 106     // disguise dangerous characters or character sequences --
 107     // there is only one valid encoding for each character.
 108     //
 109     // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
 110     // 0xFFFF } then we ignore them; they are not valid in UTF-8.
 111
 112 #define FIRST_BYTE(mask, shift)        \
 113     /* Post-increment iterator */ \
 114     uc = (*it++ & (mask)) << (shift);
 115
 116 #define NEXT_BYTE(shift)                        \
 117                     \
 118     if (it == e || *it == 0) return 0; /* end of buffer, do not advance */    \
 119     if ((*it & 0xC0) != 0x80) return invalid; /* standard check */    \
 120     /* Post-increment iterator: */        \
 121     uc |= (*it++ & 0x3F) << shift;
 122
 123     if (it == e || *it == 0) return 0;    // End of buffer.  Do not advance.
 124
 125     // Conventional 7-bit ASCII; return and increment iterator:
 126     if ((*it & 0x80) == 0) return static_cast<boost::uint32_t>(*it++);
 127
 128     // Multi-byte sequences
 129     if ((*it & 0xE0) == 0xC0) {
 130         // Two-byte sequence.
 131         FIRST_BYTE(0x1F, 6);
 132         NEXT_BYTE(0);
 133         if (uc < 0x80) return invalid;    // overlong
 134         return uc;
 135     }
 136     else if ((*it & 0xF0) == 0xE0) {
 137         // Three-byte sequence.
 138         FIRST_BYTE(0x0F, 12);
 139         NEXT_BYTE(6);
 140         NEXT_BYTE(0);
 141         if (uc < 0x800) {
 142             return invalid;
 143         }
 144         return uc;
 145     }
 146     else if ((*it & 0xF8) == 0xF0) {
 147         // Four-byte sequence.
 148         FIRST_BYTE(0x07, 18);
 149         NEXT_BYTE(12);
 150         NEXT_BYTE(6);
 151         NEXT_BYTE(0);
 152         if (uc < 0x010000) return invalid;    // overlong
 153         return uc;
 154     }
 155     else {
 156         // Invalid.
 157         it++;
 158         return invalid;
 159     }
 160 }
 161
 162 // TODO: buffer as std::string; index (iterator);
 163
 164 std::string
 165 encodeUnicodeCharacter(boost::uint32_t ucs_character)
 166 {
 167
 168     std::string text;
 169
 170     if (ucs_character <= 0x7F)
 171     {
 172         // Plain single-byte ASCII.
 173         text.push_back(ucs_character);
 174     }
 175     else if (ucs_character <= 0x7FF)
 176     {
 177         // Two bytes.
 178         text.push_back(0xC0 | (ucs_character >> 6));
 179         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 180     }
 181     else if (ucs_character <= 0xFFFF) {
 182         // Three bytes.
 183         text.push_back(0xE0 | (ucs_character >> 12));
 184         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 185         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 186     }
 187     else if (ucs_character <= 0x1FFFFF) {
 188         // Four bytes.
 189         text.push_back(0xF0 | (ucs_character >> 18));
 190         text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
 191         text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
 192         text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
 193     }
 194     else {
 195         // Invalid char; don't encode anything.
 196     }
 197
 198     return text;
 199 }
 200
 201
 202 #define ENC_DEFAULT 0
 203 #define ENC_UTF8 1
 204 #define ENC_UTF16BE 2
 205 #define ENC_UTF16LE 3
 206
 207 char*
 208 stripBOM(char* in, size_t& size, TextEncoding& encoding)
 209 {
 210     encoding = encUNSPECIFIED;
 211     if ( size > 2 )
 212     {
 213         // need *ptr to be unsigned or cast all 0xNN
 214         unsigned char* ptr = reinterpret_cast<unsigned char*>(in);
 215
 216         if (*ptr == 0xFF && *(ptr+1) == 0xFE) {
 217             // Text is UTF-16 LE
 218             encoding = encUTF16LE;
 219             in+=2;
 220             size-=2;
 221         }
 222         else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
 223         {
 224             // Text is UTF-16 BE
 225             encoding = encUTF16BE;
 226             in+=2;
 227             size-=2;
 228         }
 229         else if (size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
 230                 *(ptr+2) == 0xBF )
 231         {
 232             // Text is UTF-8
 233             encoding = encUTF8;
 234             in+=3;
 235             size-=3;
 236         }
 237         else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 &&
 238                 *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
 239         {
 240             // Text is UTF-32 BE
 241             encoding = encUTF32BE;
 242             in+=4;
 243             size-=4;
 244         }
 245         else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE &&
 246                 *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
 247         {
 248             // Text is UTF-32 LE
 249             encoding = encUTF32LE;
 250             in+=4;
 251             size-=4;
 252         }
 253
 254         // TODO: check other kinds of boms !
 255         // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
 256     }
 257
 258     return in;
 259 }
 260
 261 const char*
 262 textEncodingName(TextEncoding enc)
 263 {
 264     switch (enc)
 265     {
 266         case encUNSPECIFIED: return "Unspecified";
 267         case encUTF8: return "UTF8";
 268         case encUTF16BE: return "UTF16BE";
 269         case encUTF16LE: return "UTF16LE";
 270         case encUTF32BE: return "UTF32BE";
 271         case encUTF32LE: return "UTF32LE";
 272         case encSCSU: return "SCSU";
 273         case encUTF7: return "UTF7";
 274         case encUTFEBCDIC: return "UTFEBCDIC";
 275         case encBOCU1: return "BOCU1";
 276         default: return "INVALID";
 277     }
 278 }
 279
 280 EncodingGuess
 281 guessEncoding(const std::string &str, int &length, std::vector<int>& offsets)
 282 {
 283     int width = 0; // The remaining width, not the total.
 284     bool is_sought = true;
 285
 286     std::string::const_iterator it = str.begin();
 287     const std::string::const_iterator e = str.end();
 288
 289     length = 0;
 290
 291     // First, assume it's UTF8 and try to be wrong.
 292     while (it != e && is_sought) {
 293         ++length;
 294
 295         offsets.push_back(it - str.begin()); // current position
 296
 297         // Advances the iterator to point to the next
 298         boost::uint32_t c = utf8::decodeNextUnicodeCharacter(it, e);
 299
 300         if (c == utf8::invalid) {
 301             is_sought = false;
 302             break;
 303         }
 304     }
 305
 306     offsets.push_back(it - str.begin()); // current position
 307
 308     if (it == e && is_sought) {
 309         // No characters left, so it's almost certainly UTF8.
 310         return ENCGUESS_UNICODE;
 311     }
 312
 313     it = str.begin();
 314     int index = 0;
 315     is_sought = true;
 316     width = 0;
 317     length = 0;
 318     bool was_odd = true;
 319     bool was_even = true;
 320     // Now, assume it's SHIFT_JIS and try to be wrong.
 321     while (it != e && is_sought) {
 322         int c = static_cast<int> (*it);
 323
 324         if (width) {
 325             --width;
 326             if ((c < 0x40) || ((c < 0x9F) && was_even) ||
 327                 ((c > 0x9E) && was_odd) || (c == 0x7F)) {
 328                 is_sought = false;
 329             }
 330             continue;
 331         }
 332
 333         ++length;
 334         offsets.push_back(index); // [length - 1] = index;
 335
 336         if ((c == 0x80) || (c == 0xA0) || (c >= 0xF0)) {
 337             is_sought = false;
 338             break;
 339         }
 340
 341         if (((c >= 0x81) && (c <= 0x9F)) || ((c >= 0xE0) && (c <= 0xEF))) {
 342             width = 1;
 343             was_odd = c & 0x01;
 344             was_even = !was_odd;
 345         }
 346
 347         it++;
 348         index++;
 349     }
 350     offsets.push_back(index); // [length - 1] = index;
 351
 352     if (!width && is_sought) {
 353         // No width left, so it's probably SHIFT_JIS.
 354         return ENCGUESS_JIS;
 355     }
 356
 357     // It's something else.
 358 #ifdef ANDROID
 359     length = str.size();
 360 #else
 361     length = std::mbstowcs(NULL, str.c_str(), 0);
 362 #endif
 363     if (length == -1)
 364     {
 365         length = str.length();
 366     }
 367     return ENCGUESS_OTHER;
 368 }
 369
 370
 371 } // namespace utf8
 372 } // namespace gnash
 373
 374 // Local Variables:
 375 // mode: C++
 376 // c-basic-offset: 8
 377 // tab-width: 8
 378 // indent-tabs-mode: t
 379 // End: