source/lib/utf8.cpp

   1 /* Copyright (c) 2010 Wildfire Games
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining
   4  * a copy of this software and associated documentation files (the
   5  * "Software"), to deal in the Software without restriction, including
   6  * without limitation the rights to use, copy, modify, merge, publish,
   7  * distribute, sublicense, and/or sell copies of the Software, and to
   8  * permit persons to whom the Software is furnished to do so, subject to
   9  * the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included
  12  * in all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "precompiled.h"
  24 #include "lib/utf8.h"
  25
  26 static const StatusDefinition utf8StatusDefinitions[] = {
  27         { ERR::UTF8_SURROGATE, L"UTF-16 surrogate pairs aren't supported" },
  28         { ERR::UTF8_OUTSIDE_BMP, L"Code point outside BMP (> 0x10000)" },
  29         { ERR::UTF8_NONCHARACTER, L"Noncharacter (e.g. WEOF)" },
  30         { ERR::UTF8_INVALID_UTF8, L"Invalid UTF-8 sequence" }
  31 };
  32 STATUS_ADD_DEFINITIONS(utf8StatusDefinitions);
  33
  34
  35 // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
  36 // which bears the following notice:
  37 /*
  38 * Copyright 2001-2004 Unicode, Inc.
  39 *
  40 * Disclaimer
  41 *
  42 * This source code is provided as is by Unicode, Inc. No claims are
  43 * made as to fitness for any particular purpose. No warranties of any
  44 * kind are expressed or implied. The recipient agrees to determine
  45 * applicability of information provided. If this file has been
  46 * purchased on magnetic or optical media from Unicode, Inc., the
  47 * sole remedy for any claim will be exchange of defective media
  48 * within 90 days of receipt.
  49 *
  50 * Limitations on Rights to Redistribute This Code
  51 *
  52 * Unicode, Inc. hereby grants the right to freely use the information
  53 * supplied in this file in the creation of products supporting the
  54 * Unicode Standard, and to make copies of this file in any form
  55 * for internal or external distribution as long as this notice
  56 * remains attached.
  57 */
  58
  59 // design rationale:
  60 // - to cope with wchar_t differences between VC (UTF-16) and
  61 //   GCC (UCS-4), we only allow codepoints in the BMP.
  62 //   encoded UTF-8 sequences are therefore no longer than 3 bytes.
  63 // - surrogates are disabled because variable-length strings
  64 //   violate the purpose of using wchar_t instead of UTF-8.
  65 // - replacing disallowed characters instead of aborting outright
  66 //   avoids overly inconveniencing users and eases debugging.
  67
  68 // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  69
// Integer types for a single UTF-8 code unit (byte) and for a decoded
// scalar value, respectively.
// (must be unsigned to avoid sign extension when a byte >= 0x80 is widened,
// e.g. by the UTF32(*srcPos++) conversions in UTF8Codec::Decode.
// u8/u32 are declared elsewhere - assumed to be unsigned 8/32-bit integers.)
typedef u8 UTF8;
typedef u32 UTF32;
  73
  74
  75 // called from ReplaceIfInvalid and UTF8Codec::Decode
  76 static UTF32 RaiseError(Status err, Status* perr)
  77 {
  78         if(perr)        // caller wants return code, not warning dialog
  79         {
  80                 if(*perr == INFO::OK)   // only return the first error (see header)
  81                         *perr = err;
  82         }
  83         else
  84                 DEBUG_WARN_ERR(err);
  85
  86         return 0xFFFDul;        // replacement character
  87 }
  88
  89
  90 static UTF32 ReplaceIfInvalid(UTF32 u, Status* err)
  91 {
  92         // disallow surrogates
  93         if(0xD800ul <= u && u <= 0xDFFFul)
  94                 return RaiseError(ERR::UTF8_SURROGATE, err);
  95         // outside BMP (UTF-16 representation would require surrogates)
  96         if(u > 0xFFFFul)
  97                 return RaiseError(ERR::UTF8_OUTSIDE_BMP, err);
  98         // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
  99         if(u == 0xFFFEul || u == 0xFFFFul || (0xFDD0ul <= u && u <= 0xFDEFul))
 100                 return RaiseError(ERR::UTF8_NONCHARACTER, err);
 101         return u;
 102 }
 103
 104
 105 class UTF8Codec
 106 {
 107 public:
 108         static void Encode(UTF32 u, UTF8*& dstPos)
 109         {
 110                 switch (Size(u))
 111                 {
 112                 case 1:
 113                         *dstPos++ = UTF8(u);
 114                         break;
 115                 case 2:
 116                         *dstPos++ = UTF8((u >> 6) | 0xC0);
 117                         *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
 118                         break;
 119                 case 3:
 120                         *dstPos++ = UTF8((u >> 12) | 0xE0);
 121                         *dstPos++ = UTF8(((u >> 6) | 0x80u) & 0xBFu);
 122                         *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
 123                         break;
 124                 }
 125         }
 126
 127         // @return decoded scalar, or replacementCharacter on error
 128         static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd, Status* err)
 129         {
 130                 const size_t size = SizeFromFirstByte(*srcPos);
 131                 if(!IsValid(srcPos, size, srcEnd))
 132                 {
 133                         srcPos += 1;    // only skip the offending byte (increases chances of resynchronization)
 134                         return RaiseError(ERR::UTF8_INVALID_UTF8, err);
 135                 }
 136
 137                 UTF32 u = 0;
 138                 for(size_t i = 0; i < size-1; i++)
 139                 {
 140                         u += UTF32(*srcPos++);
 141                         u <<= 6;
 142                 }
 143                 u += UTF32(*srcPos++);
 144
 145                 static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
 146                 u -= offsets[size];
 147                 return u;
 148         }
 149
 150 private:
 151         static inline size_t Size(UTF32 u)
 152         {
 153                 if(u < 0x80)
 154                         return 1;
 155                 if(u < 0x800)
 156                         return 2;
 157                 // ReplaceIfInvalid ensures > 3 byte encodings are never used.
 158                 return 3;
 159         }
 160
 161         static inline size_t SizeFromFirstByte(UTF8 firstByte)
 162         {
 163                 if(firstByte < 0xC0)
 164                         return 1;
 165                 if(firstByte < 0xE0)
 166                         return 2;
 167                 if(firstByte < 0xF0)
 168                         return 3;
 169                 // IsValid rejects firstByte values that would cause > 4 byte encodings.
 170                 return 4;
 171         }
 172
 173         // c.f. Unicode 3.1 Table 3-7
 174         // @param size obtained via SizeFromFirstByte (our caller also uses it)
 175         static bool IsValid(const UTF8* const src, size_t size, const UTF8* const srcEnd)
 176         {
 177                 if(src+size > srcEnd)   // not enough data
 178                         return false;
 179
 180                 if(src[0] < 0x80)
 181                         return true;
 182                 if(!(0xC2 <= src[0] && src[0] <= 0xF4))
 183                         return false;
 184
 185                 // special cases (stricter than the loop)
 186                 if(src[0] == 0xE0 && src[1] < 0xA0)
 187                         return false;
 188                 if(src[0] == 0xED && src[1] > 0x9F)
 189                         return false;
 190                 if(src[0] == 0xF0 && src[1] < 0x90)
 191                         return false;
 192                 if(src[0] == 0xF4 && src[1] > 0x8F)
 193                         return false;
 194
 195                 for(size_t i = 1; i < size; i++)
 196                 {
 197                         if(!(0x80 <= src[i] && src[i] <= 0xBF))
 198                                 return false;
 199                 }
 200
 201                 return true;
 202         }
 203 };
 204
 205
 206 //-----------------------------------------------------------------------------
 207
 208 std::string utf8_from_wstring(const std::wstring& src, Status* err)
 209 {
 210         if(err)
 211                 *err = INFO::OK;
 212
 213         std::string dst(src.size()*3+1, ' ');   // see UTF8Codec::Size; +1 ensures &dst[0] is valid
 214         UTF8* dstPos = (UTF8*)&dst[0];
 215         for(size_t i = 0; i < src.size(); i++)
 216         {
 217                 const UTF32 u = ReplaceIfInvalid(UTF32(src[i]), err);
 218                 UTF8Codec::Encode(u, dstPos);
 219         }
 220         dst.resize(dstPos - (UTF8*)&dst[0]);
 221         return dst;
 222 }
 223
 224
 225 std::wstring wstring_from_utf8(const std::string& src, Status* err)
 226 {
 227         if(err)
 228                 *err = INFO::OK;
 229
 230         std::wstring dst;
 231         dst.reserve(src.size());
 232         const UTF8* srcPos = (const UTF8*)src.data();
 233         const UTF8* const srcEnd = srcPos + src.size();
 234         while(srcPos < srcEnd)
 235         {
 236                 const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd, err);
 237                 dst.push_back((wchar_t)ReplaceIfInvalid(u, err));
 238         }
 239         return dst;
 240 }