1 /* Copyright (c) 2010 Wildfire Games
3 * Permission is hereby granted, free of charge, to any person obtaining
4 * a copy of this software and associated documentation files (the
5 * "Software"), to deal in the Software without restriction, including
6 * without limitation the rights to use, copy, modify, merge, publish,
7 * distribute, sublicense, and/or sell copies of the Software, and to
8 * permit persons to whom the Software is furnished to do so, subject to
9 * the following conditions:
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 #include "precompiled.h"
26 static const StatusDefinition utf8StatusDefinitions
[] = {
27 { ERR::UTF8_SURROGATE
, L
"UTF-16 surrogate pairs aren't supported" },
28 { ERR::UTF8_OUTSIDE_BMP
, L
"Code point outside BMP (> 0x10000)" },
29 { ERR::UTF8_NONCHARACTER
, L
"Noncharacter (e.g. WEOF)" },
30 { ERR::UTF8_INVALID_UTF8
, L
"Invalid UTF-8 sequence" }
32 STATUS_ADD_DEFINITIONS(utf8StatusDefinitions
);
35 // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
36 // which bears the following notice:
38 * Copyright 2001-2004 Unicode, Inc.
42 * This source code is provided as is by Unicode, Inc. No claims are
43 * made as to fitness for any particular purpose. No warranties of any
44 * kind are expressed or implied. The recipient agrees to determine
45 * applicability of information provided. If this file has been
46 * purchased on magnetic or optical media from Unicode, Inc., the
47 * sole remedy for any claim will be exchange of defective media
48 * within 90 days of receipt.
50 * Limitations on Rights to Redistribute This Code
52 * Unicode, Inc. hereby grants the right to freely use the information
53 * supplied in this file in the creation of products supporting the
54 * Unicode Standard, and to make copies of this file in any form
55 * for internal or external distribution as long as this notice
60 // - to cope with wchar_t differences between VC (UTF-16) and
61 // GCC (UCS-4), we only allow codepoints in the BMP.
62 // encoded UTF-8 sequences are therefore no longer than 3 bytes.
63 // - surrogates are disabled because variable-length strings
64 // violate the purpose of using wchar_t instead of UTF-8.
65 // - replacing disallowed characters instead of aborting outright
66 // avoids overly inconveniencing users and eases debugging.
68 // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
70 // (must be unsigned to avoid sign extension)
// Reports a conversion problem and supplies the substitute code point.
// @param err status code identifying the problem that was detected.
// @param perr optional out-parameter; a non-null value means the caller wants
//   a return code instead of a warning dialog. An error that is already
//   stored there (anything other than INFO::OK) takes precedence.
// @return 0xFFFD, the replacement character.
// NOTE(review): the statements guarded by the two checks below, and whatever
//   happens when perr is null, are not shown here - confirm against the
//   complete file.
75 // called from ReplaceIfInvalid and UTF8Codec::Decode
76 static UTF32
RaiseError(Status err
, Status
* perr
)
78 if(perr
) // caller wants return code, not warning dialog
80 if(*perr
== INFO::OK
) // only return the first error (see header)
86 return 0xFFFDul
; // replacement character
// Screens a code point and substitutes the result of RaiseError for values
// this module refuses to handle.
// @param u code point to examine.
// @param err passed through unchanged to RaiseError.
// @return the value of RaiseError for rejected input.
// Rejections visible here:
// - 0xD800..0xDFFF           -> ERR::UTF8_SURROGATE
// - outside the BMP          -> ERR::UTF8_OUTSIDE_BMP
// - 0xFFFE, 0xFFFF, 0xFDD0..0xFDEF -> ERR::UTF8_NONCHARACTER
// NOTE(review): the condition guarding the outside-BMP return and the return
//   for accepted input are not shown here; presumably u is returned
//   unchanged - confirm.
90 static UTF32
ReplaceIfInvalid(UTF32 u
, Status
* err
)
92 // disallow surrogates
93 if(0xD800ul
<= u
&& u
<= 0xDFFFul
)
94 return RaiseError(ERR::UTF8_SURROGATE
, err
);
95 // outside BMP (UTF-16 representation would require surrogates)
97 return RaiseError(ERR::UTF8_OUTSIDE_BMP
, err
);
98 // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
99 if(u
== 0xFFFEul
|| u
== 0xFFFFul
|| (0xFDD0ul
<= u
&& u
<= 0xFDEFul
))
100 return RaiseError(ERR::UTF8_NONCHARACTER
, err
);
// Writes the UTF-8 encoding of u to the output position.
// @param u code point to encode.
// @param dstPos reference to the caller's write pointer; it is advanced by
//   one for every byte stored, so the caller sees the new position.
// The visible statements cover a two-byte form (lead byte 0xC0 | u>>6) and a
// three-byte form (lead byte 0xE0 | u>>12).
// `(x | 0x80) & 0xBF` sets bit 7 and clears bit 6, i.e. it produces a byte of
// the form 10xxxxxx carrying the low six bits of x.
// NOTE(review): the logic that selects between the forms (and any one-byte
//   case) is not shown here.
108 static void Encode(UTF32 u
, UTF8
*& dstPos
)
// two-byte form: lead byte, then one 10xxxxxx byte
116 *dstPos
++ = UTF8((u
>> 6) | 0xC0);
117 *dstPos
++ = UTF8((u
| 0x80u
) & 0xBFu
);
// three-byte form: lead byte, then two 10xxxxxx bytes
120 *dstPos
++ = UTF8((u
>> 12) | 0xE0);
121 *dstPos
++ = UTF8(((u
>> 6) | 0x80u
) & 0xBFu
);
122 *dstPos
++ = UTF8((u
| 0x80u
) & 0xBFu
);
// Reads one UTF-8 sequence starting at srcPos.
// @param srcPos reference to the caller's read pointer; advanced past the
//   bytes consumed (by exactly one byte when the sequence is rejected).
//   It is dereferenced before any bounds check, so the caller must
//   guarantee srcPos < srcEnd.
// @param srcEnd one past the last readable byte.
// @param err passed through to RaiseError when the sequence is rejected.
// NOTE(review): the declaration of the accumulator `u`, any statements inside
//   the loop other than the addition, the use of `offsets` and the final
//   return are not shown here - confirm against the complete file.
127 // @return decoded scalar, or replacementCharacter on error
128 static UTF32
Decode(const UTF8
*& srcPos
, const UTF8
* const srcEnd
, Status
* err
)
130 const size_t size
= SizeFromFirstByte(*srcPos
);
131 if(!IsValid(srcPos
, size
, srcEnd
))
133 srcPos
+= 1; // only skip the offending byte (increases chances of resynchronization)
134 return RaiseError(ERR::UTF8_INVALID_UTF8
, err
);
// accumulate all bytes except the last ...
138 for(size_t i
= 0; i
< size
-1; i
++)
140 u
+= UTF32(*srcPos
++);
// ... then the last byte of the sequence
143 u
+= UTF32(*srcPos
++);
// five entries; presumably indexed by sequence length (1..4) with slot 0
// unused - the lookup itself is not shown here, TODO confirm
145 static const UTF32 offsets
[1+4] = { 0, 0x00000000ul
, 0x00003080ul
, 0x000E2080ul
, 0x03C82080UL
};
// Maps a code point to a size_t.
// NOTE(review): only the signature and one comment are shown here;
//   presumably the result is the number of UTF-8 bytes needed to encode u
//   (utf8_from_wstring cites this function when budgeting 3 bytes per
//   character) - confirm against the complete file.
// @param u code point; per the comment below, callers are expected to have
//   filtered it through ReplaceIfInvalid first.
151 static inline size_t Size(UTF32 u
)
157 // ReplaceIfInvalid ensures > 3 byte encodings are never used.
// Maps the first byte of a UTF-8 sequence to a size_t; Decode passes the
// result to IsValid and uses it as the number of bytes to consume.
// NOTE(review): only the signature and one comment are shown here; the
//   thresholds used to classify firstByte are not visible.
// @param firstByte first byte of the sequence being examined.
161 static inline size_t SizeFromFirstByte(UTF8 firstByte
)
169 // IsValid rejects firstByte values that would cause > 4 byte encodings.
// Examines the `size` bytes starting at src for well-formedness as a single
// UTF-8 sequence.
// @param src first byte of the candidate sequence.
// @param srcEnd one past the last readable byte; used for the bounds check.
// @return bool
// NOTE(review): the statement executed after each check is not shown here;
//   presumably every failing check yields false - confirm.
173 // c.f. Unicode 3.1 Table 3-7
174 // @param size obtained via SizeFromFirstByte (our caller also uses it)
175 static bool IsValid(const UTF8
* const src
, size_t size
, const UTF8
* const srcEnd
)
177 if(src
+size
> srcEnd
) // not enough data
// lead byte of a multi-byte sequence must lie in 0xC2..0xF4
182 if(!(0xC2 <= src
[0] && src
[0] <= 0xF4))
185 // special cases (stricter than the loop)
// 0xE0 followed by < 0xA0: would be an overlong three-byte encoding
186 if(src
[0] == 0xE0 && src
[1] < 0xA0)
// 0xED followed by > 0x9F: would encode the surrogate range 0xD800..0xDFFF
188 if(src
[0] == 0xED && src
[1] > 0x9F)
// 0xF0 followed by < 0x90: would be an overlong four-byte encoding
190 if(src
[0] == 0xF0 && src
[1] < 0x90)
// 0xF4 followed by > 0x8F: would exceed U+10FFFF
192 if(src
[0] == 0xF4 && src
[1] > 0x8F)
// every byte after the first must be of the form 10xxxxxx (0x80..0xBF)
195 for(size_t i
= 1; i
< size
; i
++)
197 if(!(0x80 <= src
[i
] && src
[i
] <= 0xBF))
206 //-----------------------------------------------------------------------------
// Converts a wide string to UTF-8.
// Each element of src is first passed through ReplaceIfInvalid and the result
// is encoded with UTF8Codec::Encode.
// @param src wide string to convert.
// @param err forwarded to ReplaceIfInvalid for every element.
// @return std::string
// NOTE(review): anything executed before the buffer is allocated, and the
//   return statement, are not shown here.
208 std::string
utf8_from_wstring(const std::wstring
& src
, Status
* err
)
// worst-case buffer: three bytes per input element; trimmed after the loop
213 std::string
dst(src
.size()*3+1, ' '); // see UTF8Codec::Size; +1 ensures &dst[0] is valid
214 UTF8
* dstPos
= (UTF8
*)&dst
[0];
215 for(size_t i
= 0; i
< src
.size(); i
++)
217 const UTF32 u
= ReplaceIfInvalid(UTF32(src
[i
]), err
);
218 UTF8Codec::Encode(u
, dstPos
);
// shrink to the number of bytes Encode actually wrote
220 dst
.resize(dstPos
- (UTF8
*)&dst
[0]);
// Converts a UTF-8 byte string to a wide string.
// Sequences are read one at a time with UTF8Codec::Decode; every decoded
// value is passed through ReplaceIfInvalid before being appended.
// @param src UTF-8 encoded input.
// @param err forwarded to both Decode and ReplaceIfInvalid.
// @return std::wstring
// NOTE(review): the declaration of `dst`, anything executed before it, and
//   the return statement are not shown here.
225 std::wstring
wstring_from_utf8(const std::string
& src
, Status
* err
)
// one output element per Decode call; src.size() is an upper bound as long
// as every call consumes at least one byte
231 dst
.reserve(src
.size());
232 const UTF8
* srcPos
= (const UTF8
*)src
.data();
233 const UTF8
* const srcEnd
= srcPos
+ src
.size();
// Decode advances srcPos itself (it takes the pointer by reference)
234 while(srcPos
< srcEnd
)
236 const UTF32 u
= UTF8Codec::Decode(srcPos
, srcEnd
, err
);
237 dst
.push_back((wchar_t)ReplaceIfInvalid(u
, err
));