1 // utf8.cpp: utilities for converting to and from UTF-8
3 // Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
26 #include <boost/cstdint.hpp>
35 const boost::uint32_t invalid
= std::numeric_limits
<boost::uint32_t>::max();
// Builds a wide string (wstr) from the bytes of str.
//
// NOTE(review): several lines of this definition are not visible in this
// view (return type, braces, the declaration of wstr, and whatever use is
// made of `version`). The comments below describe only the statements that
// are visible; confirm the control flow between the two loops against the
// complete file.
39 decodeCanonicalString(const std::string
& str
, int version
)
// Walk the raw bytes of the input from its first byte (it) to its end (e).
44 std::string::const_iterator it
= str
.begin(), e
= str
.end();
// Decoding path: keep pulling code points until the decoder yields 0,
// which is what terminates this loop.
47 while (boost::uint32_t code
= decodeNextUnicodeCharacter(it
, e
)) {
// The decoder reported an undecodable sequence.
// NOTE(review): the handling inside this branch is not visible here.
48 if (code
== invalid
) {
// Store the decoded code point as a single wide character.
51 wstr
.push_back(static_cast<wchar_t>(code
));
// Byte-wise path: runs until the iterator reaches the end of str.
55 while (it
!= str
.end()) {
56 // This mangles UTF-8 (UCS4) strings, but is what is
// Each byte goes through unsigned char first, so bytes >= 0x80 become
// wide characters in the range 0x80..0xFF rather than negative values.
58 wstr
.push_back(static_cast<unsigned char>(*it
++));
67 encodeCanonicalString(const std::wstring
& wstr
, int version
)
72 std::wstring::const_iterator it
= wstr
.begin();
73 while ( it
!= wstr
.end())
75 if (version
> 5) str
.append(encodeUnicodeCharacter(*it
++));
76 else str
.append(encodeLatin1Character(*it
++));
84 encodeLatin1Character(boost::uint32_t ucsCharacter
)
87 text
.push_back(static_cast<unsigned char>(ucsCharacter
));
// Decodes a single character from UTF-8 input.
//
// it: read position, taken by reference; the visible code advances it with
//     post-increment for every byte it consumes.
// e:  end of the input.
//
// Results visible in this view: 0 at end of input or at a zero byte (the
// iterator is not stepped past that point), the byte value itself for 7-bit
// ASCII, and `invalid` for a malformed continuation byte or an overlong
// two- or four-byte encoding.
//
// NOTE(review): parts of this definition are not visible here (return type,
// declaration of uc, the macro invocations of the two-byte branch, the
// NEXT_BYTE calls, the successful returns and any final fallback branch).
93 decodeNextUnicodeCharacter(std::string::const_iterator
& it
,
94 const std::string::const_iterator
& e
)
98 // Security considerations:
100 // If we hit a zero byte, we want to return 0 without stepping
101 // the buffer pointer past the 0.
103 // If we hit an "overlong sequence"; i.e. a character encoded
104 // in a longer multibyte string than is necessary, then we
105 // need to discard the character. This is so attackers can't
106 // disguise dangerous characters or character sequences --
107 // there is only one valid encoding for each character.
109 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
110 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
// NOTE(review): no test for the 0xD800..0xDFFF or 0xFFFE/0xFFFF ranges is
// visible in this view; confirm that the code still matches this comment.

// FIRST_BYTE: consume the lead byte, keep the payload bits selected by
// `mask`, and store them in uc shifted left by `shift`.
112 #define FIRST_BYTE(mask, shift) \
113 /* Post-increment iterator */ \
114 uc = (*it++ & (mask)) << (shift);
// NEXT_BYTE: consume one continuation byte and merge its low six bits into
// uc at `shift`. It returns from the enclosing function early: 0 at end of
// input or on a zero byte, `invalid` when the byte lacks the 10xxxxxx prefix.
116 #define NEXT_BYTE(shift) \
118 if (it == e || *it == 0) return 0; /* end of buffer, do not advance */ \
119 if ((*it & 0xC0) != 0x80) return invalid; /* standard check */ \
120 /* Post-increment iterator: */ \
121 uc |= (*it++ & 0x3F) << shift;
123 if (it
== e
|| *it
== 0) return 0; // End of buffer. Do not advance.
125 // Conventional 7-bit ASCII; return and increment iterator:
126 if ((*it
& 0x80) == 0) return static_cast<boost::uint32_t>(*it
++);
128 // Multi-byte sequences
// Lead byte 110xxxxx.
129 if ((*it
& 0xE0) == 0xC0) {
130 // Two-byte sequence.
// Anything below 0x80 fits in one byte, so a two-byte form is overlong.
133 if (uc
< 0x80) return invalid
; // overlong
// Lead byte 1110xxxx.
136 else if ((*it
& 0xF0) == 0xE0) {
137 // Three-byte sequence.
// Lead byte contributes four payload bits at positions 12..15.
138 FIRST_BYTE(0x0F, 12);
// Lead byte 11110xxx.
146 else if ((*it
& 0xF8) == 0xF0) {
147 // Four-byte sequence.
// Lead byte contributes three payload bits at positions 18..20.
148 FIRST_BYTE(0x07, 18);
// Anything below 0x10000 fits in three bytes, so a four-byte form is overlong.
152 if (uc
< 0x010000) return invalid
; // overlong
162 // TODO: buffer as std::string; index (iterator);
165 encodeUnicodeCharacter(boost::uint32_t ucs_character
)
170 if (ucs_character
<= 0x7F)
172 // Plain single-byte ASCII.
173 text
.push_back(ucs_character
);
175 else if (ucs_character
<= 0x7FF)
178 text
.push_back(0xC0 | (ucs_character
>> 6));
179 text
.push_back(0x80 | ((ucs_character
>> 0) & 0x3F));
181 else if (ucs_character
<= 0xFFFF) {
183 text
.push_back(0xE0 | (ucs_character
>> 12));
184 text
.push_back(0x80 | ((ucs_character
>> 6) & 0x3F));
185 text
.push_back(0x80 | ((ucs_character
>> 0) & 0x3F));
187 else if (ucs_character
<= 0x1FFFFF) {
189 text
.push_back(0xF0 | (ucs_character
>> 18));
190 text
.push_back(0x80 | ((ucs_character
>> 12) & 0x3F));
191 text
.push_back(0x80 | ((ucs_character
>> 6) & 0x3F));
192 text
.push_back(0x80 | ((ucs_character
>> 0) & 0x3F));
195 // Invalid char; don't encode anything.
// Numeric identifiers for text encodings.
// NOTE(review): nothing visible in this view references these macros; the
// BOM and naming code in this file uses the enc* enumerators instead.
// Confirm whether they are still needed.
202 #define ENC_DEFAULT 0
204 #define ENC_UTF16BE 2
205 #define ENC_UTF16LE 3
// Inspects the leading bytes of `in` for a byte-order mark and stores the
// matching encoding in `encoding`; encUNSPECIFIED is stored first and stays
// in place when none of the visible tests match.
//
// NOTE(review): parts of this definition are not visible in this view
// (return type, braces, any outer size guard, and whatever each branch does
// to `in` and `size` besides setting `encoding`). The comments below cover
// only what is visible.
208 stripBOM(char* in
, size_t& size
, TextEncoding
& encoding
)
// Default result until a mark is recognised.
210 encoding
= encUNSPECIFIED
;
213 // need *ptr to be unsigned or cast all 0xNN
214 unsigned char* ptr
= reinterpret_cast<unsigned char*>(in
);
// FF FE: UTF-16 little-endian mark.
// NOTE(review): FF FE 00 00 (the UTF-32 LE mark tested last in this else-if
// chain) also starts with FF FE, so this test wins first and the UTF-32 LE
// branch cannot be reached as ordered.
216 if (*ptr
== 0xFF && *(ptr
+1) == 0xFE) {
218 encoding
= encUTF16LE
;
// FE FF: UTF-16 big-endian mark.
222 else if ( *ptr
== 0xFE && *(ptr
+1) == 0xFF )
225 encoding
= encUTF16BE
;
// EF BB ...: start of the UTF-8 mark; the remainder of this condition is not
// visible in this view.
229 else if (size
> 3 && *ptr
== 0xEF && *(ptr
+1) == 0xBB &&
// 00 00 FE FF: UTF-32 big-endian mark.
// NOTE(review): the test reads four bytes but requires size > 4, so a buffer
// of exactly four bytes is never matched; confirm whether >= 4 was intended.
237 else if ( size
> 4 && *ptr
== 0x00 && *(ptr
+1) == 0x00 &&
238 *(ptr
+2) == 0xFE && *(ptr
+3) == 0xFF )
241 encoding
= encUTF32BE
;
// FF FE 00 00: UTF-32 little-endian mark (see the reachability note on the
// FF FE test at the top of this chain).
245 else if ( size
> 4 && *ptr
== 0xFF && *(ptr
+1) == 0xFE &&
246 *(ptr
+2) == 0x00 && *(ptr
+3) == 0x00 )
249 encoding
= encUTF32LE
;
254 // TODO: check other kinds of boms !
255 // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
// Maps a TextEncoding value to a fixed, human-readable name given as a
// string literal. Values without a dedicated case yield "INVALID".
//
// NOTE(review): the return type and the switch header are not visible in
// this view.
262 textEncodingName(TextEncoding enc
)
266 case encUNSPECIFIED
: return "Unspecified";
267 case encUTF8
: return "UTF8";
268 case encUTF16BE
: return "UTF16BE";
269 case encUTF16LE
: return "UTF16LE";
270 case encUTF32BE
: return "UTF32BE";
271 case encUTF32LE
: return "UTF32LE";
272 case encSCSU
: return "SCSU";
273 case encUTF7
: return "UTF7";
274 case encUTFEBCDIC
: return "UTFEBCDIC";
275 case encBOCU1
: return "BOCU1";
// Fallback for any value not listed above.
276 default: return "INVALID";
// Guesses how the bytes of `str` are encoded by trying UTF-8 first, then
// SHIFT_JIS, and otherwise reporting ENCGUESS_OTHER.
//
// str:     bytes to examine.
// length:  output; in the visible code it is assigned only on the
//          "something else" path at the end.
// offsets: output; positions are appended to it while scanning.
//
// NOTE(review): large parts of this definition are not visible in this view
// (return type, braces, the declarations of `index` and `was_odd`, the
// bodies of several branches, iterator resets between the two passes and
// the SHIFT_JIS return). The comments below cover only what is visible.
281 guessEncoding(const std::string
&str
, int &length
, std::vector
<int>& offsets
)
283 int width
= 0; // The remaining width, not the total.
// Cleared elsewhere (not visible) to stop a scan once a guess is ruled out;
// every visible loop and result test checks it.
284 bool is_sought
= true;
286 std::string::const_iterator it
= str
.begin();
287 const std::string::const_iterator e
= str
.end();
291 // First, assume it's UTF8 and try to be wrong.
292 while (it
!= e
&& is_sought
) {
// Record the byte offset at which the next character starts.
295 offsets
.push_back(it
- str
.begin()); // current position
297 // Advances the iterator to point to the next
298 boost::uint32_t c
= utf8::decodeNextUnicodeCharacter(it
, e
);
// The decoder rejected the sequence.
// NOTE(review): the handling inside this branch is not visible here.
300 if (c
== utf8::invalid
) {
// Record the offset reached when the UTF-8 scan stopped.
306 offsets
.push_back(it
- str
.begin()); // current position
// The whole input was consumed and the guess was never ruled out.
308 if (it
== e
&& is_sought
) {
309 // No characters left, so it's almost certainly UTF8.
310 return ENCGUESS_UNICODE
;
319 bool was_even
= true;
320 // Now, assume it's SHIFT_JIS and try to be wrong.
321 while (it
!= e
&& is_sought
) {
// NOTE(review): *it is a plain char. Where char is signed, bytes >= 0x80
// become negative ints here, so the range tests below (c >= 0x81,
// c >= 0xE0, c >= 0xF0, ...) would not behave as written. Confirm whether
// an unsigned char conversion was intended.
322 int c
= static_cast<int> (*it
);
// Range test on the byte, combined with the was_even / was_odd flags.
// NOTE(review): the statements guarding and following this test are not
// visible here.
326 if ((c
< 0x40) || ((c
< 0x9F) && was_even
) ||
327 ((c
> 0x9E) && was_odd
) || (c
== 0x7F)) {
334 offsets
.push_back(index
); // [length - 1] = index;
// Byte values 0x80, 0xA0 and 0xF0 and above are tested here.
// NOTE(review): the body of this branch is not visible.
336 if ((c
== 0x80) || (c
== 0xA0) || (c
>= 0xF0)) {
// Bytes in 0x81..0x9F or 0xE0..0xEF are tested here.
// NOTE(review): the body of this branch is not visible; presumably these
// are lead bytes of a two-byte character — confirm.
341 if (((c
>= 0x81) && (c
<= 0x9F)) || ((c
>= 0xE0) && (c
<= 0xEF))) {
350 offsets
.push_back(index
); // [length - 1] = index;
// No pending width and the guess was never ruled out.
352 if (!width
&& is_sought
) {
353 // No width left, so it's probably SHIFT_JIS.
357 // It's something else.
// With a null destination, mbstowcs converts nothing and returns the number
// of wide characters the conversion would produce. Its size_t result is
// stored into an int here.
361 length
= std::mbstowcs(NULL
, str
.c_str(), 0);
// Alternative length: the raw byte count.
// NOTE(review): the condition selecting this assignment is not visible.
365 length
= str
.length();
367 return ENCGUESS_OTHER
;
378 // indent-tabs-mode: t