change bzr to git
[gnash.git] / libbase / utf8.cpp
blob9e429ff612d594fb0a3d0fc5938f9829e56b8f9e
1 // utf8.cpp: utilities for converting to and from UTF-8
2 //
3 // Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
24 #include "utf8.h"
26 // This isn't actually an invalid character; it's a valid char that
27 // looks like an inverted question mark.
28 #define INVALID_CHAR 0x0FFFD
30 std::wstring
31 utf8::decodeCanonicalString(const std::string& str, int version)
34 std::wstring wstr;
36 std::string::const_iterator it = str.begin(), e = str.end();
38 if (version > 5)
40 while (boost::uint32_t code = decodeNextUnicodeCharacter(it, e))
42 if (code == utf8::invalid)
44 wstr.push_back(static_cast<wchar_t>(INVALID_CHAR));
45 continue;
47 wstr.push_back(static_cast<wchar_t>(code));
50 else
52 while (it != str.end())
54 // This mangles UTF-8 (UCS4) strings, but is what is
55 // wanted for SWF5.
56 wstr.push_back(static_cast<unsigned char>(*it++));
60 return wstr;
64 std::string
65 utf8::encodeCanonicalString(const std::wstring& wstr, int version)
68 std::string str;
70 std::wstring::const_iterator it = wstr.begin();
71 while ( it != wstr.end())
73 if (version > 5) str.append(encodeUnicodeCharacter(*it++));
74 else str.append(encodeLatin1Character(*it++));
77 return str;
81 std::string
82 utf8::encodeLatin1Character(boost::uint32_t ucsCharacter)
84 std::string text;
85 text.push_back(static_cast<unsigned char>(ucsCharacter));
86 return text;
90 boost::uint32_t
91 utf8::decodeNextUnicodeCharacter(std::string::const_iterator& it,
92 const std::string::const_iterator& e)
94 boost::uint32_t uc;
96 // Security considerations:
98 // If we hit a zero byte, we want to return 0 without stepping
99 // the buffer pointer past the 0.
101 // If we hit an "overlong sequence"; i.e. a character encoded
102 // in a longer multibyte string than is necessary, then we
103 // need to discard the character. This is so attackers can't
104 // disguise dangerous characters or character sequences --
105 // there is only one valid encoding for each character.
107 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
108 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
110 #define FIRST_BYTE(mask, shift) \
111 /* Post-increment iterator */ \
112 uc = (*it++ & (mask)) << (shift);
114 #define NEXT_BYTE(shift) \
116 if (it == e || *it == 0) return 0; /* end of buffer, do not advance */ \
117 if ((*it & 0xC0) != 0x80) return utf8::invalid; /* standard check */ \
118 /* Post-increment iterator: */ \
119 uc |= (*it++ & 0x3F) << shift;
121 if (it == e || *it == 0) return 0; // End of buffer. Do not advance.
123 // Conventional 7-bit ASCII; return and increment iterator:
124 if ((*it & 0x80) == 0) return static_cast<boost::uint32_t>(*it++);
126 // Multi-byte sequences
127 if ((*it & 0xE0) == 0xC0)
129 // Two-byte sequence.
130 FIRST_BYTE(0x1F, 6);
131 NEXT_BYTE(0);
132 if (uc < 0x80) return utf8::invalid; // overlong
133 return uc;
135 else if ((*it & 0xF0) == 0xE0)
137 // Three-byte sequence.
138 FIRST_BYTE(0x0F, 12);
139 NEXT_BYTE(6);
140 NEXT_BYTE(0);
141 if (uc < 0x800) return utf8::invalid; // overlong
142 if (uc >= 0x0D800 && uc <= 0x0DFFF) return utf8::invalid; // not valid ISO 10646
143 if (uc == 0x0FFFE || uc == 0x0FFFF) return utf8::invalid; // not valid ISO 10646
144 return uc;
146 else if ((*it & 0xF8) == 0xF0)
148 // Four-byte sequence.
149 FIRST_BYTE(0x07, 18);
150 NEXT_BYTE(12);
151 NEXT_BYTE(6);
152 NEXT_BYTE(0);
153 if (uc < 0x010000) return utf8::invalid; // overlong
154 return uc;
156 else if ((*it & 0xFC) == 0xF8)
158 // Five-byte sequence.
159 FIRST_BYTE(0x03, 24);
160 NEXT_BYTE(18);
161 NEXT_BYTE(12);
162 NEXT_BYTE(6);
163 NEXT_BYTE(0);
164 if (uc < 0x0200000) return utf8::invalid; // overlong
165 return uc;
167 else if ((*it & 0xFE) == 0xFC)
169 // Six-byte sequence.
170 FIRST_BYTE(0x01, 30);
171 NEXT_BYTE(24);
172 NEXT_BYTE(18);
173 NEXT_BYTE(12);
174 NEXT_BYTE(6);
175 NEXT_BYTE(0);
176 if (uc < 0x04000000) return utf8::invalid; // overlong
177 return uc;
179 else
181 // Invalid.
182 it++;
183 return utf8::invalid;
187 // TODO: buffer as std::string; index (iterator);
189 std::string
190 utf8::encodeUnicodeCharacter(boost::uint32_t ucs_character)
193 std::string text;
195 if (ucs_character <= 0x7F)
197 // Plain single-byte ASCII.
198 text.push_back(ucs_character);
200 else if (ucs_character <= 0x7FF)
202 // Two bytes.
203 text.push_back(0xC0 | (ucs_character >> 6));
204 text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
206 else if (ucs_character <= 0xFFFF)
208 // Three bytes.
209 text.push_back(0xE0 | (ucs_character >> 12));
210 text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
211 text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
213 else if (ucs_character <= 0x1FFFFF)
215 // Four bytes.
216 text.push_back(0xF0 | (ucs_character >> 18));
217 text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
218 text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
219 text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
221 else if (ucs_character <= 0x3FFFFFF)
223 // Five bytes.
224 text.push_back(0xF8 | (ucs_character >> 24));
225 text.push_back(0x80 | ((ucs_character >> 18) & 0x3F));
226 text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
227 text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
228 text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
230 else if (ucs_character <= 0x7FFFFFFF)
232 // Six bytes.
233 text.push_back(0xFC | (ucs_character >> 30));
234 text.push_back(0x80 | ((ucs_character >> 24) & 0x3F));
235 text.push_back(0x80 | ((ucs_character >> 18) & 0x3F));
236 text.push_back(0x80 | ((ucs_character >> 12) & 0x3F));
237 text.push_back(0x80 | ((ucs_character >> 6) & 0x3F));
238 text.push_back(0x80 | ((ucs_character >> 0) & 0x3F));
240 else
242 // Invalid char; don't encode anything.
245 return text;
249 #define ENC_DEFAULT 0
250 #define ENC_UTF8 1
251 #define ENC_UTF16BE 2
252 #define ENC_UTF16LE 3
254 char*
255 utf8::stripBOM(char* in, size_t& size, TextEncoding& encoding)
257 encoding = encUNSPECIFIED;
258 if ( size > 2 )
260 // need *ptr to be unsigned or cast all 0xNN
261 unsigned char* ptr = reinterpret_cast<unsigned char*>(in);
263 if ( *ptr == 0xFF && *(ptr+1) == 0xFE )
265 // Text is UTF-16 LE
266 encoding = encUTF16LE;
267 in+=2;
268 size-=2;
270 else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
272 // Text is UTF-16 BE
273 encoding = encUTF16BE;
274 in+=2;
275 size-=2;
277 else if (size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
278 *(ptr+2) == 0xBF )
280 // Text is UTF-8
281 encoding = encUTF8;
282 in+=3;
283 size-=3;
285 else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 &&
286 *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
288 // Text is UTF-32 BE
289 encoding = encUTF32BE;
290 in+=4;
291 size-=4;
293 else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE &&
294 *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
296 // Text is UTF-32 LE
297 encoding = encUTF32LE;
298 in+=4;
299 size-=4;
302 // TODO: check other kinds of boms !
303 // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
306 return in;
309 const char*
310 utf8::textEncodingName(TextEncoding enc)
312 switch (enc)
314 case encUNSPECIFIED: return "Unspecified";
315 case encUTF8: return "UTF8";
316 case encUTF16BE: return "UTF16BE";
317 case encUTF16LE: return "UTF16LE";
318 case encUTF32BE: return "UTF32BE";
319 case encUTF32LE: return "UTF32LE";
320 case encSCSU: return "SCSU";
321 case encUTF7: return "UTF7";
322 case encUTFEBCDIC: return "UTFEBCDIC";
323 case encBOCU1: return "BOCU1";
324 default: return "INVALID";
329 // Local Variables:
330 // mode: C++
331 // c-basic-offset: 8
332 // tab-width: 8
333 // indent-tabs-mode: t
334 // End: