reduce verbosity
[gnash.git] / libbase / utf8.cpp
blob771ce1ad438dd10ed88cf3010e83cf704a55d68e
1 // utf8.cpp: utilities for converting to and from UTF-8
2 //
3 // Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
23 #include "utf8.h"
25 #include <limits>
26 #include <boost/cstdint.hpp>
27 #include <string>
28 #include <vector>
29 #include <cstdlib>
31 namespace gnash {
32 namespace utf8 {
namespace {
    // Sentinel code point reported for malformed or rejected input.
    // It is the largest value a 32-bit unsigned integer can hold.
    const boost::uint32_t invalid =
        std::numeric_limits<boost::uint32_t>::max();
}
38 std::wstring
39 decodeCanonicalString(const std::string& str, int version)
42 std::wstring wstr;
44 std::string::const_iterator it = str.begin(), e = str.end();
46 if (version > 5) {
47 while (boost::uint32_t code = decodeNextUnicodeCharacter(it, e)) {
48 if (code == invalid) {
49 continue;
51 wstr.push_back(static_cast<wchar_t>(code));
54 else {
55 while (it != str.end()) {
56 // This mangles UTF-8 (UCS4) strings, but is what is
57 // wanted for SWF5.
58 wstr.push_back(static_cast<unsigned char>(*it++));
62 return wstr;
66 std::string
67 encodeCanonicalString(const std::wstring& wstr, int version)
70 std::string str;
72 std::wstring::const_iterator it = wstr.begin();
73 while ( it != wstr.end())
75 if (version > 5) str.append(encodeUnicodeCharacter(*it++));
76 else str.append(encodeLatin1Character(*it++));
79 return str;
// Produce a one-byte string holding the low 8 bits of ucsCharacter.
std::string
encodeLatin1Character(boost::uint32_t ucsCharacter)
{
    // Narrow to a single byte first, exactly as a push_back of an
    // unsigned char would.
    const unsigned char byte = static_cast<unsigned char>(ucsCharacter);
    const char ch = byte;
    return std::string(1, ch);
}
92 boost::uint32_t
93 decodeNextUnicodeCharacter(std::string::const_iterator& it,
94 const std::string::const_iterator& e)
96 boost::uint32_t uc;
98 // Security considerations:
100 // If we hit a zero byte, we want to return 0 without stepping
101 // the buffer pointer past the 0.
103 // If we hit an "overlong sequence"; i.e. a character encoded
104 // in a longer multibyte string than is necessary, then we
105 // need to discard the character. This is so attackers can't
106 // disguise dangerous characters or character sequences --
107 // there is only one valid encoding for each character.
109 // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
110 // 0xFFFF } then we ignore them; they are not valid in UTF-8.
112 #define FIRST_BYTE(mask, shift) \
113 /* Post-increment iterator */ \
114 uc = (*it++ & (mask)) << (shift);
116 #define NEXT_BYTE(shift) \
118 if (it == e || *it == 0) return 0; /* end of buffer, do not advance */ \
119 if ((*it & 0xC0) != 0x80) return invalid; /* standard check */ \
120 /* Post-increment iterator: */ \
121 uc |= (*it++ & 0x3F) << shift;
123 if (it == e || *it == 0) return 0; // End of buffer. Do not advance.
125 // Conventional 7-bit ASCII; return and increment iterator:
126 if ((*it & 0x80) == 0) return static_cast<boost::uint32_t>(*it++);
128 // Multi-byte sequences
129 if ((*it & 0xE0) == 0xC0) {
130 // Two-byte sequence.
131 FIRST_BYTE(0x1F, 6);
132 NEXT_BYTE(0);
133 if (uc < 0x80) return invalid; // overlong
134 return uc;
136 else if ((*it & 0xF0) == 0xE0) {
137 // Three-byte sequence.
138 FIRST_BYTE(0x0F, 12);
139 NEXT_BYTE(6);
140 NEXT_BYTE(0);
141 if (uc < 0x800) {
142 return invalid;
144 return uc;
146 else if ((*it & 0xF8) == 0xF0) {
147 // Four-byte sequence.
148 FIRST_BYTE(0x07, 18);
149 NEXT_BYTE(12);
150 NEXT_BYTE(6);
151 NEXT_BYTE(0);
152 if (uc < 0x010000) return invalid; // overlong
153 return uc;
155 else {
156 // Invalid.
157 it++;
158 return invalid;
162 // TODO: buffer as std::string; index (iterator);
// Encode a single code point as UTF-8.
//
// Values up to 0x1FFFFF produce one to four bytes; anything larger
// produces an empty string.
std::string
encodeUnicodeCharacter(boost::uint32_t ucs_character)
{
    std::string bytes;

    // Invalid char; don't encode anything.
    if (ucs_character > 0x1FFFFF) return bytes;

    // Plain single-byte ASCII.
    if (ucs_character <= 0x7F) {
        bytes.push_back(static_cast<char>(ucs_character));
        return bytes;
    }

    // Pick the lead-byte marker and the number of continuation bytes.
    boost::uint32_t marker;
    int continuation;
    if (ucs_character <= 0x7FF) {
        marker = 0xC0;
        continuation = 1;
    }
    else if (ucs_character <= 0xFFFF) {
        marker = 0xE0;
        continuation = 2;
    }
    else {
        marker = 0xF0;
        continuation = 3;
    }

    // Lead byte carries the topmost payload bits.
    bytes.push_back(static_cast<char>(
                marker | (ucs_character >> (6 * continuation))));

    // Each continuation byte carries the next six bits, high to low.
    for (int shift = 6 * (continuation - 1); shift >= 0; shift -= 6) {
        bytes.push_back(static_cast<char>(
                    0x80 | ((ucs_character >> shift) & 0x3F)));
    }

    return bytes;
}
// Numeric encoding identifiers.
// NOTE(review): nothing in the visible part of this file refers to
// these macros; confirm whether they are still needed.
#define ENC_DEFAULT 0
#define ENC_UTF8 1
#define ENC_UTF16BE 2
#define ENC_UTF16LE 3
207 char*
208 stripBOM(char* in, size_t& size, TextEncoding& encoding)
210 encoding = encUNSPECIFIED;
211 if ( size > 2 )
213 // need *ptr to be unsigned or cast all 0xNN
214 unsigned char* ptr = reinterpret_cast<unsigned char*>(in);
216 if (*ptr == 0xFF && *(ptr+1) == 0xFE) {
217 // Text is UTF-16 LE
218 encoding = encUTF16LE;
219 in+=2;
220 size-=2;
222 else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
224 // Text is UTF-16 BE
225 encoding = encUTF16BE;
226 in+=2;
227 size-=2;
229 else if (size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
230 *(ptr+2) == 0xBF )
232 // Text is UTF-8
233 encoding = encUTF8;
234 in+=3;
235 size-=3;
237 else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 &&
238 *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
240 // Text is UTF-32 BE
241 encoding = encUTF32BE;
242 in+=4;
243 size-=4;
245 else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE &&
246 *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
248 // Text is UTF-32 LE
249 encoding = encUTF32LE;
250 in+=4;
251 size-=4;
254 // TODO: check other kinds of boms !
255 // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
258 return in;
261 const char*
262 textEncodingName(TextEncoding enc)
264 switch (enc)
266 case encUNSPECIFIED: return "Unspecified";
267 case encUTF8: return "UTF8";
268 case encUTF16BE: return "UTF16BE";
269 case encUTF16LE: return "UTF16LE";
270 case encUTF32BE: return "UTF32BE";
271 case encUTF32LE: return "UTF32LE";
272 case encSCSU: return "SCSU";
273 case encUTF7: return "UTF7";
274 case encUTFEBCDIC: return "UTFEBCDIC";
275 case encBOCU1: return "BOCU1";
276 default: return "INVALID";
280 EncodingGuess
281 guessEncoding(const std::string &str, int &length, std::vector<int>& offsets)
283 int width = 0; // The remaining width, not the total.
284 bool is_sought = true;
286 std::string::const_iterator it = str.begin();
287 const std::string::const_iterator e = str.end();
289 length = 0;
291 // First, assume it's UTF8 and try to be wrong.
292 while (it != e && is_sought) {
293 ++length;
295 offsets.push_back(it - str.begin()); // current position
297 // Advances the iterator to point to the next
298 boost::uint32_t c = utf8::decodeNextUnicodeCharacter(it, e);
300 if (c == utf8::invalid) {
301 is_sought = false;
302 break;
306 offsets.push_back(it - str.begin()); // current position
308 if (it == e && is_sought) {
309 // No characters left, so it's almost certainly UTF8.
310 return ENCGUESS_UNICODE;
313 it = str.begin();
314 int index = 0;
315 is_sought = true;
316 width = 0;
317 length = 0;
318 bool was_odd = true;
319 bool was_even = true;
320 // Now, assume it's SHIFT_JIS and try to be wrong.
321 while (it != e && is_sought) {
322 int c = static_cast<int> (*it);
324 if (width) {
325 --width;
326 if ((c < 0x40) || ((c < 0x9F) && was_even) ||
327 ((c > 0x9E) && was_odd) || (c == 0x7F)) {
328 is_sought = false;
330 continue;
333 ++length;
334 offsets.push_back(index); // [length - 1] = index;
336 if ((c == 0x80) || (c == 0xA0) || (c >= 0xF0)) {
337 is_sought = false;
338 break;
341 if (((c >= 0x81) && (c <= 0x9F)) || ((c >= 0xE0) && (c <= 0xEF))) {
342 width = 1;
343 was_odd = c & 0x01;
344 was_even = !was_odd;
347 it++;
348 index++;
350 offsets.push_back(index); // [length - 1] = index;
352 if (!width && is_sought) {
353 // No width left, so it's probably SHIFT_JIS.
354 return ENCGUESS_JIS;
357 // It's something else.
358 #ifdef ANDROID
359 length = str.size();
360 #else
361 length = std::mbstowcs(NULL, str.c_str(), 0);
362 #endif
363 if (length == -1)
365 length = str.length();
367 return ENCGUESS_OTHER;
371 } // namespace utf8
372 } // namespace gnash
374 // Local Variables:
375 // mode: C++
376 // c-basic-offset: 8
377 // tab-width: 8
378 // indent-tabs-mode: t
379 // End: