Fix test for bug #32625
[gnash.git] / libbase / utf8.h
blob98b60d1ca274566cb2c25b8d6e3a6901a68275a8
1 // utf8.h: utilities for converting to and from UTF-8
2 //
3 // Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
21 #ifndef UTF8_H
22 #define UTF8_H
24 #include <string>
25 #include <boost/cstdint.hpp> // for C99 int types
26 #include <vector>
28 #include "dsodefs.h" // For DSOEXPORT
30 namespace gnash {
32 /// Utilities to convert between std::string and std::wstring.
34 /// Strings in Gnash are generally stored as std::strings.
35 /// We have to deal, however, with characters outside the standard
36 /// ASCII range (codes 128 and above), which can be encoded in two different ways.
37 ///
38 /// SWF6 and later use UTF-8, encoded as multibyte characters and
39 /// allowing many thousands of unique codes. Multibyte characters are
40 /// difficult to handle, as their length - used for many string
41 /// operations - is not certain without parsing the string.
42 /// Converting the string to a wstring (wide characters are generally 32 bits;
43 /// the Adobe player seems to handle only characters up to 65535, and two bytes
44 /// is the minimum size of a wchar) facilitates string operations, as
45 /// the length of the string is equal to the number of valid characters.
46 ///
47 /// SWF5 and earlier, however, used the ISO-8859 specification,
48 /// allowing the standard 128 ASCII characters plus 128 extra
49 /// characters that depend on the particular subset of ISO-8859.
50 /// Characters are 8 bits, not the ASCII standard 7. SWF5 cannot
51 /// handle multi-byte characters without special functions.
52 ///
53 /// It is important that SWF5 can distinguish between the two encodings,
54 /// so we cannot convert all strings to UTF-8.
56 /// Please note that, although this is called utf8, what the Adobe
57 /// player uses is only loosely related to real unicode, so the
58 /// encoding support here is correspondingly non-standard.
59 namespace utf8 {
61 /// Converts a std::string with multibyte characters into a std::wstring.
63 /// @return a version-dependent wstring.
64 /// @param str the canonical string to convert.
65 /// @param version the SWF version, used to decide how to decode the string.
67 /// For SWF5, UTF-8 (or any other) multibyte-encoded characters are not decoded
68 /// as a unit: they are converted char by char, which mangles the string.
69 DSOEXPORT std::wstring decodeCanonicalString(const std::string& str, int version);
71 /// Converts a std::wstring into a canonical std::string.
73 /// @return a version-dependent encoded std::string.
74 /// @param wstr the wide string to convert.
75 /// @param version the SWF version, used to decide how to encode the string.
76 ///
77 /// For SWF 5, each character is stored as a char of at least 8 bits, rather
78 /// than being converted to a canonical UTF-8 byte sequence. Gnash can then
79 /// distinguish between 8-bit characters, which it handles correctly, and
80 /// multi-byte characters, which are regarded as multiple characters by
81 /// string methods.
82 DSOEXPORT std::string encodeCanonicalString(const std::wstring& wstr, int version);
84 /// Return the next Unicode character in the UTF-8 encoded string.
86 /// Invalid UTF-8 sequences produce a U+FFFD character as output.
87 /// Advances the string iterator @a it past the character returned,
88 /// unless the returned character is '\0', in which case @a it
89 /// does not advance. @a e is presumably the end of the string being decoded (TODO confirm).
90 DSOEXPORT boost::uint32_t decodeNextUnicodeCharacter(std::string::const_iterator& it,
91 const std::string::const_iterator& e);
93 /// \brief Encodes the wide character @a ucs_character into a canonical
94 /// string, theoretically up to 6 chars in length, and returns that string.
95 DSOEXPORT std::string encodeUnicodeCharacter(boost::uint32_t ucs_character);
97 /// Encodes the wide character @a ucsCharacter into an at least 8-bit character,
99 /// returned as a std::string. Allows storage of Latin1 (ISO-8859-1) characters.
100 /// This is the format of SWF5 and below.
101 DSOEXPORT std::string encodeLatin1Character(boost::uint32_t ucsCharacter);
103 enum TextEncoding { // Text encodings reported by stripBOM() and named by textEncodingName()
104 encUNSPECIFIED, // set by stripBOM() when no BOM is found
105 encUTF8,
106 encUTF16BE,
107 encUTF16LE,
108 encUTF32BE,
109 encUTF32LE,
110 encSCSU,
111 encUTF7,
112 encUTFEBCDIC,
113 encBOCU1
116 /// Interpret (and skip) a Byte Order Mark (BOM) in the input stream
118 /// This function takes a pointer to a buffer and returns
119 /// the start of the actual data after the BOM, if there is one.
120 /// No conversion is performed and no bytes are copied: the BOM
121 /// snippet is just skipped, and its interpretation is returned
122 /// through the encoding output parameter.
124 /// See http://en.wikipedia.org/wiki/Byte-order_mark
126 /// @param in
127 /// The input buffer.
129 /// @param size
130 /// Size of the input buffer, will be decremented by the
131 /// size of the BOM, if any.
133 /// @param encoding
134 /// Output parameter, will always be set.
135 /// Set to encUNSPECIFIED if no BOM is found.
137 /// @returns
138 /// A pointer either equal to 'in' or some bytes inside it.
140 DSOEXPORT char* stripBOM(char* in, size_t& size, TextEncoding& encoding);
142 /// Return the name of the text encoding @a enc, as a const char*.
143 DSOEXPORT const char* textEncodingName(TextEncoding enc);
145 enum EncodingGuess { // Result type of guessEncoding()
146 ENCGUESS_UNICODE = 0,
147 ENCGUESS_JIS = 1,
148 ENCGUESS_OTHER = 2
151 /// Common code for guessing at the encoding of random text, between
152 /// Shift-Jis, UTF8, and other. Puts the character count in length,
153 /// and the offsets to the characters in offsets.
154 /// NOTE(review): offsets is a reference and so cannot be NULL; whether it must be pre-sized to at least s.length() is unclear - confirm against the implementation.
155 /// offsets are not accurate if the return value is ENCGUESS_OTHER.
157 /// TODO: It's doubtful if this even works, and it may not be useful at
158 /// all.
159 EncodingGuess guessEncoding(const std::string& s, int& length,
160 std::vector<int>& offsets);
163 } // namespace utf8
164 } // namespace gnash
166 #endif // UTF8_H
169 // Local Variables:
170 // mode: C++
171 // c-basic-offset: 8
172 // tab-width: 8
173 // indent-tabs-mode: t
174 // End: