libbase/utf8.h

   1 // utf8.h: utilities for converting to and from UTF-8
   2 //
   3 //   Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
   4 //
   5 // This program is free software; you can redistribute it and/or modify
   6 // it under the terms of the GNU General Public License as published by
   7 // the Free Software Foundation; either version 3 of the License, or
   8 // (at your option) any later version.
   9 //
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14 //
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software
  17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18 //
  19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
  20
  21 #ifndef UTF8_H
  22 #define UTF8_H
  23
  24 #include <string>
  25 #include <boost/cstdint.hpp> // for C99 int types
  26 #include <vector>
  27
  28 #include "dsodefs.h" // For DSOEXPORT
  29
  30 namespace gnash {
  31
  32 /// Utilities to convert between std::string and std::wstring.
  33 //
  34 /// Strings in Gnash are generally stored as std::strings.
  35 /// We have to deal, however, with characters larger than standard
  36 /// ASCII (128), which can be encoded in two different ways.
  37 ///
  38 /// SWF6 and later use UTF-8, encoded as multibyte characters and
  39 /// allowing many thousands of unique codes. Multibyte characters are
  40 /// difficult to handle, as their length - used for many string
  41 /// operations - is not certain without parsing the string.
  42 /// Converting the string to a wstring (whose characters are generally
  43 /// uint32_t wide; the pp, i.e. the proprietary player, seems only to handle
  44 /// characters up to 65535, and two bytes is the minimum size of a wchar)
  45 /// facilitates string operations, as the length of the string is equal to the number of valid characters.
  46 ///
  47 /// SWF5 and earlier, however, used the ISO-8859 specification,
  48 /// allowing the standard 128 ASCII characters plus 128 extra
  49 /// characters that depend on the particular subset of ISO-8859.
  50 /// Characters are 8 bits, not the ASCII standard 7. SWF5 cannot
  51 /// handle multi-byte characters without special functions.
  52 ///
  53 /// It is important that SWF5 can distinguish between the two encodings,
  54 /// so we cannot convert all strings to UTF-8.
  55 //
  56 /// Please note that, although this is called utf8, what the Adobe
  57 /// player uses is only loosely related to real unicode, so the
  58 /// encoding support here is correspondingly non-standard.
  59 namespace utf8 {
  60
  61     /// Decodes a canonical std::string, possibly multibyte, into a std::wstring.
  62     //
  63     /// @param str the canonical string to convert.
  64     /// @param version the SWF version, used to decide how to decode the string.
  65     /// @return a version-dependent wstring.
  66     //
  67     /// For SWF5, UTF-8 (or any other) multibyte encoded characters are
  68     /// converted char by char, which mangles such strings.
  69     DSOEXPORT std::wstring decodeCanonicalString(const std::string& str, int version);
  70
  71     /// Encodes a std::wstring into a canonical std::string.
  72     //
  73     /// @param wstr the wide string to convert.
  74     /// @param version the SWF version, used to decide how to encode the string.
  75     /// @return a version-dependent encoded std::string.
  76     ///
  77     /// For SWF5, each character is stored as an 8-bit (at least) char, rather
  78     /// than being converted to a canonical UTF-8 byte sequence. Gnash can then
  79     /// distinguish between 8-bit characters, which it handles correctly, and
  80     /// multi-byte characters, which are regarded as multiple characters by
  81     /// string methods.
  82     DSOEXPORT std::string encodeCanonicalString(const std::wstring& wstr, int version);
  83
  84     /// Return the next Unicode character in the UTF-8 encoded string.
  85     //
  86     /// @param it position to decode from; advanced past the character returned,
  87     ///     unless that character is '\0', in which case the iterator does not advance.
  88     /// @param e presumably the end of the string being decoded - TODO confirm.
  89     /// @return the decoded character; invalid UTF-8 sequences produce a U+FFFD character.
  90     DSOEXPORT boost::uint32_t decodeNextUnicodeCharacter(std::string::const_iterator& it,
  91                                                      const std::string::const_iterator& e);
  92
  93     /// \brief Encodes the wide character @a ucs_character into a canonical
  94     /// string, theoretically up to 6 chars in length, and returns that string.
  95     DSOEXPORT std::string encodeUnicodeCharacter(boost::uint32_t ucs_character);
  96
  97     /// Encodes the wide character @a ucsCharacter into an at least 8-bit character, returned as a std::string.
  98     //
  99     /// Allows storage of Latin1 (ISO-8859-1) characters. This
 100     /// is the format of SWF5 and below.
 101     DSOEXPORT std::string encodeLatin1Character(boost::uint32_t ucsCharacter);
 102
 103     enum TextEncoding { // Encoding reported by stripBOM(); textEncodingName() returns its name.
 104         encUNSPECIFIED, // Reported by stripBOM() when no BOM is found.
 105         encUTF8,
 106         encUTF16BE,
 107         encUTF16LE,
 108         encUTF32BE,
 109         encUTF32LE,
 110         encSCSU,
 111         encUTF7,
 112         encUTFEBCDIC,
 113         encBOCU1
 114     };
 115
 116     /// Interpret (and skip) a Byte Order Mark in the input buffer.
 117     //
 118     /// This function takes a pointer to a buffer and returns
 119     /// the start of the actual data after a BOM, if one is present.
 120     /// No conversion is performed and no bytes are copied; the BOM is
 121     /// merely skipped and its interpretation is returned through the
 122     /// encoding output parameter.
 123     ///
 124     /// See http://en.wikipedia.org/wiki/Byte-order_mark
 125     ///
 126     /// @param in
 127     ///    The input buffer.
 128     ///
 129     /// @param size
 130     ///    Size of the input buffer (passed by reference); will be
 131     ///    decremented by the size of the BOM, if any.
 132     ///
 133     /// @param encoding
 134     ///    Output parameter, will always be set.
 135     ///    Set to encUNSPECIFIED if no BOM is found.
 136     ///
 137     /// @returns
 138     ///    A pointer either equal to 'in' or pointing some bytes inside it.
 139     ///
 140     DSOEXPORT char* stripBOM(char* in, size_t& size, TextEncoding& encoding);
 141
 142     /// Return the name of the text encoding @a enc as a const char*.
 143     DSOEXPORT const char* textEncodingName(TextEncoding enc);
 144
 145     enum EncodingGuess { // Result type of guessEncoding().
 146         ENCGUESS_UNICODE = 0, // presumably the UTF8 guess - TODO confirm
 147         ENCGUESS_JIS = 1, // presumably the Shift-Jis guess - TODO confirm
 148         ENCGUESS_OTHER = 2 // neither of the above, going by the name
 149     };
 150
 151     /// Common code for guessing at the encoding of random text, between
 152     /// Shift-Jis, UTF8, and other. Puts the character count in length, and the
 153     /// offsets to the characters in offsets (a reference, so it is never NULL).
 154     /// NOTE(review): earlier text asked for offsets to hold at least s.length() entries - confirm.
 155     /// Offsets are not accurate if the return value is ENCGUESS_OTHER.
 156     /// NOTE(review): the only function here without DSOEXPORT - confirm that is intended.
 157     /// TODO: It's doubtful if this even works, and it may not be useful at
 158     /// all.
 159     EncodingGuess guessEncoding(const std::string& s, int& length,
 160             std::vector<int>& offsets);
 161
 162
 163 } // namespace utf8
 164 } // namespace gnash
 165
 166 #endif // UTF8_H
 167
 168
 169 // Local Variables:
 170 // mode: C++
 171 // c-basic-offset: 8
 172 // tab-width: 8
 173 // indent-tabs-mode: t
 174 // End: