lib/Unicode.cc

   1 // -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*-
   2 // Unicode.cc for Blackbox - an X11 Window manager
   3 // Copyright (c) 2001 - 2003 Sean 'Shaleh' Perry <shaleh@debian.org>
   4 // Copyright (c) 1997 - 2000, 2002 - 2004
   5 //         Bradley T Hughes <bhughes at trolltech.com>
   6 //
   7 // Permission is hereby granted, free of charge, to any person obtaining a
   8 // copy of this software and associated documentation files (the "Software"),
   9 // to deal in the Software without restriction, including without limitation
  10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11 // and/or sell copies of the Software, and to permit persons to whom the
  12 // Software is furnished to do so, subject to the following conditions:
  13 //
  14 // The above copyright notice and this permission notice shall be included in
  15 // all copies or substantial portions of the Software.
  16 //
  17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23 // DEALINGS IN THE SOFTWARE.
  24
  25 #include "Unicode.hh"
  26
  27 #include <algorithm>
  28
  29 #include <errno.h>
  30 #include <iconv.h>
  31 #include <locale.h>
  32 #include <stdio.h>
  33
  34 #ifdef HAVE_CONFIG_H
  35 #  include "../config.h"
  36 #endif
  37
  38 #ifdef HAVE_NL_LANGINFO
  39 #  include <langinfo.h>
  40 #endif
  41
  42
  43 namespace bt {
  44
  // The value iconv_open(3) returns on failure, i.e. (iconv_t) -1.  Conversion
  // descriptors are compared against it to detect an unsupported conversion.
  static const iconv_t invalid = reinterpret_cast<iconv_t>(-1);
  // Name of the locale's character encoding.  It is assigned by hasUnicode()
  // and handed to iconv_open(3) as the locale side of a conversion.
  static std::string codeset;
  47
  // Returns 'c' with the order of its bytes reversed, converting a UTF-32
  // code unit between big and little endian.
  //
  // The value is processed as an unsigned int from start to finish.  Going
  // through a wchar_t temporary would swap only sizeof(wchar_t) bytes, which
  // loses half of the code unit on platforms where wchar_t is 16 bits wide.
  static unsigned int byte_swap(unsigned int c) {
    unsigned int ret = 0;
    // peel bytes off the low end of 'c' and push them onto the low end of
    // 'ret'; the first byte peeled ends up as the most significant one
    for (unsigned int x = 0; x < sizeof(unsigned int); ++x) {
      ret = (ret << 8) | (c & 0xffu);
      c >>= 8;
    }
    return ret;
  }
  57
  58   static ustring native_endian(const ustring &string) {
  59     if (string.empty())
  60       return string;
  61     if (*string.begin() == 0x0000feff) {
  62       // begins with BOM in native endian
  63       return ustring(string.begin() + 1, string.end());
  64     } else if (*string.begin() == 0xfffe0000) {
  65       // BOM is byte swapped, convert to native endian
  66       ustring ret = ustring(string.begin() + 1, string.end());
  67       ustring::iterator it = ret.begin();
  68       const ustring::iterator end = ret.end();
  69       for (; it != end; ++it)
  70         *it = byte_swap(*it);
  71       return ret;
  72     } else {
  73       return string;
  74     }
  75   }
  76
  77   static ustring add_bom(const ustring &string) {
  78     ustring ret;
  79     ret.push_back(0x0000feff);
  80     return ret + string;
  81   }
  82
  83   template <typename _Source, typename _Target>
  84   static void convert(const char *target, const char *source,
  85                       const _Source &in, _Target &out) {
  86     iconv_t cd = iconv_open(target, source);
  87     if (cd == invalid)
  88       return;
  89
  90 #ifdef HAVE_GNU_LIBICONV
  91     // GNU libiconv
  92     const char *inp = reinterpret_cast<const char *>(in.data());
  93 #else
  94     // POSIX compliant iconv(3)
  95     char *inp =
  96       reinterpret_cast<char *>
  97       (const_cast<typename _Source::value_type *>(in.data()));
  98 #endif
  99     const typename _Source::size_type in_size =
 100       in.size() * sizeof(typename _Source::value_type);
 101     typename _Source::size_type in_bytes = in_size;
 102
 103     out.resize(in_size);
 104
 105     char *outp =
 106       reinterpret_cast<char *>
 107       (const_cast<typename _Target::value_type *>(out.data()));
 108     typename _Target::size_type out_size =
 109       out.size() * sizeof(typename _Target::value_type);
 110     typename _Target::size_type out_bytes = out_size;
 111
 112     do {
 113       size_t l = iconv(cd, &inp, &in_bytes, &outp, &out_bytes);
 114
 115       if (l == (size_t) -1) {
 116         switch (errno) {
 117         case EILSEQ:
 118         case EINVAL:
 119           {
 120             const typename _Source::size_type off = in_size - in_bytes + 1;
 121 #ifdef HAVE_GNU_LIBICONV
 122             // GNU libiconv
 123             inp = reinterpret_cast<const char *>(in.data()) + off;
 124 #else
 125             // POSIX compliant iconv(3)
 126             inp =
 127               reinterpret_cast<char *>
 128               (const_cast<typename _Source::value_type *>(in.data()));
 129 #endif
 130             in_bytes = in_size - off;
 131             break;
 132           }
 133         case E2BIG:
 134           {
 135             const typename _Target::size_type off = out_size - out_bytes;
 136             out.resize(out.size() * 2);
 137             out_size = out.size() * sizeof(typename _Target::value_type);
 138
 139             outp =
 140               reinterpret_cast<char *>
 141               (const_cast<typename _Target::value_type *>(out.data())) + off;
 142             out_bytes = out_size - off;
 143             break;
 144           }
 145         default:
 146           perror("iconv");
 147           out = _Target();
 148           iconv_close(cd);
 149           return;
 150         }
 151       }
 152     } while (in_bytes != 0);
 153
 154     out.resize((out_size - out_bytes) / sizeof(typename _Target::value_type));
 155     iconv_close(cd);
 156   }
 157
 158 } // namespace bt
 159
 160 bool bt::hasUnicode() {
 161   static bool has_unicode = true;
 162   static bool done = false;
 163
 164   if (done)
 165     return has_unicode;
 166
 167   setlocale(LC_ALL, "");
 168
 169 #ifdef HAVE_NL_LANGINFO
 170   codeset = nl_langinfo(CODESET);
 171 #else
 172   std::string locale = setlocale(LC_CTYPE, 0);
 173   std::string::const_iterator it = locale.begin();
 174   const std::string::const_iterator end = locale.end();
 175   codeset = ""; // empty by default, not null
 176   for (; it != end; ++it) {
 177     if (*it == '.') {
 178       // found codeset separator
 179       ++it;
 180       codeset = std::string(it, end);
 181     }
 182   }
 183 #endif // HAVE_NL_LANGINFO
 184
 185   struct {
 186     const char *to;
 187     const char *from;
 188   } conversions[] = {
 189     { "UTF-32", codeset.c_str() },
 190     { "UTF-32", "UTF-8" },
 191     { "UTF-8", "UTF-32" },
 192     { codeset.c_str(), "UTF-32" },
 193   };
 194   static const int conversions_count = 4;
 195
 196   for (int x = 0; x < conversions_count; ++x) {
 197     iconv_t cd = iconv_open(conversions[x].to, conversions[x].from);
 198
 199     if (cd == invalid) {
 200       has_unicode = false;
 201       break;
 202     }
 203
 204     iconv_close(cd);
 205   }
 206
 207   done = true;
 208   return has_unicode;
 209 }
 210
 211 bt::ustring bt::toUnicode(const std::string &string) {
 212   bt::ustring ret;
 213   if (!hasUnicode()) {
 214     // cannot convert to Unicode, return something instead of nothing
 215     ret.resize(string.size());
 216     std::copy(string.begin(), string.end(), ret.begin());
 217     return ret;
 218   }
 219   ret.reserve(string.size());
 220   convert("UTF-32", codeset.c_str(), string, ret);
 221   return native_endian(ret);
 222 }
 223
 224 std::string bt::toLocale(const bt::ustring &string) {
 225   std::string ret;
 226   if (!hasUnicode()) {
 227     // cannot convert from Unicode, return something instead of nothing
 228     ret.resize(string.size());
 229     std::copy(string.begin(), string.end(), ret.begin());
 230     return ret;
 231   }
 232   ret.reserve(string.size());
 233   convert(codeset.c_str(), "UTF-32", add_bom(string), ret);
 234   return ret;
 235 }
 236
 237 std::string bt::toUtf8(const bt::ustring &utf32) {
 238   std::string ret;
 239   if (!hasUnicode())
 240     return ret;
 241   ret.reserve(utf32.size());
 242   convert("UTF-8", "UTF-32", add_bom(utf32), ret);
 243   return ret;
 244 }
 245
 246 bt::ustring bt::toUtf32(const std::string &utf8) {
 247   ustring ret;
 248   if (!hasUnicode())
 249     return ret;
 250   ret.reserve(utf8.size());
 251   convert("UTF-32", "UTF-8", utf8, ret);
 252   return native_endian(ret);
 253 }