remove some todo items (either done or will never be done)
[blackbox.git] / lib / Unicode.cc
blob35be2c9ff1f5e8855859bd14b9a90326f433bc60
1 // -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*-
2 // Unicode.cc for Blackbox - an X11 Window manager
3 // Copyright (c) 2001 - 2003 Sean 'Shaleh' Perry <shaleh@debian.org>
4 // Copyright (c) 1997 - 2000, 2002 - 2004
5 // Bradley T Hughes <bhughes at trolltech.com>
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a
8 // copy of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the
12 // Software is furnished to do so, subject to the following conditions:
14 // The above copyright notice and this permission notice shall be included in
15 // all copies or substantial portions of the Software.
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 // DEALINGS IN THE SOFTWARE.
25 #include "Unicode.hh"
27 #include <algorithm>
29 #include <errno.h>
30 #include <iconv.h>
31 #include <locale.h>
32 #include <stdio.h>
34 #ifdef HAVE_CONFIG_H
35 # include "../config.h"
36 #endif
38 #ifdef HAVE_NL_LANGINFO
39 # include <langinfo.h>
40 #endif
43 namespace bt {
45 static const iconv_t invalid = reinterpret_cast<iconv_t>(-1); // sentinel that iconv_open() results are compared against to detect failure
46 static std::string codeset; // encoding name of the current locale; assigned by hasUnicode() and handed to iconv_open()
// Reverses the byte order of a 32-bit code unit (0xAABBCCDD -> 0xDDCCBBAA).
//
// The swap is done with explicit shifts and masks on the unsigned int itself.
// Copying sizeof(wchar_t) bytes through a wchar_t temporary is only correct
// when wchar_t is exactly as wide as unsigned int, which does not hold on
// platforms with a 16-bit wchar_t.
//
// Only the low 32 bits of `c` take part in the swap.
static unsigned int byte_swap(unsigned int c) {
  return ((c & 0x000000ffu) << 24) |
         ((c & 0x0000ff00u) << 8) |
         ((c & 0x00ff0000u) >> 8) |
         ((c & 0xff000000u) >> 24);
}
58 static ustring native_endian(const ustring &string) {
59 if (string.empty())
60 return string;
61 if (*string.begin() == 0x0000feff) {
62 // begins with BOM in native endian
63 return ustring(string.begin() + 1, string.end());
64 } else if (*string.begin() == 0xfffe0000) {
65 // BOM is byte swapped, convert to native endian
66 ustring ret = ustring(string.begin() + 1, string.end());
67 ustring::iterator it = ret.begin();
68 const ustring::iterator end = ret.end();
69 for (; it != end; ++it)
70 *it = byte_swap(*it);
71 return ret;
72 } else {
73 return string;
77 static ustring add_bom(const ustring &string) {
78 ustring ret;
79 ret.push_back(0x0000feff);
80 return ret + string;
83 template <typename _Source, typename _Target>
84 static void convert(const char *target, const char *source,
85 const _Source &in, _Target &out) {
86 iconv_t cd = iconv_open(target, source);
87 if (cd == invalid)
88 return;
90 #ifdef HAVE_GNU_LIBICONV
91 // GNU libiconv
92 const char *inp = reinterpret_cast<const char *>(in.data());
93 #else
94 // POSIX compliant iconv(3)
95 char *inp =
96 reinterpret_cast<char *>
97 (const_cast<typename _Source::value_type *>(in.data()));
98 #endif
99 const typename _Source::size_type in_size =
100 in.size() * sizeof(typename _Source::value_type);
101 typename _Source::size_type in_bytes = in_size;
103 out.resize(in_size);
105 char *outp =
106 reinterpret_cast<char *>
107 (const_cast<typename _Target::value_type *>(out.data()));
108 typename _Target::size_type out_size =
109 out.size() * sizeof(typename _Target::value_type);
110 typename _Target::size_type out_bytes = out_size;
112 do {
113 size_t l = iconv(cd, &inp, &in_bytes, &outp, &out_bytes);
115 if (l == (size_t) -1) {
116 switch (errno) {
117 case EILSEQ:
118 case EINVAL:
120 const typename _Source::size_type off = in_size - in_bytes + 1;
121 #ifdef HAVE_GNU_LIBICONV
122 // GNU libiconv
123 inp = reinterpret_cast<const char *>(in.data()) + off;
124 #else
125 // POSIX compliant iconv(3)
126 inp =
127 reinterpret_cast<char *>
128 (const_cast<typename _Source::value_type *>(in.data()));
129 #endif
130 in_bytes = in_size - off;
131 break;
133 case E2BIG:
135 const typename _Target::size_type off = out_size - out_bytes;
136 out.resize(out.size() * 2);
137 out_size = out.size() * sizeof(typename _Target::value_type);
139 outp =
140 reinterpret_cast<char *>
141 (const_cast<typename _Target::value_type *>(out.data())) + off;
142 out_bytes = out_size - off;
143 break;
145 default:
146 perror("iconv");
147 out = _Target();
148 iconv_close(cd);
149 return;
152 } while (in_bytes != 0);
154 out.resize((out_size - out_bytes) / sizeof(typename _Target::value_type));
155 iconv_close(cd);
158 } // namespace bt
160 bool bt::hasUnicode() {
161 static bool has_unicode = true;
162 static bool done = false;
164 if (done)
165 return has_unicode;
167 setlocale(LC_ALL, "");
169 #ifdef HAVE_NL_LANGINFO
170 codeset = nl_langinfo(CODESET);
171 #else
172 std::string locale = setlocale(LC_CTYPE, 0);
173 std::string::const_iterator it = locale.begin();
174 const std::string::const_iterator end = locale.end();
175 codeset = ""; // empty by default, not null
176 for (; it != end; ++it) {
177 if (*it == '.') {
178 // found codeset separator
179 ++it;
180 codeset = std::string(it, end);
183 #endif // HAVE_NL_LANGINFO
185 struct {
186 const char *to;
187 const char *from;
188 } conversions[] = {
189 { "UTF-32", codeset.c_str() },
190 { "UTF-32", "UTF-8" },
191 { "UTF-8", "UTF-32" },
192 { codeset.c_str(), "UTF-32" },
194 static const int conversions_count = 4;
196 for (int x = 0; x < conversions_count; ++x) {
197 iconv_t cd = iconv_open(conversions[x].to, conversions[x].from);
199 if (cd == invalid) {
200 has_unicode = false;
201 break;
204 iconv_close(cd);
207 done = true;
208 return has_unicode;
211 bt::ustring bt::toUnicode(const std::string &string) {
212 bt::ustring ret;
213 if (!hasUnicode()) {
214 // cannot convert to Unicode, return something instead of nothing
215 ret.resize(string.size());
216 std::copy(string.begin(), string.end(), ret.begin());
217 return ret;
219 ret.reserve(string.size());
220 convert("UTF-32", codeset.c_str(), string, ret);
221 return native_endian(ret);
224 std::string bt::toLocale(const bt::ustring &string) {
225 std::string ret;
226 if (!hasUnicode()) {
227 // cannot convert from Unicode, return something instead of nothing
228 ret.resize(string.size());
229 std::copy(string.begin(), string.end(), ret.begin());
230 return ret;
232 ret.reserve(string.size());
233 convert(codeset.c_str(), "UTF-32", add_bom(string), ret);
234 return ret;
237 std::string bt::toUtf8(const bt::ustring &utf32) {
238 std::string ret;
239 if (!hasUnicode())
240 return ret;
241 ret.reserve(utf32.size());
242 convert("UTF-8", "UTF-32", add_bom(utf32), ret);
243 return ret;
246 bt::ustring bt::toUtf32(const std::string &utf8) {
247 ustring ret;
248 if (!hasUnicode())
249 return ret;
250 ret.reserve(utf8.size());
251 convert("UTF-32", "UTF-8", utf8, ret);
252 return native_endian(ret);