Fix bug #9147 - winbind can't fetch user or group info from AD via LDAP
[Samba.git] / lib / util / charset / charcnv.c
bloba479f4442653eb4aa9907d7047d53987cd30ed04
1 /*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "includes.h"
24 #include "system/iconv.h"
26 /**
27 * @file
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
38 * @sa lib/iconv.c
41 struct smb_iconv_convenience {
42 const char *unix_charset;
43 const char *dos_charset;
44 bool native_iconv;
45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
49 /**
50 * Return the name of a charset to give to iconv().
51 **/
52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
54 switch (ch) {
55 case CH_UTF16: return "UTF-16LE";
56 case CH_UNIX: return ic->unix_charset;
57 case CH_DOS: return ic->dos_charset;
58 case CH_UTF8: return "UTF8";
59 case CH_UTF16BE: return "UTF-16BE";
60 case CH_UTF16MUNGED: return "UTF16_MUNGED";
61 default:
62 return "ASCII";
66 /**
67 re-initialize iconv conversion descriptors
68 **/
69 static int close_iconv_convenience(struct smb_iconv_convenience *data)
71 unsigned c1, c2;
72 for (c1=0;c1<NUM_CHARSETS;c1++) {
73 for (c2=0;c2<NUM_CHARSETS;c2++) {
74 if (data->conv_handles[c1][c2] != NULL) {
75 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
76 smb_iconv_close(data->conv_handles[c1][c2]);
78 data->conv_handles[c1][c2] = NULL;
83 return 0;
86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
87 const char *dos_charset,
88 const char *unix_charset,
89 bool native_iconv)
91 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
92 struct smb_iconv_convenience);
94 if (ret == NULL) {
95 return NULL;
98 talloc_set_destructor(ret, close_iconv_convenience);
100 ret->dos_charset = talloc_strdup(ret, dos_charset);
101 ret->unix_charset = talloc_strdup(ret, unix_charset);
102 ret->native_iconv = native_iconv;
104 return ret;
108 on-demand initialisation of conversion handles
110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
111 charset_t from, charset_t to)
113 const char *n1, *n2;
114 static bool initialised;
116 if (initialised == false) {
117 initialised = true;
119 #ifdef LC_ALL
120 /* we set back the locale to C to get ASCII-compatible
121 toupper/lower functions. For now we do not need
122 any other POSIX localisations anyway. When we
123 should really need localized string functions one
124 day we need to write our own ascii_tolower etc.
126 setlocale(LC_ALL, "C");
127 #endif
130 if (ic->conv_handles[from][to]) {
131 return ic->conv_handles[from][to];
134 n1 = charset_name(ic, from);
135 n2 = charset_name(ic, to);
137 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
138 ic->native_iconv);
140 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
141 if ((from == CH_DOS || to == CH_DOS) &&
142 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
143 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
144 charset_name(ic, CH_DOS)));
145 ic->dos_charset = "ASCII";
147 n1 = charset_name(ic, from);
148 n2 = charset_name(ic, to);
150 ic->conv_handles[from][to] =
151 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
155 return ic->conv_handles[from][to];
159 * Convert string from one encoding to another, making error checking etc
161 * @param mem_ctx Memory context
162 * @param cd Iconv handle
163 * @param src pointer to source string (multibyte or singlebyte)
164 * @param srclen length of the source string in bytes
165 * @param dest pointer to destination string (multibyte or singlebyte)
166 * @param destlen maximal length allowed for string
167 * @returns the number of bytes occupied in the destination
169 _PUBLIC_ ssize_t iconv_talloc(TALLOC_CTX *ctx,
170 smb_iconv_t cd,
171 void const *src, size_t srclen,
172 void *dst)
174 size_t i_len, o_len, destlen;
175 void **dest = (void **)dst;
176 size_t retval;
177 const char *inbuf = (const char *)src;
178 char *outbuf, *ob;
180 *dest = NULL;
182 /* it is _very_ rare that a conversion increases the size by
183 more than 3x */
184 destlen = srclen;
185 outbuf = NULL;
186 convert:
187 destlen = 2 + (destlen*3);
188 ob = talloc_realloc(ctx, outbuf, char, destlen);
189 if (!ob) {
190 DEBUG(0, ("iconv_talloc: realloc failed!\n"));
191 talloc_free(outbuf);
192 return (size_t)-1;
193 } else {
194 outbuf = ob;
197 /* we give iconv 2 less bytes to allow us to terminate at the
198 end */
199 i_len = srclen;
200 o_len = destlen-2;
201 retval = smb_iconv(cd,
202 &inbuf, &i_len,
203 &outbuf, &o_len);
204 if(retval == (size_t)-1) {
205 const char *reason="unknown error";
206 switch(errno) {
207 case EINVAL:
208 reason="Incomplete multibyte sequence";
209 break;
210 case E2BIG:
211 goto convert;
212 case EILSEQ:
213 reason="Illegal multibyte sequence";
214 break;
216 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
217 talloc_free(ob);
218 return (size_t)-1;
221 destlen = (destlen-2) - o_len;
223 /* guarantee null termination in all charsets */
224 SSVAL(ob, destlen, 0);
226 *dest = ob;
228 return destlen;
233 * Convert string from one encoding to another, making error checking etc
235 * @param src pointer to source string (multibyte or singlebyte)
236 * @param srclen length of the source string in bytes
237 * @param dest pointer to destination string (multibyte or singlebyte)
238 * @param destlen maximal length allowed for string
239 * @returns the number of bytes occupied in the destination
241 _PUBLIC_ bool convert_string_convenience(struct smb_iconv_convenience *ic,
242 charset_t from, charset_t to,
243 void const *src, size_t srclen,
244 void *dest, size_t destlen, size_t *converted_size,
245 bool allow_badcharcnv)
247 size_t i_len, o_len;
248 size_t retval;
249 const char* inbuf = (const char*)src;
250 char* outbuf = (char*)dest;
251 smb_iconv_t descriptor;
253 if (allow_badcharcnv) {
254 /* Not implemented yet */
255 return false;
258 if (srclen == (size_t)-1)
259 srclen = strlen(inbuf)+1;
261 descriptor = get_conv_handle(ic, from, to);
263 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
264 /* conversion not supported, use as is */
265 size_t len = MIN(srclen,destlen);
266 memcpy(dest,src,len);
267 *converted_size = len;
268 return true;
271 i_len=srclen;
272 o_len=destlen;
273 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
274 if(retval==(size_t)-1) {
275 const char *reason;
276 switch(errno) {
277 case EINVAL:
278 reason="Incomplete multibyte sequence";
279 return false;
280 case E2BIG:
281 reason="No more room";
282 if (from == CH_UNIX) {
283 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
284 charset_name(ic, from), charset_name(ic, to),
285 (int)srclen, (int)destlen,
286 (const char *)src));
287 } else {
288 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
289 charset_name(ic, from), charset_name(ic, to),
290 (int)srclen, (int)destlen));
292 return false;
293 case EILSEQ:
294 reason="Illegal multibyte sequence";
295 return false;
297 /* smb_panic(reason); */
299 if (converted_size != NULL)
300 *converted_size = destlen-o_len;
301 return true;
305 * Convert between character sets, allocating a new buffer using talloc for the result.
307 * @param srclen length of source buffer.
308 * @param dest always set at least to NULL
309 * @note -1 is not accepted for srclen.
311 * @returns Size in bytes of the converted string; or -1 in case of error.
314 _PUBLIC_ bool convert_string_talloc_convenience(TALLOC_CTX *ctx,
315 struct smb_iconv_convenience *ic,
316 charset_t from, charset_t to,
317 void const *src, size_t srclen,
318 void *dst, size_t *converted_size,
319 bool allow_badcharcnv)
321 void **dest = (void **)dst;
322 smb_iconv_t descriptor;
323 ssize_t ret;
325 if (allow_badcharcnv)
326 return false; /* Not implemented yet */
328 *dest = NULL;
330 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
331 return false;
333 descriptor = get_conv_handle(ic, from, to);
335 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
336 /* conversion not supported, return -1*/
337 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
338 charset_name(ic, from),
339 charset_name(ic, to)));
340 return false;
343 ret = iconv_talloc(ctx, descriptor, src, srclen, dest);
344 if (ret == -1)
345 return false;
346 if (converted_size != NULL)
347 *converted_size = ret;
348 return true;
352 return the unicode codepoint for the next multi-byte CH_UNIX character
353 in the string
355 also return the number of bytes consumed (which tells the caller
356 how many bytes to skip to get to the next CH_UNIX character)
358 return INVALID_CODEPOINT if the next character cannot be converted
360 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
361 const char *str, size_t *size)
363 /* it cannot occupy more than 4 bytes in UTF16 format */
364 uint8_t buf[4];
365 smb_iconv_t descriptor;
366 size_t ilen_orig;
367 size_t ilen;
368 size_t olen;
369 char *outbuf;
371 if ((str[0] & 0x80) == 0) {
372 *size = 1;
373 return (codepoint_t)str[0];
376 /* we assume that no multi-byte character can take
377 more than 5 bytes. This is OK as we only
378 support codepoints up to 1M */
379 ilen_orig = strnlen(str, 5);
380 ilen = ilen_orig;
382 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
383 if (descriptor == (smb_iconv_t)-1) {
384 *size = 1;
385 return INVALID_CODEPOINT;
388 /* this looks a little strange, but it is needed to cope
389 with codepoints above 64k */
390 olen = 2;
391 outbuf = (char *)buf;
392 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
393 if (olen == 2) {
394 olen = 4;
395 outbuf = (char *)buf;
396 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
397 if (olen == 4) {
398 /* we didn't convert any bytes */
399 *size = 1;
400 return INVALID_CODEPOINT;
402 olen = 4 - olen;
403 } else {
404 olen = 2 - olen;
407 *size = ilen_orig - ilen;
409 if (olen == 2) {
410 return (codepoint_t)SVAL(buf, 0);
412 if (olen == 4) {
413 /* decode a 4 byte UTF16 character manually */
414 return (codepoint_t)0x10000 +
415 (buf[2] | ((buf[3] & 0x3)<<8) |
416 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
419 /* no other length is valid */
420 return INVALID_CODEPOINT;
424 push a single codepoint into a CH_UNIX string the target string must
425 be able to hold the full character, which is guaranteed if it is at
426 least 5 bytes in size. The caller may pass less than 5 bytes if they
427 are sure the character will fit (for example, you can assume that
428 uppercase/lowercase of a character will not add more than 1 byte)
430 return the number of bytes occupied by the CH_UNIX character, or
431 -1 on failure
433 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
434 char *str, codepoint_t c)
436 smb_iconv_t descriptor;
437 uint8_t buf[4];
438 size_t ilen, olen;
439 const char *inbuf;
441 if (c < 128) {
442 *str = c;
443 return 1;
446 descriptor = get_conv_handle(ic,
447 CH_UTF16, CH_UNIX);
448 if (descriptor == (smb_iconv_t)-1) {
449 return -1;
452 if (c < 0x10000) {
453 ilen = 2;
454 olen = 5;
455 inbuf = (char *)buf;
456 SSVAL(buf, 0, c);
457 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
458 if (ilen != 0) {
459 return -1;
461 return 5 - olen;
464 c -= 0x10000;
466 buf[0] = (c>>10) & 0xFF;
467 buf[1] = (c>>18) | 0xd8;
468 buf[2] = c & 0xFF;
469 buf[3] = ((c>>8) & 0x3) | 0xdc;
471 ilen = 4;
472 olen = 5;
473 inbuf = (char *)buf;
475 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
476 if (ilen != 0) {
477 return -1;
479 return 5 - olen;