2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001-2011
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Simo Sorce 2001
8 Copyright (C) Martin Pool 2003
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
25 #include "system/iconv.h"
30 * @brief Character-set conversion routines built on our iconv.
32 * @note Samba's internal character set (at least in the 3.0 series)
33 * is always the same as the one for the Unix filesystem. It is
34 * <b>not</b> necessarily UTF-8 and may be different on machines that
35 * need i18n filenames to be compatible with Unix software. It does
36 * have to be a superset of ASCII. All multibyte sequences must start
37 * with a byte with the high bit set.
44 * Convert a string from one encoding to another, performing error checking etc.
45 * Slow path version - uses (slow) iconv.
47 * @param src pointer to source string (multibyte or singlebyte)
48 * @param srclen length of the source string in bytes
49 * @param dest pointer to destination string (multibyte or singlebyte)
50 * @param destlen maximal length allowed for string
51 * @param converted_size the number of bytes occupied in the destination
53 * @returns false and sets errno on fail, true on success.
55 * Ensure that srclen includes the terminating zero.
59 static bool convert_string_internal(struct smb_iconv_handle
*ic
,
60 charset_t from
, charset_t to
,
61 void const *src
, size_t srclen
,
62 void *dest
, size_t destlen
, size_t *converted_size
)
66 const char* inbuf
= (const char*)src
;
67 char* outbuf
= (char*)dest
;
68 smb_iconv_t descriptor
;
70 descriptor
= get_conv_handle(ic
, from
, to
);
72 if (srclen
== (size_t)-1) {
73 if (from
== CH_UTF16LE
|| from
== CH_UTF16BE
) {
74 srclen
= (strlen_w((const smb_ucs2_t
*)src
)+1) * 2;
76 srclen
= strlen((const char *)src
)+1;
81 if (descriptor
== (smb_iconv_t
)-1 || descriptor
== (smb_iconv_t
)0) {
89 retval
= smb_iconv(descriptor
, &inbuf
, &i_len
, &outbuf
, &o_len
);
90 *converted_size
= destlen
-o_len
;
92 return (retval
!= (size_t)-1);
96 * Convert a string from one encoding to another, performing error checking etc.
97 * Fast path version - handles ASCII first.
99 * @param src pointer to source string (multibyte or singlebyte)
100 * @param srclen length of the source string in bytes, or -1 if the source is NUL-terminated.
101 * @param dest pointer to destination string (multibyte or singlebyte)
102 * @param destlen maximal length allowed for string - *NEVER* -1.
103 * @param converted_size the number of bytes occupied in the destination
105 * @returns false and sets errno on fail, true on success.
107 * Ensure that srclen includes the terminating zero.
109 * This function has been hand-tuned to provide a fast path.
110 * Don't change unless you really know what you are doing. JRA.
113 bool convert_string_error_handle(struct smb_iconv_handle
*ic
,
114 charset_t from
, charset_t to
,
115 void const *src
, size_t srclen
,
116 void *dest
, size_t destlen
,
117 size_t *converted_size
)
120 * NB. We deliberately don't do a strlen here if srclen == -1.
121 * This is very expensive over millions of calls and is taken
122 * care of in the slow path in convert_string_internal. JRA.
126 SMB_ASSERT(destlen
!= (size_t)-1);
134 if (from
!= CH_UTF16LE
&& from
!= CH_UTF16BE
&& to
!= CH_UTF16LE
&& to
!= CH_UTF16BE
) {
135 const unsigned char *p
= (const unsigned char *)src
;
136 unsigned char *q
= (unsigned char *)dest
;
137 size_t slen
= srclen
;
138 size_t dlen
= destlen
;
139 unsigned char lastp
= '\0';
142 /* If all characters are ascii, fast path here. */
143 while (slen
&& dlen
) {
144 if ((lastp
= *p
) <= 0x7f) {
146 if (slen
!= (size_t)-1) {
154 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
157 bool ret
= convert_string_internal(ic
, from
, to
, p
, slen
, q
, dlen
, converted_size
);
158 *converted_size
+= retval
;
164 *converted_size
= retval
;
167 /* Even if we fast path we should note if we ran out of room. */
168 if (((slen
!= (size_t)-1) && slen
) ||
169 ((slen
== (size_t)-1) && lastp
)) {
175 } else if (from
== CH_UTF16LE
&& to
!= CH_UTF16LE
) {
176 const unsigned char *p
= (const unsigned char *)src
;
177 unsigned char *q
= (unsigned char *)dest
;
179 size_t slen
= srclen
;
180 size_t dlen
= destlen
;
181 unsigned char lastp
= '\0';
182 #ifndef BROKEN_UNICODE_COMPOSE_CHARACTERS
186 if (slen
== (size_t)-1) {
188 ((lastp
= *p
) <= 0x7f) && (p
[1] == 0)) {
196 if (lastp
!= 0) goto slow_path
;
198 while (slen
>= 2 && dlen
&&
199 (*p
<= 0x7f) && (p
[1] == 0)) {
206 if (slen
!= 0) goto slow_path
;
209 *converted_size
= retval
;
212 /* Even if we fast path we should note if we ran out of room. */
213 if (((slen
!= (size_t)-1) && slen
) ||
214 ((slen
== (size_t)-1) && lastp
)) {
222 /* come here when we hit a character we can't deal
223 * with in the fast path
225 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
228 ret
= convert_string_internal(ic
, from
, to
, p
, slen
, q
, dlen
, converted_size
);
229 *converted_size
+= retval
;
233 } else if (from
!= CH_UTF16LE
&& from
!= CH_UTF16BE
&& to
== CH_UTF16LE
) {
234 const unsigned char *p
= (const unsigned char *)src
;
235 unsigned char *q
= (unsigned char *)dest
;
237 size_t slen
= srclen
;
238 size_t dlen
= destlen
;
239 unsigned char lastp
= '\0';
241 /* If all characters are ascii, fast path here. */
242 while (slen
&& (dlen
>= 1)) {
243 if (dlen
>=2 && (lastp
= *p
) <= 0x7F) {
246 if (slen
!= (size_t)-1) {
254 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
257 bool ret
= convert_string_internal(ic
, from
, to
, p
, slen
, q
, dlen
, converted_size
);
258 *converted_size
+= retval
;
264 *converted_size
= retval
;
267 /* Even if we fast path we should note if we ran out of room. */
268 if (((slen
!= (size_t)-1) && slen
) ||
269 ((slen
== (size_t)-1) && lastp
)) {
277 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
280 return convert_string_internal(ic
, from
, to
, src
, srclen
, dest
, destlen
, converted_size
);
283 bool convert_string_handle(struct smb_iconv_handle
*ic
,
284 charset_t from
, charset_t to
,
285 void const *src
, size_t srclen
,
286 void *dest
, size_t destlen
,
287 size_t *converted_size
)
289 bool ret
= convert_string_error_handle(ic
, from
, to
, src
, srclen
, dest
, destlen
, converted_size
);
292 const char *reason
="unknown error";
295 reason
="Incomplete multibyte sequence";
296 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
297 reason
, (const char *)src
));
301 reason
="No more room";
302 if (from
== CH_UNIX
) {
303 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n",
304 charset_name(ic
, from
), charset_name(ic
, to
),
305 (unsigned int)srclen
, (unsigned int)destlen
, (const char *)src
));
307 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n",
308 charset_name(ic
, from
), charset_name(ic
, to
),
309 (unsigned int)srclen
, (unsigned int)destlen
));
314 reason
="Illegal multibyte sequence";
315 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
316 reason
, (const char *)src
));
319 DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",
320 reason
, (const char *)src
));
323 /* smb_panic(reason); */
330 * Convert between character sets, allocating a new buffer using talloc for the result.
332 * @param srclen length of source buffer.
333 * @param dest always set at least to NULL
334 * @param converted_size set to the number of bytes occupied by the string in
335 * the destination on success.
336 * @note -1 is not accepted for srclen.
338 * @return true if new buffer was correctly allocated, and string was converted.
341 * Ensure that srclen includes the terminating zero.
343 * I hate the gotos in this function. It's embarrassing.....
344 * There has to be a cleaner way to do this. JRA.
346 bool convert_string_talloc_handle(TALLOC_CTX
*ctx
, struct smb_iconv_handle
*ic
,
347 charset_t from
, charset_t to
,
348 void const *src
, size_t srclen
, void *dst
,
349 size_t *converted_size
)
352 size_t i_len
, o_len
, destlen
= (srclen
* 3) / 2;
354 const char *inbuf
= (const char *)src
;
355 char *outbuf
= NULL
, *ob
= NULL
;
356 smb_iconv_t descriptor
;
357 void **dest
= (void **)dst
;
361 if (src
== NULL
|| srclen
== (size_t)-1) {
367 /* We really should treat this as an error, but
368 there are too many callers that need this to
369 return a NULL terminated string in the correct
371 if (to
== CH_UTF16LE
|| to
== CH_UTF16BE
|| to
== CH_UTF16MUNGED
) {
376 ob
= talloc_zero_array(ctx
, char, destlen
);
381 if (converted_size
!= NULL
) {
382 *converted_size
= destlen
;
388 descriptor
= get_conv_handle(ic
, from
, to
);
390 if (descriptor
== (smb_iconv_t
)-1 || descriptor
== (smb_iconv_t
)0) {
391 DEBUG(0,("convert_string_talloc: Conversion not supported.\n"));
398 /* +2 is for ucs2 null termination. */
399 if ((destlen
*2)+2 < destlen
) {
400 /* wrapped ! abort. */
401 DEBUG(0, ("convert_string_talloc: destlen wrapped !\n"));
406 destlen
= destlen
* 2;
409 /* +2 is for ucs2 null termination. */
410 ob
= talloc_realloc(ctx
, ob
, char, destlen
+ 2);
413 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
421 retval
= smb_iconv(descriptor
,
424 if(retval
== (size_t)-1) {
425 const char *reason
="unknown error";
428 reason
="Incomplete multibyte sequence";
429 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason
,inbuf
));
434 reason
="Illegal multibyte sequence";
435 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason
,inbuf
));
438 DEBUG(0,("Conversion error: %s(%s)\n",reason
,inbuf
));
439 /* smb_panic(reason); */
444 destlen
= destlen
- o_len
;
445 /* Don't shrink unless we're reclaiming a lot of
446 * space. This is in the hot codepath and these
447 * reallocs *cost*. JRA.
450 /* We're shrinking here so we know the +2 is safe from wrap. */
451 ob
= talloc_realloc(ctx
,ob
, char, destlen
+ 2);
454 if (destlen
&& !ob
) {
455 DEBUG(0, ("convert_string_talloc: out of memory!\n"));
462 /* Must ucs2 null terminate in the extra space we allocated. */
464 ob
[destlen
+1] = '\0';
466 /* Ensure we can never return a *converted_size of zero. */
468 /* As we're now returning false on a bad smb_iconv call,
469 this should never happen. But be safe anyway. */
470 if (to
== CH_UTF16LE
|| to
== CH_UTF16BE
|| to
== CH_UTF16MUNGED
) {
477 if (converted_size
!= NULL
) {
478 *converted_size
= destlen
;
484 * Convert a string from one encoding to another, performing error checking etc.
486 * @param src pointer to source string (multibyte or singlebyte)
487 * @param srclen length of the source string in bytes
488 * @param dest pointer to destination string (multibyte or singlebyte)
489 * @param destlen maximal length allowed for string
490 * @param converted_size the number of bytes occupied in the destination
492 * @returns true on success, false on fail.
494 _PUBLIC_
bool convert_string(charset_t from
, charset_t to
,
495 void const *src
, size_t srclen
,
496 void *dest
, size_t destlen
,
497 size_t *converted_size
)
499 return convert_string_handle(get_iconv_handle(), from
, to
,
501 dest
, destlen
, converted_size
);
505 * Convert a string from one encoding to another, performing error checking etc.
507 * @param src pointer to source string (multibyte or singlebyte)
508 * @param srclen length of the source string in bytes
509 * @param dest pointer to destination string (multibyte or singlebyte)
510 * @param destlen maximal length allowed for string
511 * @param converted_size the number of bytes occupied in the destination
513 * @returns true on success, false on fail.
515 _PUBLIC_
bool convert_string_error(charset_t from
, charset_t to
,
516 void const *src
, size_t srclen
,
517 void *dest
, size_t destlen
,
518 size_t *converted_size
)
520 return convert_string_error_handle(get_iconv_handle(), from
, to
,
522 dest
, destlen
, converted_size
);
526 * Convert between character sets, allocating a new buffer using talloc for the result.
528 * @param srclen length of source buffer.
529 * @param dest always set at least to NULL
530 * @param converted_size Size in bytes of the converted string
531 * @note -1 is not accepted for srclen.
533 * @returns boolean indication whether the conversion succeeded
536 _PUBLIC_
bool convert_string_talloc(TALLOC_CTX
*ctx
,
537 charset_t from
, charset_t to
,
538 void const *src
, size_t srclen
,
539 void *dest
, size_t *converted_size
)
541 return convert_string_talloc_handle(ctx
, get_iconv_handle(),
542 from
, to
, src
, srclen
, dest
,