2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #include "charset_proto.h"
/**
 * @file
 *
 * @brief Samba wrapper/stub for iconv character set conversion.
 *
 * iconv is the XPG2 interface for converting between character
 * encodings.  This file provides a Samba wrapper around it, and also
 * a simple reimplementation that is used if the system does not
 * implement iconv.
 *
 * Samba only works with encodings that are supersets of ASCII: ascii
 * characters like whitespace can be tested for directly, multibyte
 * sequences start with a byte with the high bit set, and strings are
 * terminated by a nul byte.
 *
 * Note that the only function provided by iconv is conversion between
 * characters.  It doesn't directly support operations like
 * uppercasing or comparison.  We have to convert to UTF-16LE and
 * compare there.
 *
 * @sa Samba Developers Guide
 **/
/*
 * Forward declarations of the built-in converters.
 *
 * All of them share one iconv()-like calling convention: bytes are read
 * from *inbuf (*inbytesleft of them are available) and written to *outbuf
 * (which has room for *outbytesleft bytes); the four in/out arguments are
 * advanced past whatever was converted.  The "pull" direction produces
 * UTF-16LE, the "push" direction consumes it.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
66 static const struct charset_functions builtin_functions
[] = {
67 /* windows is closest to UTF-16 */
68 {"UCS-2LE", iconv_copy
, iconv_copy
},
69 {"UTF-16LE", iconv_copy
, iconv_copy
},
70 {"UCS-2BE", iconv_swab
, iconv_swab
},
71 {"UTF-16BE", iconv_swab
, iconv_swab
},
73 /* we include the UTF-8 alias to cope with differing locale settings */
74 {"UTF8", utf8_pull
, utf8_push
},
75 {"UTF-8", utf8_pull
, utf8_push
},
77 /* this handles the munging needed for String2Key */
78 {"UTF16_MUNGED", utf16_munged_pull
, iconv_copy
, true},
80 {"ASCII", ascii_pull
, ascii_push
},
81 {"646", ascii_pull
, ascii_push
},
82 {"ISO-8859-1", latin1_pull
, latin1_push
},
84 {"WEIRD", weird_pull
, weird_push
, true},
87 {"MACOSXFS", macosxfs_encoding_pull
, macosxfs_encoding_push
, true},
89 {"UCS2-HEX", ucs2hex_pull
, ucs2hex_push
, true}
#ifdef HAVE_NATIVE_ICONV
/*
 * Adapter that gives the system iconv() the same signature as the
 * built-in converters; cd is the iconv_t descriptor.
 *
 * If there was an error then reset the internal state; this ensures
 * that we don't have a shift state remaining for character sets like
 * SJIS.  Returns whatever iconv() returned.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		/* the reset call must not hide why the conversion failed */
		int saved_errno = errno;
		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
110 * This is a simple portable iconv() implementaion.
112 * It only knows about a very small number of character sets - just
113 * enough that Samba works on systems that don't have iconv.
115 _PUBLIC_
size_t smb_iconv(smb_iconv_t cd
,
116 const char **inbuf
, size_t *inbytesleft
,
117 char **outbuf
, size_t *outbytesleft
)
119 /* in many cases we can go direct */
121 return cd
->direct(cd
->cd_direct
,
122 inbuf
, inbytesleft
, outbuf
, outbytesleft
);
125 /* otherwise we have to do it chunks at a time */
127 #ifndef SMB_ICONV_BUFSIZE
128 #define SMB_ICONV_BUFSIZE 2048
134 #if _SAMBA_BUILD_ == 3
135 mem_ctx
= talloc_tos();
139 cvtbuf
= talloc_array(mem_ctx
, char, SMB_ICONV_BUFSIZE
);
145 while (*inbytesleft
> 0) {
146 char *bufp1
= cvtbuf
;
147 const char *bufp2
= cvtbuf
;
148 int saved_errno
= errno
;
149 bool pull_failed
= false;
150 bufsize
= SMB_ICONV_BUFSIZE
;
152 if (cd
->pull(cd
->cd_pull
,
153 inbuf
, inbytesleft
, &bufp1
, &bufsize
) == -1
159 bufsize
= SMB_ICONV_BUFSIZE
- bufsize
;
161 if (cd
->push(cd
->cd_push
,
163 outbuf
, outbytesleft
) == -1) {
166 } else if (pull_failed
) {
167 /* We want the pull errno if possible */
/*
 * Return true if name is "UCS-2LE" or "UTF-16LE" (compared without regard
 * to case), i.e. one of the names for which no intermediate conversion
 * step is needed.
 */
static bool is_utf16(const char *name)
{
	return strcasecmp(name, "UCS-2LE") == 0 ||
		strcasecmp(name, "UTF-16LE") == 0;
}
184 static int smb_iconv_t_destructor(smb_iconv_t hwd
)
186 #ifdef HAVE_NATIVE_ICONV
187 if (hwd
->cd_pull
!= NULL
&& hwd
->cd_pull
!= (iconv_t
)-1)
188 iconv_close(hwd
->cd_pull
);
189 if (hwd
->cd_push
!= NULL
&& hwd
->cd_push
!= (iconv_t
)-1)
190 iconv_close(hwd
->cd_push
);
191 if (hwd
->cd_direct
!= NULL
&& hwd
->cd_direct
!= (iconv_t
)-1)
192 iconv_close(hwd
->cd_direct
);
198 _PUBLIC_ smb_iconv_t
smb_iconv_open_ex(TALLOC_CTX
*mem_ctx
, const char *tocode
,
199 const char *fromcode
, bool use_builtin_handlers
)
202 const struct charset_functions
*from
=NULL
, *to
=NULL
;
205 ret
= (smb_iconv_t
)talloc_named(mem_ctx
,
207 "iconv(%s,%s)", tocode
, fromcode
);
210 return (smb_iconv_t
)-1;
212 memset(ret
, 0, sizeof(*ret
));
213 talloc_set_destructor(ret
, smb_iconv_t_destructor
);
215 /* check for the simplest null conversion */
216 if (strcmp(fromcode
, tocode
) == 0) {
217 ret
->direct
= iconv_copy
;
221 /* check if we have a builtin function for this conversion */
222 for (i
=0;i
<ARRAY_SIZE(builtin_functions
);i
++) {
223 if (strcasecmp(fromcode
, builtin_functions
[i
].name
) == 0) {
224 if (use_builtin_handlers
|| builtin_functions
[i
].samba_internal_charset
) {
225 from
= &builtin_functions
[i
];
228 if (strcasecmp(tocode
, builtin_functions
[i
].name
) == 0) {
229 if (use_builtin_handlers
|| builtin_functions
[i
].samba_internal_charset
) {
230 to
= &builtin_functions
[i
];
235 #ifdef HAVE_NATIVE_ICONV
236 /* the from and to varaibles indicate a samba module or
237 * internal conversion, ret->pull and ret->push are
238 * initialised only in this block for iconv based
242 ret
->cd_pull
= iconv_open("UTF-16LE", fromcode
);
243 if (ret
->cd_pull
== (iconv_t
)-1)
244 ret
->cd_pull
= iconv_open("UCS-2LE", fromcode
);
245 if (ret
->cd_pull
!= (iconv_t
)-1) {
246 ret
->pull
= sys_iconv
;
251 ret
->cd_push
= iconv_open(tocode
, "UTF-16LE");
252 if (ret
->cd_push
== (iconv_t
)-1)
253 ret
->cd_push
= iconv_open(tocode
, "UCS-2LE");
254 if (ret
->cd_push
!= (iconv_t
)-1) {
255 ret
->push
= sys_iconv
;
260 if (ret
->pull
== NULL
&& from
== NULL
) {
264 if (ret
->push
== NULL
&& to
== NULL
) {
268 /* check for conversion to/from ucs2 */
269 if (is_utf16(fromcode
) && to
) {
270 ret
->direct
= to
->push
;
273 if (is_utf16(tocode
) && from
) {
274 ret
->direct
= from
->pull
;
278 #ifdef HAVE_NATIVE_ICONV
279 if (is_utf16(fromcode
)) {
280 ret
->direct
= sys_iconv
;
281 ret
->cd_direct
= ret
->cd_push
;
285 if (is_utf16(tocode
)) {
286 ret
->direct
= sys_iconv
;
287 ret
->cd_direct
= ret
->cd_pull
;
293 /* the general case has to go via a buffer */
294 if (!ret
->pull
) ret
->pull
= from
->pull
;
295 if (!ret
->push
) ret
->push
= to
->push
;
301 return (smb_iconv_t
)-1;
305 simple iconv_open() wrapper
307 _PUBLIC_ smb_iconv_t
smb_iconv_open(const char *tocode
, const char *fromcode
)
309 return smb_iconv_open_ex(NULL
, tocode
, fromcode
, true);
313 simple iconv_close() wrapper
315 _PUBLIC_
int smb_iconv_close(smb_iconv_t cd
)
322 /**********************************************************************
323 the following functions implement the builtin character sets in Samba
324 and also the "test" character sets that are designed to test
325 multi-byte character set support for english users
326 ***********************************************************************/
/**
  this takes an ASCII sequence and produces a UTF16 sequence

  The 128 codepoints of ASCII match the first 128 codepoints of
  unicode, and so can be put into the first byte of UTF16LE

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to EILSEQ if a byte with the high bit set is met, or to E2BIG if the
  output buffer filled up first.  The in/out arguments are left pointing
  just past the last character that was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;
		(*inbytesleft)  -= 1;
		(*outbytesleft) -= 2;
		(*inbuf)  += 1;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
  this takes a UTF16 sequence and produces an ASCII sequence

  The 128 codepoints of ASCII match the first 128 codepoints of
  unicode, and so can be read directly from the first byte of UTF16LE

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to EILSEQ for a code unit outside ASCII, EINVAL if the input ends
  with half a code unit, or E2BIG if the output buffer filled up first.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
		    (*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
  this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence

  The first 256 codepoints of latin1 matches the first 256 codepoints
  of unicode, and so can be put into the first byte of UTF16LE

  Returns 0 when all input was converted, or (size_t)-1 with errno set to
  E2BIG if the output buffer filled up first.
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;
		(*inbytesleft)  -= 1;
		(*outbytesleft) -= 2;
		(*inbuf)  += 1;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of latin1 matches the first 256 codepoints
  of unicode, and so can be read directly from the first byte of UTF16LE

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to EILSEQ for a code unit above 0xFF, EINVAL if the input ends with
  half a code unit, or E2BIG if the output buffer filled up first.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if ((*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
  this takes a UCS2-HEX sequence and produces a UTF16 sequence

  In this encoding a byte other than '@' stands for itself, and '@'
  followed by exactly four hex digits stands for the UTF-16 code unit with
  that value.

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to EINVAL if an '@' escape is cut short by the end of the input,
  EILSEQ if the four characters after '@' are not all hex digits, or
  E2BIG if the output buffer filled up first.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}
		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		/*
		 * Parse by hand: the input need not be nul terminated, so
		 * only the four bytes known to be present may be read.
		 */
		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = (char)(v & 0xff);
		(*outbuf)[1] = (char)(v >> 8);
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces a UCS2-HEX sequence

  A code unit below 0x80 other than '@' is written as that single byte;
  every other code unit is written as '@' followed by its value as four
  lower-case hex digits.

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to EINVAL if the input ends with half a code unit, or E2BIG if the
  output buffer has no room for the next character.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		char buf[6];
		unsigned int v;

		if ((*inbuf)[1] == 0 &&
		    ((*inbuf)[0] & 0x80) == 0 &&
		    (*inbuf)[0] != '@') {
			(*outbuf)[0] = (*inbuf)[0];
			(*inbytesleft)  -= 2;
			(*outbytesleft) -= 1;
			(*inbuf)  += 2;
			(*outbuf) += 1;
			continue;
		}
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return (size_t)-1;
		}
		/* little-endian 16 bit read of the code unit */
		v = (unsigned int)(uint8_t)(*inbuf)[0] |
		    ((unsigned int)(uint8_t)(*inbuf)[1] << 8);
		snprintf(buf, sizeof(buf), "@%04x", v);
		/* five characters, without the terminating nul */
		memcpy(*outbuf, buf, 5);
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 5;
		(*inbuf)  += 2;
		(*outbuf) += 5;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  this converts between the two byte orders of a 16 bit encoding by
  swapping the bytes of every pair

  As many bytes as fit (the smaller of the two counts) are processed.  If
  that number is odd, the final output byte is set to 0 and the odd input
  byte is consumed.

  Returns 0 when all input was consumed, or (size_t)-1 with errno set to
  E2BIG if the output buffer filled up first.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t i;

	for (i = 0; i + 1 < n; i += 2) {
		/* go via a temporary so that converting in place works */
		char first = src[i];
		dst[i] = src[i + 1];
		dst[i + 1] = first;
	}
	if (n & 1) {
		dst[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  the null conversion: copies as many bytes as fit (the smaller of the two
  counts) from the input to the output

  Returns 0 when all input was copied, or (size_t)-1 with errno set to
  E2BIG if the output buffer filled up first.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	/* memmove, as nothing stops a caller from passing overlapping buffers */
	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  Sequences of one to four bytes are handled; codepoints of 0x10000 and
  above are written as a surrogate pair.  Sequences that are not minimally
  packed are accepted.

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to
    EILSEQ  for a truncated sequence, a bad continuation byte, or a
            codepoint above 0x10FFFF (which UTF-16 cannot represent),
    EINVAL  for a lead byte that does not start a 1-4 byte sequence,
    E2BIG   if the output buffer filled up first.
  In every case the in/out arguments are left pointing just past the last
  character that was converted.
*/
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = 0;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (uint8_t)((c[0]<<6) | (c[1]&0x3f));
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (uint8_t)(((c[0]&0xF)<<4) | ((c[1]>>2)&0xF));
			uc[0] = (uint8_t)((c[1]<<6) | (c[2]&0x3f));
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint > 0x10FFFF) {
				/* no surrogate pair exists for this; the
				   arithmetic below would overflow into the
				   low surrogate range */
				errno = EILSEQ;
				goto error;
			}
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (uint8_t)(codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate first, then low, each little-endian */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (uint8_t)((codepoint>>18) | 0xd8);
			uc[2] = codepoint & 0xFF;
			uc[3] = (uint8_t)(((codepoint>>8) & 0x3) | 0xdc);
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	goto done;

error:
	ret = (size_t)-1;
done:
	/* report progress to the caller on success and failure alike */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Code units below 0x80 become one byte, below 0x800 two bytes, other
  non-surrogate code units three bytes, and a high/low surrogate pair
  becomes a four byte sequence.

  Returns 0 when all input was converted.  Returns (size_t)-1 with errno
  set to
    EILSEQ  for a low surrogate with no high surrogate before it, or a
            high surrogate that is not followed by a low surrogate,
    EINVAL  if the input ends in the middle of a code unit or of a
            surrogate pair,
    E2BIG   if the output buffer has no room for the next character.
  In every case the in/out arguments are left pointing just past the last
  character that was converted.
*/
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	uint8_t *c = (uint8_t *)*outbuf;
	const uint8_t *uc = (const uint8_t *)*inbuf;
	size_t ret = 0;

	while (in_left >= 2 && out_left >= 1) {
		unsigned int codepoint;

		if (uc[1] == 0 && !(uc[0] & 0x80)) {
			/* simplest case */
			c[0] = uc[0];
			in_left  -= 2;
			out_left -= 1;
			uc += 2;
			c  += 1;
			continue;
		}

		if ((uc[1]&0xf8) == 0) {
			/* next simplest case */
			if (out_left < 2) {
				errno = E2BIG;
				goto error;
			}
			c[0] = (uint8_t)(0xc0 | (uc[0]>>6) | (uc[1]<<2));
			c[1] = 0x80 | (uc[0] & 0x3f);
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			c  += 2;
			continue;
		}

		if ((uc[1] & 0xfc) == 0xdc) {
			/* it's the second part of a 4 byte sequence. Illegal */
			if (in_left < 4) {
				errno = EINVAL;
			} else {
				errno = EILSEQ;
			}
			goto error;
		}

		if ((uc[1] & 0xfc) != 0xd8) {
			codepoint = uc[0] | (uc[1]<<8);
			if (out_left < 3) {
				errno = E2BIG;
				goto error;
			}
			c[0] = (uint8_t)(0xe0 | (codepoint >> 12));
			c[1] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3f));
			c[2] = (uint8_t)(0x80 | (codepoint & 0x3f));

			in_left  -= 2;
			out_left -= 3;
			uc  += 2;
			c   += 3;
			continue;
		}

		/* it's the first part of a 4 byte sequence */
		if (in_left < 4) {
			errno = EINVAL;
			goto error;
		}
		if ((uc[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			goto error;
		}
		/* 10 payload bits from each surrogate, high one first */
		codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
				       (uc[0]<<10) | ((uc[1] & 0x3)<<18));

		if (out_left < 4) {
			errno = E2BIG;
			goto error;
		}
		c[0] = (uint8_t)(0xf0 | (codepoint >> 18));
		c[1] = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3f));
		c[2] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3f));
		c[3] = (uint8_t)(0x80 | (codepoint & 0x3f));

		in_left  -= 4;
		out_left -= 4;
		uc       += 4;
		c        += 4;
	}

	if (in_left == 1) {
		errno = EINVAL;
		goto error;
	}

	if (in_left > 1) {
		errno = E2BIG;
		goto error;
	}

	goto done;

error:
	ret = (size_t)-1;
done:
	/* report progress to the caller on success and failure alike */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf  = (const char *)uc;
	*outbuf = (char *)c;
	return ret;
}
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

  The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.
 */
851 static size_t utf16_munged_pull(void *cd
, const char **inbuf
, size_t *inbytesleft
,
852 char **outbuf
, size_t *outbytesleft
)
854 size_t in_left
=*inbytesleft
, out_left
=*outbytesleft
;
855 uint8_t *c
= (uint8_t *)*outbuf
;
856 const uint8_t *uc
= (const uint8_t *)*inbuf
;
858 while (in_left
>= 2 && out_left
>= 2) {
859 unsigned int codepoint
= uc
[0] | (uc
[1]<<8);
861 if (codepoint
== 0) {
865 if ((codepoint
& 0xfc00) == 0xd800) {
866 /* a high surrogate */
867 unsigned int codepoint2
;
872 codepoint2
= uc
[2] | (uc
[3]<<8);
873 if ((codepoint2
& 0xfc00) != 0xdc00) {
874 /* high surrogate not followed by low
875 surrogate: convert to 0xfffd */
891 if ((codepoint
& 0xfc00) == 0xdc00) {
892 /* low surrogate not preceded by high
893 surrogate: convert to 0xfffd */
898 c
[0] = codepoint
& 0xFF;
899 c
[1] = (codepoint
>>8) & 0xFF;
918 *inbytesleft
= in_left
;
919 *outbytesleft
= out_left
;
920 *inbuf
= (const char *)uc
;
926 *inbytesleft
= in_left
;
927 *outbytesleft
= out_left
;
928 *inbuf
= (const char *)uc
;