2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #include "charset_proto.h"
34 * @brief Samba wrapper/stub for iconv character set conversion.
36 * iconv is the XPG2 interface for converting between character
37 * encodings. This file provides a Samba wrapper around it, and also
38 * a simple reimplementation that is used if the system does not
41 * Samba only works with encodings that are supersets of ASCII: ascii
42 * characters like whitespace can be tested for directly, multibyte
43 * sequences start with a byte with the high bit set, and strings are
44 * terminated by a nul byte.
46 * Note that the only function provided by iconv is conversion between
47 * characters. It doesn't directly support operations like
48 * uppercasing or comparison. We have to convert to UTF-16LE and
51 * @sa Samba Developers Guide
/* Forward declarations for the converter functions defined in this file.
 * Every one has the same signature: an opaque descriptor, an input cursor
 * with its remaining byte count, and an output cursor with its remaining
 * byte count. */
54 static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *);
55 static size_t ascii_push (void *,const char **, size_t *, char **, size_t *);
56 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
57 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
58 static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *);
59 static size_t utf8_push (void *,const char **, size_t *, char **, size_t *);
60 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
61 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
62 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
63 static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *);
64 static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *);
/* Table of the charsets this file can convert without the system iconv.
 * smb_iconv_open_ex() searches it by name (case-insensitively) and uses
 * the matching entry's ->pull / ->push members.
 *
 * Each initializer is a charset name followed by two converter functions;
 * going by the function names these are pull then push -- TODO confirm the
 * field order against struct charset_functions, which is declared elsewhere.
 * A trailing `true` presumably sets samba_internal_charset, the flag
 * smb_iconv_open_ex() checks when use_builtin_handlers is false -- TODO
 * confirm.
 *
 * NOTE(review): weird_pull/weird_push and macosxfs_encoding_pull/_push are
 * not among the prototypes visible in this file, and the closing `};` of
 * the initializer is not visible in this listing. */
66 static const struct charset_functions builtin_functions
[] = {
67 /* windows is closest to UTF-16 */
68 {"UCS-2LE", iconv_copy
, iconv_copy
},
69 {"UTF-16LE", iconv_copy
, iconv_copy
},
70 {"UCS-2BE", iconv_swab
, iconv_swab
},
71 {"UTF-16BE", iconv_swab
, iconv_swab
},
73 /* we include the UTF-8 alias to cope with differing locale settings */
74 {"UTF8", utf8_pull
, utf8_push
},
75 {"UTF-8", utf8_pull
, utf8_push
},
77 /* this handles the munging needed for String2Key */
78 {"UTF16_MUNGED", utf16_munged_pull
, iconv_copy
, true},
80 {"ASCII", ascii_pull
, ascii_push
},
81 {"646", ascii_pull
, ascii_push
},
82 {"ISO-8859-1", latin1_pull
, latin1_push
},
84 {"WEIRD", weird_pull
, weird_push
, true},
87 {"MACOSXFS", macosxfs_encoding_pull
, macosxfs_encoding_push
, true},
89 {"UCS2-HEX", ucs2hex_pull
, ucs2hex_push
, true}
93 #ifdef HAVE_NATIVE_ICONV
94 /* if there was an error then reset the internal state,
95 this ensures that we don't have a shift state remaining for
96 character sets like SJIS */
/* Adapter that gives the system iconv() the converter signature used by
 * the rest of this file.  The opaque cd is cast to iconv_t and the call is
 * forwarded; discard_const_p presumably strips the const from inbuf to
 * match iconv()'s prototype (macro defined elsewhere -- TODO confirm).
 * When iconv() reports (size_t)-1 a second call with all-NULL buffers is
 * made on the same descriptor.
 *
 * NOTE(review): the function's braces and its return statement are not
 * visible in this listing. */
97 static size_t sys_iconv(void *cd
,
98 const char **inbuf
, size_t *inbytesleft
,
99 char **outbuf
, size_t *outbytesleft
)
101 size_t ret
= iconv((iconv_t
)cd
,
102 discard_const_p(char *, inbuf
), inbytesleft
,
103 outbuf
, outbytesleft
);
/* On failure, call iconv() with NULL buffers to reset the descriptor. */
104 if (ret
== (size_t)-1) iconv(cd
, NULL
, NULL
, NULL
, NULL
);
110 * This is a simple portable iconv() implementation.
112 * It only knows about a very small number of character sets - just
113 * enough that Samba works on systems that don't have iconv.
/* Convert between two charsets using the descriptor cd.
 *
 * Two paths are visible: a call through cd->direct with cd->cd_direct, and
 * a loop that runs while *inbytesleft > 0, calling cd->pull to fill the
 * local buffer cvtbuf (SMB_ICONV_BUFSIZE bytes, default 2048) and then
 * cd->push to write to outbuf.
 *
 * NOTE(review): several lines are not visible in this listing -- the test
 * guarding the direct path, the declaration of bufsize, the rest of the
 * pull-failure condition and its body, the first arguments of the push
 * call, every return in the loop, and the closing braces / #endif. */
115 _PUBLIC_
size_t smb_iconv(smb_iconv_t cd
,
116 const char **inbuf
, size_t *inbytesleft
,
117 char **outbuf
, size_t *outbytesleft
)
119 /* in many cases we can go direct */
121 return cd
->direct(cd
->cd_direct
,
122 inbuf
, inbytesleft
, outbuf
, outbytesleft
);
125 /* otherwise we have to do it chunks at a time */
127 #ifndef SMB_ICONV_BUFSIZE
128 #define SMB_ICONV_BUFSIZE 2048
131 char cvtbuf
[SMB_ICONV_BUFSIZE
];
133 while (*inbytesleft
> 0) {
/* Both cursors start at the beginning of cvtbuf: bufp1 is handed to the
 * pull step below; bufp2 is presumably what the push step reads from. */
134 char *bufp1
= cvtbuf
;
135 const char *bufp2
= cvtbuf
;
136 int saved_errno
= errno
;
137 bool pull_failed
= false;
138 bufsize
= SMB_ICONV_BUFSIZE
;
/* Pull input into cvtbuf; bufsize goes in as the free space available. */
140 if (cd
->pull(cd
->cd_pull
,
141 inbuf
, inbytesleft
, &bufp1
, &bufsize
) == -1
/* Turn "space left in cvtbuf" into "bytes now held in cvtbuf". */
147 bufsize
= SMB_ICONV_BUFSIZE
- bufsize
;
149 if (cd
->push(cd
->cd_push
,
151 outbuf
, outbytesleft
) == -1) {
153 } else if (pull_failed
) {
154 /* We want the pull errno if possible */
/**
 * Report whether a charset name denotes little-endian UTF-16.
 *
 * @param name  charset name, compared case-insensitively; must not be NULL
 * @return true for "UCS-2LE" or "UTF-16LE", false for any other name
 */
static bool is_utf16(const char *name)
{
	return strcasecmp(name, "UCS-2LE") == 0 ||
		strcasecmp(name, "UTF-16LE") == 0;
}
/* Destructor registered on the descriptor by smb_iconv_open_ex() via
 * talloc_set_destructor().  Under HAVE_NATIVE_ICONV it calls iconv_close()
 * on each of cd_pull, cd_push and cd_direct that is neither NULL nor
 * (iconv_t)-1.
 *
 * NOTE(review): the #endif, the return statement and the function's braces
 * are not visible in this listing. */
170 static int smb_iconv_t_destructor(smb_iconv_t hwd
)
172 #ifdef HAVE_NATIVE_ICONV
173 if (hwd
->cd_pull
!= NULL
&& hwd
->cd_pull
!= (iconv_t
)-1)
174 iconv_close(hwd
->cd_pull
);
175 if (hwd
->cd_push
!= NULL
&& hwd
->cd_push
!= (iconv_t
)-1)
176 iconv_close(hwd
->cd_push
);
177 if (hwd
->cd_direct
!= NULL
&& hwd
->cd_direct
!= (iconv_t
)-1)
178 iconv_close(hwd
->cd_direct
);
/* Create a conversion descriptor from charset `fromcode` to `tocode`.
 *
 * What the visible code does, in order:
 *  - allocates the descriptor with talloc_named() under mem_ctx (named
 *    "iconv(%s,%s)"), zeroes it and installs smb_iconv_t_destructor;
 *  - if fromcode and tocode are identical (strcmp, so case-sensitive),
 *    sets ->direct to iconv_copy;
 *  - looks both names up in builtin_functions (case-insensitively); a
 *    match is accepted only when use_builtin_handlers is true or the entry
 *    has samba_internal_charset set;
 *  - with HAVE_NATIVE_ICONV, opens system descriptors converting to and
 *    from "UTF-16LE" (falling back to "UCS-2LE") and points ->pull /
 *    ->push at sys_iconv when that succeeds;
 *  - when one side is UTF-16LE/UCS-2LE (is_utf16), routes the conversion
 *    through ->direct using the other side's handler;
 *  - otherwise fills any unset ->pull / ->push from the builtin entries.
 * Two `return (smb_iconv_t)-1` failure exits are visible.
 *
 * NOTE(review): many lines are not visible in this listing -- the local
 * declarations of ret and i, the size argument to talloc_named(), the
 * allocation check, the conditions guarding the iconv_open() calls, the
 * bodies of the two "== NULL" failure tests, all successful returns, the
 * cleanup before the final failure return, and the closing braces and
 * #endif lines. */
184 _PUBLIC_ smb_iconv_t
smb_iconv_open_ex(TALLOC_CTX
*mem_ctx
, const char *tocode
,
185 const char *fromcode
, bool use_builtin_handlers
)
188 const struct charset_functions
*from
=NULL
, *to
=NULL
;
191 ret
= (smb_iconv_t
)talloc_named(mem_ctx
,
193 "iconv(%s,%s)", tocode
, fromcode
);
196 return (smb_iconv_t
)-1;
198 memset(ret
, 0, sizeof(*ret
));
199 talloc_set_destructor(ret
, smb_iconv_t_destructor
);
201 /* check for the simplest null conversion */
202 if (strcmp(fromcode
, tocode
) == 0) {
203 ret
->direct
= iconv_copy
;
207 /* check if we have a builtin function for this conversion */
208 for (i
=0;i
<ARRAY_SIZE(builtin_functions
);i
++) {
209 if (strcasecmp(fromcode
, builtin_functions
[i
].name
) == 0) {
210 if (use_builtin_handlers
|| builtin_functions
[i
].samba_internal_charset
) {
211 from
= &builtin_functions
[i
];
214 if (strcasecmp(tocode
, builtin_functions
[i
].name
) == 0) {
215 if (use_builtin_handlers
|| builtin_functions
[i
].samba_internal_charset
) {
216 to
= &builtin_functions
[i
];
/* NOTE(review): the line that closes the comment opened just below is not
 * visible in this listing. */
221 #ifdef HAVE_NATIVE_ICONV
222 /* the from and to variables indicate a samba module or
223 * internal conversion, ret->pull and ret->push are
224 * initialised only in this block for iconv based
228 ret
->cd_pull
= iconv_open("UTF-16LE", fromcode
);
229 if (ret
->cd_pull
== (iconv_t
)-1)
230 ret
->cd_pull
= iconv_open("UCS-2LE", fromcode
);
231 if (ret
->cd_pull
!= (iconv_t
)-1) {
232 ret
->pull
= sys_iconv
;
237 ret
->cd_push
= iconv_open(tocode
, "UTF-16LE");
238 if (ret
->cd_push
== (iconv_t
)-1)
239 ret
->cd_push
= iconv_open(tocode
, "UCS-2LE");
240 if (ret
->cd_push
!= (iconv_t
)-1) {
241 ret
->push
= sys_iconv
;
246 if (ret
->pull
== NULL
&& from
== NULL
) {
250 if (ret
->push
== NULL
&& to
== NULL
) {
254 /* check for conversion to/from ucs2 */
255 if (is_utf16(fromcode
) && to
) {
256 ret
->direct
= to
->push
;
259 if (is_utf16(tocode
) && from
) {
260 ret
->direct
= from
->pull
;
/* With native iconv and one side UTF-16LE/UCS-2LE, the system descriptor
 * already opened for the other side is reused as the direct descriptor. */
264 #ifdef HAVE_NATIVE_ICONV
265 if (is_utf16(fromcode
)) {
266 ret
->direct
= sys_iconv
;
267 ret
->cd_direct
= ret
->cd_push
;
271 if (is_utf16(tocode
)) {
272 ret
->direct
= sys_iconv
;
273 ret
->cd_direct
= ret
->cd_pull
;
279 /* the general case has to go via a buffer */
280 if (!ret
->pull
) ret
->pull
= from
->pull
;
281 if (!ret
->push
) ret
->push
= to
->push
;
287 return (smb_iconv_t
)-1;
291 simple iconv_open() wrapper
/* Convenience form of smb_iconv_open_ex(): passes a NULL talloc context
 * and use_builtin_handlers = true, and returns its result unchanged.
 * NOTE(review): the function's braces are not visible in this listing. */
293 _PUBLIC_ smb_iconv_t
smb_iconv_open(const char *tocode
, const char *fromcode
)
295 return smb_iconv_open_ex(NULL
, tocode
, fromcode
, true);
299 simple iconv_close() wrapper
/* Takes a conversion descriptor and returns an int.
 * NOTE(review): the body of this function is not visible in this listing,
 * so how the descriptor is released and what is returned cannot be stated
 * from here -- presumably it frees a descriptor obtained from
 * smb_iconv_open()/smb_iconv_open_ex(); verify against the full file. */
301 _PUBLIC_
int smb_iconv_close(smb_iconv_t cd
)
308 /**********************************************************************
309 the following functions implement the builtin character sets in Samba
310 and also the "test" character sets that are designed to test
311 multi-byte character set support for english users
312 ***********************************************************************/
315 this takes an ASCII sequence and produces a UTF16 sequence
317 The first 128 codepoints of ASCII match the first 128 codepoints
318 of unicode, and so can be put into the first byte of UTF16LE
/**
 * Convert 7-bit ASCII to UTF-16LE.
 *
 * Each input byte becomes one 16-bit unit: the byte itself followed by a
 * zero high byte.  All four cursor/count arguments are advanced past
 * whatever was converted, including when an error is returned.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the ASCII input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UTF-16LE output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EILSEQ (a byte has its high bit set) or E2BIG (output full
 *         while input remains)
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;
		(*inbytesleft)  -= 1;
		(*outbytesleft) -= 2;
		(*inbuf)  += 1;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		/* ran out of output space before the input was finished */
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
348 this takes a UTF16 sequence and produces an ASCII sequence
350 The first 128 codepoints of ASCII match the first 128 codepoints
351 of unicode, and so can be read directly from the first byte of UTF16LE
/**
 * Convert UTF-16LE to 7-bit ASCII.
 *
 * Each 16-bit unit must have a zero high byte and a low byte below 0x80;
 * the low byte is copied to the output.  All four cursor/count arguments
 * are advanced past whatever was converted, including on error.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UTF-16LE input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the ASCII output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EILSEQ (unit is not ASCII), EINVAL (a single trailing byte is
 *         left over) or E2BIG (output full while input remains)
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
		    (*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		/* half of a 16-bit unit: incomplete input */
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return ir_count;
}
387 this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
389 The first 256 codepoints of latin1 matches the first 256 codepoints
390 of unicode, and so can be put into the first byte of UTF16LE
/**
 * Convert ISO-8859-1 (latin1) to UTF-16LE.
 *
 * Every input byte is valid: it becomes the low byte of a 16-bit unit
 * whose high byte is zero.  All four cursor/count arguments are advanced
 * past whatever was converted, including on error.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the latin1 input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UTF-16LE output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno E2BIG when
 *         the output filled up while input remained
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;
		(*inbytesleft)  -= 1;
		(*outbytesleft) -= 2;
		(*inbuf)  += 1;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
414 this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
416 The first 256 codepoints of latin1 matches the first 256 codepoints
417 of unicode, and so can be read directly from the first byte of UTF16LE
/**
 * Convert UTF-16LE to ISO-8859-1 (latin1).
 *
 * Each 16-bit unit must have a zero high byte (code point below 0x100);
 * its low byte is copied to the output.  The unit is checked before
 * anything is stored, so a rejected unit leaves the output untouched.
 * All four cursor/count arguments are advanced past whatever was
 * converted, including on error.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UTF-16LE input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the latin1 output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EILSEQ (unit is outside latin1), EINVAL (a single trailing byte
 *         is left over) or E2BIG (output full while input remains)
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if ((*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		/* half of a 16-bit unit: incomplete input */
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return ir_count;
}
/* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/**
 * Convert the "UCS2-HEX" test encoding to UTF-16LE.
 *
 * In this encoding a byte other than '@' stands for itself, and '@'
 * followed by exactly four hex digits stands for that 16-bit value.
 * The digits are parsed by hand within the five bytes already known to be
 * present, because the input is a counted buffer and need not be
 * NUL-terminated.  All four cursor/count arguments are advanced past
 * whatever was converted, including on error.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UCS2-HEX input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UTF-16LE output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EINVAL ('@' with fewer than four bytes after it), EILSEQ (a
 *         non-hex character after '@') or E2BIG (output full while input
 *         remains)
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = (char)(v & 0xff);
		(*outbuf)[1] = (char)(v >> 8);
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
 * Convert UTF-16LE to the "UCS2-HEX" test encoding.
 *
 * A unit below 0x80 other than '@' is written as a single byte; every
 * other unit (including '@' itself) is written as '@' followed by four
 * lowercase hex digits.  All four cursor/count arguments are advanced
 * past whatever was converted, including on error.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UTF-16LE input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UCS2-HEX output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         E2BIG (not enough output space) or EINVAL (a single trailing
 *         byte is left over)
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		char buf[6];
		unsigned int unit;

		if ((*inbuf)[1] == 0 &&
		    ((*inbuf)[0] & 0x80) == 0 &&
		    (*inbuf)[0] != '@') {
			(*outbuf)[0] = (*inbuf)[0];
			(*inbytesleft)  -= 2;
			(*outbytesleft) -= 1;
			(*inbuf)  += 2;
			(*outbuf) += 1;
			continue;
		}

		if (*outbytesleft < 5) {
			errno = E2BIG;
			return (size_t)-1;
		}

		/* read the 16-bit unit as little-endian, byte by byte */
		unit = (unsigned int)(uint8_t)(*inbuf)[0] |
			((unsigned int)(uint8_t)(*inbuf)[1] << 8);
		snprintf(buf, 6, "@%04x", unit);
		/* copy the five characters without the terminating NUL */
		memcpy(*outbuf, buf, 5);
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 5;
		(*inbuf)  += 2;
		(*outbuf) += 5;
	}

	if (*inbytesleft == 1) {
		/* half of a 16-bit unit: incomplete input */
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
 * Copy input to output with the two bytes of every 16-bit unit exchanged
 * (UTF-16BE <-> UTF-16LE).
 *
 * min(*inbytesleft, *outbytesleft) bytes are processed.  If that count is
 * odd, the final unpaired input byte is consumed and a zero byte is
 * written in its place.  The count is kept in a size_t so large buffers
 * are not truncated, and each pair is read before it is written so that
 * converting a buffer in place (*inbuf == *outbuf) works.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno E2BIG when
 *         the output was too small for the whole input
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		char first = (*inbuf)[i];
		char second = (*inbuf)[i + 1];
		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		/* unpaired trailing byte: emit a zero in its place */
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft)  -= n;
	(*outbytesleft) -= n;
	(*inbuf)  += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/**
 * Identity conversion: copy input bytes to the output unchanged.
 *
 * min(*inbytesleft, *outbytesleft) bytes are copied with memmove(), so
 * the two buffers may overlap.  The count is kept in a size_t so large
 * buffers are not truncated.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno E2BIG when
 *         the output was too small for the whole input
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft)  -= n;
	(*outbytesleft) -= n;
	(*inbuf)  += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
584 this takes a UTF8 sequence and produces a UTF16 sequence
/**
 * Convert UTF-8 to UTF-16LE.
 *
 * Handles 1- to 4-byte sequences.  A 4-byte sequence encoding a value
 * below 0x10000 is accepted and written as a single unit; values from
 * 0x10000 to 0x10FFFF are written as a surrogate pair; values above
 * 0x10FFFF cannot be represented in UTF-16 and are rejected.
 *
 * On both success and failure the four cursor/count arguments are updated
 * to point just past the last sequence that was converted, so on failure
 * *inbuf points at the offending sequence.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UTF-8 input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UTF-16LE output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EILSEQ (truncated or malformed sequence, or value above
 *         0x10FFFF), EINVAL (lead byte of a 5+ byte form or a stray
 *         continuation byte) or E2BIG (output full while input remains)
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = 0;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* one byte: plain ASCII */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two bytes: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0] >> 2) & 0x7;
			uc[0] = (uint8_t)((c[0] << 6) | (c[1] & 0x3f));
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three bytes: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (uint8_t)(((c[0] & 0xF) << 4) | ((c[1] >> 2) & 0xF));
			uc[0] = (uint8_t)((c[1] << 6) | (c[2] & 0x3f));
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			unsigned int codepoint;

			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3] & 0x3fu) |
				((c[2] & 0x3fu) << 6) |
				((c[1] & 0x3fu) << 12) |
				((c[0] & 0x7u) << 18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left  -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}
			if (codepoint > 0x10ffff) {
				/* beyond what a surrogate pair can hold; the
				   high byte below would spill out of the
				   0xD8..0xDB range */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate 0xD800 | top 10 bits, then
			   low surrogate 0xDC00 | bottom 10 bits */
			uc[0] = (codepoint >> 10) & 0xFF;
			uc[1] = (uint8_t)(((codepoint >> 18) & 0x3) | 0xd8);
			uc[2] = codepoint & 0xFF;
			uc[3] = (uint8_t)(((codepoint >> 8) & 0x3) | 0xdc);
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}
	goto done;

error:
	ret = (size_t)-1;
done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
705 this takes a UTF16 sequence and produces a UTF8 sequence
/**
 * Convert UTF-16LE to UTF-8.
 *
 * Units below 0x80 become one byte, below 0x800 two bytes, other
 * non-surrogate units three bytes, and a high/low surrogate pair four
 * bytes.
 *
 * On both success and failure the four cursor/count arguments are updated
 * to point just past the last unit that was converted, so on failure
 * *inbuf points at the offending unit.
 *
 * @param cd            unused
 * @param inbuf         in/out: cursor into the UTF-16LE input
 * @param inbytesleft   in/out: bytes remaining at *inbuf
 * @param outbuf        in/out: cursor into the UTF-8 output
 * @param outbytesleft  in/out: bytes of space remaining at *outbuf
 * @return 0 when all input was consumed; (size_t)-1 with errno set to
 *         EILSEQ (low surrogate first, or high surrogate not followed by
 *         a low one), EINVAL (input ends inside a unit or surrogate pair)
 *         or E2BIG (not enough output space)
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	uint8_t *c = (uint8_t *)*outbuf;
	const uint8_t *uc = (const uint8_t *)*inbuf;
	size_t ret = 0;

	while (in_left >= 2 && out_left >= 1) {
		unsigned int codepoint;

		if (uc[1] == 0 && !(uc[0] & 0x80)) {
			/* simplest case */
			c[0] = uc[0];
			in_left  -= 2;
			out_left -= 1;
			uc += 2;
			c  += 1;
			continue;
		}

		if ((uc[1] & 0xf8) == 0) {
			/* next simplest case */
			if (out_left < 2) {
				errno = E2BIG;
				goto error;
			}
			c[0] = (uint8_t)(0xc0 | (uc[0] >> 6) | (uc[1] << 2));
			c[1] = 0x80 | (uc[0] & 0x3f);
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			c  += 2;
			continue;
		}

		if ((uc[1] & 0xfc) == 0xdc) {
			/* its the second part of a 4 byte sequence. Illegal */
			if (in_left < 4) {
				errno = EINVAL;
			} else {
				errno = EILSEQ;
			}
			goto error;
		}

		if ((uc[1] & 0xfc) != 0xd8) {
			/* an ordinary unit in 0x0800..0xFFFF */
			codepoint = uc[0] | ((unsigned int)uc[1] << 8);
			if (out_left < 3) {
				errno = E2BIG;
				goto error;
			}
			c[0] = (uint8_t)(0xe0 | (codepoint >> 12));
			c[1] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3f));
			c[2] = (uint8_t)(0x80 | (codepoint & 0x3f));

			in_left  -= 2;
			out_left -= 3;
			uc += 2;
			c  += 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (in_left < 4) {
			errno = EINVAL;
			goto error;
		}
		if ((uc[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			goto error;
		}
		codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3u) << 8) |
				       ((unsigned int)uc[0] << 10) |
				       ((uc[1] & 0x3u) << 18));

		if (out_left < 4) {
			errno = E2BIG;
			goto error;
		}
		c[0] = (uint8_t)(0xf0 | (codepoint >> 18));
		c[1] = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3f));
		c[2] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3f));
		c[3] = (uint8_t)(0x80 | (codepoint & 0x3f));

		in_left  -= 4;
		out_left -= 4;
		uc += 4;
		c  += 4;
	}

	if (in_left == 1) {
		/* half of a 16-bit unit: incomplete input */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 1) {
		errno = E2BIG;
		goto error;
	}
	goto done;

error:
	ret = (size_t)-1;
done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)uc;
	*outbuf = (char *)c;
	return ret;
}
823 this takes a UTF16 munged sequence, modifies it according to the
824 string2key rules, and produces a UTF16 sequence
828 1) any 0x0000 characters are mapped to 0x0001
830 2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
831 without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
832 U+FFFD (REPLACEMENT CHARACTER).
834 3) the same for any low surrogate that was not preceded by a high surrogate.
837 static size_t utf16_munged_pull(void *cd
, const char **inbuf
, size_t *inbytesleft
,
838 char **outbuf
, size_t *outbytesleft
)
840 size_t in_left
=*inbytesleft
, out_left
=*outbytesleft
;
841 uint8_t *c
= (uint8_t *)*outbuf
;
842 const uint8_t *uc
= (const uint8_t *)*inbuf
;
844 while (in_left
>= 2 && out_left
>= 2) {
845 unsigned int codepoint
= uc
[0] | (uc
[1]<<8);
847 if (codepoint
== 0) {
851 if ((codepoint
& 0xfc00) == 0xd800) {
852 /* a high surrogate */
853 unsigned int codepoint2
;
858 codepoint2
= uc
[2] | (uc
[3]<<8);
859 if ((codepoint2
& 0xfc00) != 0xdc00) {
860 /* high surrogate not followed by low
861 surrogate: convert to 0xfffd */
877 if ((codepoint
& 0xfc00) == 0xdc00) {
878 /* low surrogate not preceded by high
879 surrogate: convert to 0xfffd */
884 c
[0] = codepoint
& 0xFF;
885 c
[1] = (codepoint
>>8) & 0xFF;
904 *inbytesleft
= in_left
;
905 *outbytesleft
= out_left
;
906 *inbuf
= (const char *)uc
;
912 *inbytesleft
= in_left
;
913 *outbytesleft
= out_left
;
914 *inbuf
= (const char *)uc
;