2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Martin Pool 2003
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
42 static smb_iconv_t conv_handles
[NUM_CHARSETS
][NUM_CHARSETS
];
46 * Return the name of a charset to give to iconv().
48 static const char *charset_name(charset_t ch
)
50 const char *ret
= NULL
;
52 if (ch
== CH_UCS2
) ret
= "UCS-2LE";
53 else if (ch
== CH_UNIX
) ret
= lp_unix_charset();
54 else if (ch
== CH_DOS
) ret
= lp_dos_charset();
55 else if (ch
== CH_DISPLAY
) ret
= lp_display_charset();
56 else if (ch
== CH_UTF8
) ret
= "UTF8";
58 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
59 if (ret
&& strcasecmp(ret
, "LOCALE") == 0) {
60 const char *ln
= NULL
;
63 setlocale(LC_ALL
, "");
65 ln
= nl_langinfo(CODESET
);
67 /* Check whether the charset name is supported
69 smb_iconv_t handle
= smb_iconv_open(ln
,"UCS-2LE");
70 if (handle
== (smb_iconv_t
) -1) {
71 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln
));
74 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln
));
75 smb_iconv_close(handle
);
82 if (!ret
|| !*ret
) ret
= "ASCII";
86 void lazy_initialize_conv(void)
88 static int initialized
= False
;
98 * Initialize iconv conversion descriptors.
100 * This is called the first time it is needed, and also called again
101 * every time the configuration is reloaded, because the charset or
102 * codepage might have changed.
104 void init_iconv(void)
107 BOOL did_reload
= False
;
109 /* so that charset_name() works we need to get the UNIX<->UCS2 going
111 if (!conv_handles
[CH_UNIX
][CH_UCS2
])
112 conv_handles
[CH_UNIX
][CH_UCS2
] = smb_iconv_open("UCS-2LE", "ASCII");
114 if (!conv_handles
[CH_UCS2
][CH_UNIX
])
115 conv_handles
[CH_UCS2
][CH_UNIX
] = smb_iconv_open("ASCII", "UCS-2LE");
117 for (c1
=0;c1
<NUM_CHARSETS
;c1
++) {
118 for (c2
=0;c2
<NUM_CHARSETS
;c2
++) {
119 const char *n1
= charset_name((charset_t
)c1
);
120 const char *n2
= charset_name((charset_t
)c2
);
121 if (conv_handles
[c1
][c2
] &&
122 strcmp(n1
, conv_handles
[c1
][c2
]->from_name
) == 0 &&
123 strcmp(n2
, conv_handles
[c1
][c2
]->to_name
) == 0)
128 if (conv_handles
[c1
][c2
])
129 smb_iconv_close(conv_handles
[c1
][c2
]);
131 conv_handles
[c1
][c2
] = smb_iconv_open(n2
,n1
);
132 if (conv_handles
[c1
][c2
] == (smb_iconv_t
)-1) {
133 DEBUG(0,("Conversion from %s to %s not supported\n",
134 charset_name((charset_t
)c1
), charset_name((charset_t
)c2
)));
135 conv_handles
[c1
][c2
] = NULL
;
141 /* XXX: Does this really get called every time the dos
142 * codepage changes? */
143 /* XXX: Is the did_reload test too strict? */
144 init_doschar_table();
150 * Convert string from one encoding to another, making error checking etc
152 * @param src pointer to source string (multibyte or singlebyte)
153 * @param srclen length of the source string in bytes
154 * @param dest pointer to destination string (multibyte or singlebyte)
155 * @param destlen maximal length allowed for string
156 * @returns the number of bytes occupied in the destination
158 size_t convert_string(charset_t from
, charset_t to
,
159 void const *src
, size_t srclen
,
160 void *dest
, size_t destlen
)
164 const char* inbuf
= (const char*)src
;
165 char* outbuf
= (char*)dest
;
166 smb_iconv_t descriptor
;
168 if (srclen
== (size_t)-1)
169 srclen
= strlen(src
)+1;
171 lazy_initialize_conv();
173 descriptor
= conv_handles
[from
][to
];
175 if (descriptor
== (smb_iconv_t
)-1 || descriptor
== (smb_iconv_t
)0) {
176 /* conversion not supported, use as is */
177 size_t len
= MIN(srclen
,destlen
);
178 memcpy(dest
,src
,len
);
184 retval
= smb_iconv(descriptor
, &inbuf
, &i_len
, &outbuf
, &o_len
);
185 if(retval
==(size_t)-1) {
186 const char *reason
="unknown error";
189 reason
="Incomplete multibyte sequence";
192 reason
="No more room";
193 DEBUG(0, ("convert_string: Required %lu, available %lu\n",
194 (unsigned long)srclen
, (unsigned long)destlen
));
195 /* we are not sure we need srclen bytes,
196 may be more, may be less.
197 We only know we need more than destlen
201 reason
="Illegal multibyte sequence";
204 /* smb_panic(reason); */
206 return destlen
-o_len
;
210 * Convert between character sets, allocating a new buffer for the result.
212 * @param srclen length of source buffer.
213 * @param dest always set at least to NULL
214 * @note -1 is not accepted for srclen.
216 * @returns Size in bytes of the converted string; or -1 in case of error.
219 size_t convert_string_allocate(charset_t from
, charset_t to
,
220 void const *src
, size_t srclen
, void **dest
)
222 size_t i_len
, o_len
, destlen
;
224 const char *inbuf
= (const char *)src
;
226 smb_iconv_t descriptor
;
230 if (src
== NULL
|| srclen
== (size_t)-1)
233 lazy_initialize_conv();
235 descriptor
= conv_handles
[from
][to
];
237 if (descriptor
== (smb_iconv_t
)-1 || descriptor
== (smb_iconv_t
)0) {
238 /* conversion not supported, return -1*/
239 DEBUG(3, ("convert_string_allocate: conversion not supported!\n"));
243 destlen
= MAX(srclen
, 512);
246 destlen
= destlen
* 2;
247 ob
= (char *)Realloc(outbuf
, destlen
);
249 DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
257 retval
= smb_iconv(descriptor
,
260 if(retval
== (size_t)-1) {
261 const char *reason
="unknown error";
264 reason
="Incomplete multibyte sequence";
269 reason
="Illegal multibyte sequence";
272 DEBUG(0,("Conversion error: %s(%s)\n",reason
,inbuf
));
273 /* smb_panic(reason); */
277 destlen
= destlen
- o_len
;
278 *dest
= (char *)Realloc(ob
,destlen
);
279 if (destlen
&& !*dest
) {
280 DEBUG(0, ("convert_string_allocate: out of memory!\n"));
290 * Convert between character sets, allocating a new buffer using talloc for the result.
292 * @param srclen length of source buffer.
293 * @param dest always set at least to NULL
294 * @note -1 is not accepted for srclen.
296 * @returns Size in bytes of the converted string; or -1 in case of error.
298 static size_t convert_string_talloc(TALLOC_CTX
*ctx
, charset_t from
, charset_t to
,
299 void const *src
, size_t srclen
, void **dest
)
301 void *alloced_string
;
304 /* FIXME: Ridiculous to allocate two buffers and then copy the string! */
307 dest_len
=convert_string_allocate(from
, to
, src
, srclen
, &alloced_string
);
308 if (dest_len
== (size_t)-1)
310 *dest
= talloc_memdup(ctx
, alloced_string
, dest_len
);
311 SAFE_FREE(alloced_string
);
317 size_t unix_strupper(const char *src
, size_t srclen
, char *dest
, size_t destlen
)
322 size
= push_ucs2_allocate(&buffer
, src
);
324 smb_panic("failed to create UCS2 buffer");
326 if (!strupper_w(buffer
) && (dest
== src
)) {
331 size
= convert_string(CH_UCS2
, CH_UNIX
, buffer
, size
, dest
, destlen
);
337 strdup() a unix string to upper case.
340 char *strdup_upper(const char *s
)
346 size
= push_ucs2_allocate(&buffer
, s
);
353 size
= pull_ucs2_allocate(&out_buffer
, buffer
);
363 size_t unix_strlower(const char *src
, size_t srclen
, char *dest
, size_t destlen
)
368 size
= convert_string_allocate(CH_UNIX
, CH_UCS2
, src
, srclen
,
371 smb_panic("failed to create UCS2 buffer");
373 if (!strlower_w(buffer
) && (dest
== src
)) {
377 size
= convert_string(CH_UCS2
, CH_UNIX
, buffer
, size
, dest
, destlen
);
383 strdup() a unix string to lower case.
386 char *strdup_lower(const char *s
)
392 size
= push_ucs2_allocate(&buffer
, s
);
399 size
= pull_ucs2_allocate(&out_buffer
, buffer
);
409 static size_t ucs2_align(const void *base_ptr
, const void *p
, int flags
)
411 if (flags
& (STR_NOALIGN
|STR_ASCII
))
413 return PTR_DIFF(p
, base_ptr
) & 1;
418 * Copy a string from a char* unix src to a dos codepage string destination.
420 * @return the number of bytes occupied by the string in the destination.
422 * @param flags can include
424 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
425 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
428 * @param dest_len the maximum length in bytes allowed in the
429 * destination. If @p dest_len is -1 then no maximum is used.
431 size_t push_ascii(void *dest
, const char *src
, size_t dest_len
, int flags
)
433 size_t src_len
= strlen(src
);
436 /* treat a pstring as "unlimited" length */
437 if (dest_len
== (size_t)-1)
438 dest_len
= sizeof(pstring
);
440 if (flags
& STR_UPPER
) {
441 pstrcpy(tmpbuf
, src
);
446 if (flags
& (STR_TERMINATE
| STR_TERMINATE_ASCII
))
449 return convert_string(CH_UNIX
, CH_DOS
, src
, src_len
, dest
, dest_len
);
452 size_t push_ascii_fstring(void *dest
, const char *src
)
454 return push_ascii(dest
, src
, sizeof(fstring
), STR_TERMINATE
);
457 size_t push_ascii_pstring(void *dest
, const char *src
)
459 return push_ascii(dest
, src
, sizeof(pstring
), STR_TERMINATE
);
463 * Copy a string from a dos codepage source to a unix char* destination.
465 * The resulting string in "dest" is always null terminated.
467 * @param flags can have:
469 * <dt>STR_TERMINATE</dt>
470 * <dd>STR_TERMINATE means the string in @p src
471 * is null terminated, and src_len is ignored.</dd>
474 * @param src_len is the length of the source area in bytes.
475 * @returns the number of bytes occupied by the string in @p src.
477 size_t pull_ascii(char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
481 if (dest_len
== (size_t)-1)
482 dest_len
= sizeof(pstring
);
484 if (flags
& STR_TERMINATE
) {
485 if (src_len
== (size_t)-1) {
486 src_len
= strlen(src
) + 1;
488 size_t len
= strnlen(src
, src_len
);
495 ret
= convert_string(CH_DOS
, CH_UNIX
, src
, src_len
, dest
, dest_len
);
498 dest
[MIN(ret
, dest_len
-1)] = 0;
505 size_t pull_ascii_pstring(char *dest
, const void *src
)
507 return pull_ascii(dest
, src
, sizeof(pstring
), -1, STR_TERMINATE
);
510 size_t pull_ascii_fstring(char *dest
, const void *src
)
512 return pull_ascii(dest
, src
, sizeof(fstring
), -1, STR_TERMINATE
);
516 * Copy a string from a char* src to a unicode destination.
518 * @returns the number of bytes occupied by the string in the destination.
520 * @param flags can have:
523 * <dt>STR_TERMINATE <dd>means include the null termination.
524 * <dt>STR_UPPER <dd>means uppercase in the destination.
525 * <dt>STR_NOALIGN <dd>means don't do alignment.
528 * @param dest_len is the maximum length allowed in the
529 * destination. If dest_len is -1 then no maximum is used.
531 size_t push_ucs2(const void *base_ptr
, void *dest
, const char *src
, size_t dest_len
, int flags
)
534 size_t src_len
= strlen(src
);
536 /* treat a pstring as "unlimited" length */
537 if (dest_len
== (size_t)-1)
538 dest_len
= sizeof(pstring
);
540 if (flags
& STR_TERMINATE
)
543 if (ucs2_align(base_ptr
, dest
, flags
)) {
545 dest
= (void *)((char *)dest
+ 1);
546 if (dest_len
) dest_len
--;
550 /* ucs2 is always a multiple of 2 bytes */
553 len
+= convert_string(CH_UNIX
, CH_UCS2
, src
, src_len
, dest
, dest_len
);
555 if (flags
& STR_UPPER
) {
556 smb_ucs2_t
*dest_ucs2
= dest
;
558 for (i
= 0; i
< (dest_len
/ 2) && dest_ucs2
[i
]; i
++) {
559 smb_ucs2_t v
= toupper_w(dest_ucs2
[i
]);
560 if (v
!= dest_ucs2
[i
]) {
571 * Copy a string from a unix char* src to a UCS2 destination,
572 * allocating a buffer using talloc().
574 * @param dest always set at least to NULL
576 * @returns The number of bytes occupied by the string in the destination
577 * or -1 in case of error.
579 size_t push_ucs2_talloc(TALLOC_CTX
*ctx
, smb_ucs2_t
**dest
, const char *src
)
581 size_t src_len
= strlen(src
)+1;
584 return convert_string_talloc(ctx
, CH_UNIX
, CH_UCS2
, src
, src_len
, (void **)dest
);
589 * Copy a string from a unix char* src to a UCS2 destination, allocating a buffer
591 * @param dest always set at least to NULL
593 * @returns The number of bytes occupied by the string in the destination
594 * or -1 in case of error.
597 size_t push_ucs2_allocate(smb_ucs2_t
**dest
, const char *src
)
599 size_t src_len
= strlen(src
)+1;
602 return convert_string_allocate(CH_UNIX
, CH_UCS2
, src
, src_len
, (void **)dest
);
606 Copy a string from a char* src to a UTF-8 destination.
607 Return the number of bytes occupied by the string in the destination
609 STR_TERMINATE means include the null termination
610 STR_UPPER means uppercase in the destination
611 dest_len is the maximum length allowed in the destination. If dest_len
612 is -1 then no maximum is used.
615 static size_t push_utf8(void *dest
, const char *src
, size_t dest_len
, int flags
)
617 size_t src_len
= strlen(src
);
620 /* treat a pstring as "unlimited" length */
621 if (dest_len
== (size_t)-1)
622 dest_len
= sizeof(pstring
);
624 if (flags
& STR_UPPER
) {
625 pstrcpy(tmpbuf
, src
);
630 if (flags
& STR_TERMINATE
)
633 return convert_string(CH_UNIX
, CH_UTF8
, src
, src_len
, dest
, dest_len
);
636 size_t push_utf8_fstring(void *dest
, const char *src
)
638 return push_utf8(dest
, src
, sizeof(fstring
), STR_TERMINATE
);
642 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
644 * @param dest always set at least to NULL
646 * @returns The number of bytes occupied by the string in the destination
649 size_t push_utf8_talloc(TALLOC_CTX
*ctx
, char **dest
, const char *src
)
651 size_t src_len
= strlen(src
)+1;
654 return convert_string_talloc(ctx
, CH_UNIX
, CH_UTF8
, src
, src_len
, (void**)dest
);
658 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer
660 * @param dest always set at least to NULL
662 * @returns The number of bytes occupied by the string in the destination
665 size_t push_utf8_allocate(char **dest
, const char *src
)
667 size_t src_len
= strlen(src
)+1;
670 return convert_string_allocate(CH_UNIX
, CH_UTF8
, src
, src_len
, (void **)dest
);
674 Copy a string from a ucs2 source to a unix char* destination.
676 STR_TERMINATE means the string in src is null terminated.
677 STR_NOALIGN means don't try to align.
678 if STR_TERMINATE is set then src_len is ignored if it is -1.
679 src_len is the length of the source area in bytes
680 Return the number of bytes occupied by the string in src.
681 The resulting string in "dest" is always null terminated.
684 size_t pull_ucs2(const void *base_ptr
, char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
688 if (dest_len
== (size_t)-1)
689 dest_len
= sizeof(pstring
);
691 if (ucs2_align(base_ptr
, src
, flags
)) {
692 src
= (const void *)((const char *)src
+ 1);
697 if (flags
& STR_TERMINATE
) {
698 if (src_len
== (size_t)-1) {
699 src_len
= strlen_w(src
)*2 + 2;
701 size_t len
= strnlen_w(src
, src_len
/2);
708 /* ucs2 is always a multiple of 2 bytes */
709 if (src_len
!= (size_t)-1)
712 ret
= convert_string(CH_UCS2
, CH_UNIX
, src
, src_len
, dest
, dest_len
);
714 dest
[MIN(ret
, dest_len
-1)] = 0;
721 size_t pull_ucs2_pstring(char *dest
, const void *src
)
723 return pull_ucs2(NULL
, dest
, src
, sizeof(pstring
), -1, STR_TERMINATE
);
726 size_t pull_ucs2_fstring(char *dest
, const void *src
)
728 return pull_ucs2(NULL
, dest
, src
, sizeof(fstring
), -1, STR_TERMINATE
);
732 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
734 * @param dest always set at least to NULL
736 * @returns The number of bytes occupied by the string in the destination
739 size_t pull_ucs2_talloc(TALLOC_CTX
*ctx
, char **dest
, const smb_ucs2_t
*src
)
741 size_t src_len
= (strlen_w(src
)+1) * sizeof(smb_ucs2_t
);
743 return convert_string_talloc(ctx
, CH_UCS2
, CH_UNIX
, src
, src_len
, (void **)dest
);
747 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer
749 * @param dest always set at least to NULL
751 * @returns The number of bytes occupied by the string in the destination
754 size_t pull_ucs2_allocate(char **dest
, const smb_ucs2_t
*src
)
756 size_t src_len
= (strlen_w(src
)+1) * sizeof(smb_ucs2_t
);
758 return convert_string_allocate(CH_UCS2
, CH_UNIX
, src
, src_len
, (void **)dest
);
762 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
764 * @param dest always set at least to NULL
766 * @returns The number of bytes occupied by the string in the destination
769 size_t pull_utf8_talloc(TALLOC_CTX
*ctx
, char **dest
, const char *src
)
771 size_t src_len
= strlen(src
)+1;
773 return convert_string_talloc(ctx
, CH_UTF8
, CH_UNIX
, src
, src_len
, (void **)dest
);
777 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer
779 * @param dest always set at least to NULL
781 * @returns The number of bytes occupied by the string in the destination
784 size_t pull_utf8_allocate(void **dest
, const char *src
)
786 size_t src_len
= strlen(src
)+1;
788 return convert_string_allocate(CH_UTF8
, CH_UNIX
, src
, src_len
, dest
);
792 Copy a string from a char* src to a unicode or ascii
793 dos codepage destination choosing unicode or ascii based on the
794 flags in the SMB buffer starting at base_ptr.
795 Return the number of bytes occupied by the string in the destination.
797 STR_TERMINATE means include the null termination.
798 STR_UPPER means uppercase in the destination.
799 STR_ASCII use ascii even with unicode packet.
800 STR_NOALIGN means don't do alignment.
801 dest_len is the maximum length allowed in the destination. If dest_len
802 is -1 then no maximum is used.
805 size_t push_string_fn(const char *function
, unsigned int line
, const void *base_ptr
, void *dest
, const char *src
, size_t dest_len
, int flags
)
808 /* We really need to zero fill here, not clobber
809 * region, as we want to ensure that valgrind thinks
810 * all of the outgoing buffer has been written to
811 * so a send() or write() won't trap an error.
815 if (dest_len
!= (size_t)-1)
816 clobber_region(function
, line
, dest
, dest_len
);
818 if (dest_len
!= (size_t)-1)
819 memset(dest
, '\0', dest_len
);
823 if (!(flags
& STR_ASCII
) && \
824 ((flags
& STR_UNICODE
|| \
825 (SVAL(base_ptr
, smb_flg2
) & FLAGS2_UNICODE_STRINGS
)))) {
826 return push_ucs2(base_ptr
, dest
, src
, dest_len
, flags
);
828 return push_ascii(dest
, src
, dest_len
, flags
);
833 Copy a string from a unicode or ascii source (depending on
834 the packet flags) to a char* destination.
836 STR_TERMINATE means the string in src is null terminated.
837 STR_UNICODE means to force as unicode.
838 STR_ASCII use ascii even with unicode packet.
839 STR_NOALIGN means don't do alignment.
840 if STR_TERMINATE is set then src_len is ignored if it is -1
841 src_len is the length of the source area in bytes.
842 Return the number of bytes occupied by the string in src.
843 The resulting string in "dest" is always null terminated.
846 size_t pull_string_fn(const char *function
, unsigned int line
, const void *base_ptr
, char *dest
, const void *src
, size_t dest_len
, size_t src_len
, int flags
)
848 if (dest_len
!= (size_t)-1)
849 clobber_region(function
, line
, dest
, dest_len
);
851 if (!(flags
& STR_ASCII
) && \
852 ((flags
& STR_UNICODE
|| \
853 (SVAL(base_ptr
, smb_flg2
) & FLAGS2_UNICODE_STRINGS
)))) {
854 return pull_ucs2(base_ptr
, dest
, src
, dest_len
, src_len
, flags
);
856 return pull_ascii(dest
, src
, dest_len
, src_len
, flags
);
859 size_t align_string(const void *base_ptr
, const char *p
, int flags
)
861 if (!(flags
& STR_ASCII
) && \
862 ((flags
& STR_UNICODE
|| \
863 (SVAL(base_ptr
, smb_flg2
) & FLAGS2_UNICODE_STRINGS
)))) {
864 return ucs2_align(base_ptr
, p
, flags
);