2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "lib/util/charset/charset.h"
25 #include "system/locale.h"
26 #include "dynconfig.h"
34 * @brief Unicode string manipulation
/*
 * The two tables that define the unicode case handling.  They are
 * loaded at startup, either via mmap() or read(), from the lib
 * directory.  NULL means the table has not been loaded yet.
 */
static void *upcase_table = NULL;
static void *lowcase_table = NULL;
43 /*******************************************************************
44 load the case handling tables
46 This is the function that should be called from library code.
47 ********************************************************************/
48 void load_case_tables_library(void)
52 mem_ctx
= talloc_init("load_case_tables");
54 smb_panic("No memory for case_tables");
56 upcase_table
= map_file(talloc_asprintf(mem_ctx
, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57 lowcase_table
= map_file(talloc_asprintf(mem_ctx
, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
59 if (upcase_table
== NULL
) {
60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61 upcase_table
= (void *)-1;
63 if (lowcase_table
== NULL
) {
64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65 lowcase_table
= (void *)-1;
/*******************************************************************
 Application entry point for loading the case handling tables.

 Call this ONLY from main() in application code, never from a library:
 it sets the locale, and a library cannot tell whether the calling
 program has already done setlocale() to another value.
********************************************************************/
void load_case_tables(void)
{
	/*
	 * Global hook: take the locale from the environment before the
	 * tables are loaded, so that LOCALE can be used as a codepage.
	 */
	(void)setlocale(LC_ALL, "");

	load_case_tables_library();
}
88 Convert a codepoint_t to upper case.
90 _PUBLIC_ codepoint_t
toupper_m(codepoint_t val
)
95 if (upcase_table
== NULL
) {
96 load_case_tables_library();
98 if (upcase_table
== (void *)-1) {
101 if (val
& 0xFFFF0000) {
104 return SVAL(upcase_table
, val
*2);
108 Convert a codepoint_t to lower case.
110 _PUBLIC_ codepoint_t
tolower_m(codepoint_t val
)
115 if (lowcase_table
== NULL
) {
116 load_case_tables_library();
118 if (lowcase_table
== (void *)-1) {
121 if (val
& 0xFFFF0000) {
124 return SVAL(lowcase_table
, val
*2);
128 If we upper cased this character, would we get the same character?
130 _PUBLIC_
bool islower_m(codepoint_t val
)
132 return (toupper_m(val
) != val
);
136 If we lower cased this character, would we get the same character?
138 _PUBLIC_
bool isupper_m(codepoint_t val
)
140 return (tolower_m(val
) != val
);
144 compare two codepoints case insensitively
146 _PUBLIC_
int codepoint_cmpi(codepoint_t c1
, codepoint_t c2
)
149 toupper_m(c1
) == toupper_m(c2
)) {
156 struct smb_iconv_convenience
{
157 TALLOC_CTX
*child_ctx
;
158 const char *unix_charset
;
159 const char *dos_charset
;
160 const char *display_charset
;
162 smb_iconv_t conv_handles
[NUM_CHARSETS
][NUM_CHARSETS
];
/* Shared conversion context, created on demand by get_iconv_convenience(). */
struct smb_iconv_convenience *global_iconv_convenience = NULL;

/**
 Return the shared conversion context.

 On first use the context is created with smb_iconv_convenience_reinit()
 on the talloc autofree context, with dos charset "ASCII", unix charset
 "UTF-8" and display charset "ASCII", and no previous context to re-use.
**/
struct smb_iconv_convenience *get_iconv_convenience(void)
{
	if (global_iconv_convenience != NULL) {
		return global_iconv_convenience;
	}

	global_iconv_convenience = smb_iconv_convenience_reinit(
		talloc_autofree_context(),
		"ASCII", "UTF-8", "ASCII", true, NULL);

	return global_iconv_convenience;
}
176 * Return the name of a charset to give to iconv().
178 const char *charset_name(struct smb_iconv_convenience
*ic
, charset_t ch
)
181 case CH_UTF16
: return "UTF-16LE";
182 case CH_UNIX
: return ic
->unix_charset
;
183 case CH_DOS
: return ic
->dos_charset
;
184 case CH_DISPLAY
: return ic
->display_charset
;
185 case CH_UTF8
: return "UTF8";
186 case CH_UTF16BE
: return "UTF-16BE";
187 case CH_UTF16MUNGED
: return "UTF16_MUNGED";
194 re-initialize iconv conversion descriptors
196 static int close_iconv_convenience(struct smb_iconv_convenience
*data
)
199 for (c1
=0;c1
<NUM_CHARSETS
;c1
++) {
200 for (c2
=0;c2
<NUM_CHARSETS
;c2
++) {
201 if (data
->conv_handles
[c1
][c2
] != NULL
) {
202 if (data
->conv_handles
[c1
][c2
] != (smb_iconv_t
)-1) {
203 smb_iconv_close(data
->conv_handles
[c1
][c2
]);
205 data
->conv_handles
[c1
][c2
] = NULL
;
213 static const char *map_locale(const char *charset
)
215 if (strcmp(charset
, "LOCALE") != 0) {
218 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
223 ln
= nl_langinfo(CODESET
);
225 DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
228 /* Check whether the charset name is supported
230 handle
= smb_iconv_open(ln
, "UCS-2LE");
231 if (handle
== (smb_iconv_t
) -1) {
232 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln
));
235 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln
));
236 smb_iconv_close(handle
);
245 the old_ic is passed in here as the smb_iconv_convenience structure
246 is used as a global pointer in some places (eg. python modules). We
247 don't want to invalidate those global pointers, but we do want to
248 update them with the right charset information when loadparm
249 runs. To do that we need to re-use the structure pointer, but
250 re-fill the elements in the structure with the updated values
252 _PUBLIC_
struct smb_iconv_convenience
*smb_iconv_convenience_reinit(TALLOC_CTX
*mem_ctx
,
253 const char *dos_charset
,
254 const char *unix_charset
,
255 const char *display_charset
,
257 struct smb_iconv_convenience
*old_ic
)
259 struct smb_iconv_convenience
*ret
;
261 display_charset
= map_locale(display_charset
);
263 if (old_ic
!= NULL
) {
265 close_iconv_convenience(ret
);
266 talloc_free(ret
->child_ctx
);
269 ret
= talloc_zero(mem_ctx
, struct smb_iconv_convenience
);
275 /* we use a child context to allow us to free all ptrs without
276 freeing the structure itself */
277 ret
->child_ctx
= talloc_new(ret
);
278 if (ret
->child_ctx
== NULL
) {
282 talloc_set_destructor(ret
, close_iconv_convenience
);
284 ret
->dos_charset
= talloc_strdup(ret
->child_ctx
, dos_charset
);
285 ret
->unix_charset
= talloc_strdup(ret
->child_ctx
, unix_charset
);
286 ret
->display_charset
= talloc_strdup(ret
->child_ctx
, display_charset
);
287 ret
->native_iconv
= native_iconv
;
293 on-demand initialisation of conversion handles
295 smb_iconv_t
get_conv_handle(struct smb_iconv_convenience
*ic
,
296 charset_t from
, charset_t to
)
299 static bool initialised
;
301 if (initialised
== false) {
305 if (ic
->conv_handles
[from
][to
]) {
306 return ic
->conv_handles
[from
][to
];
309 n1
= charset_name(ic
, from
);
310 n2
= charset_name(ic
, to
);
312 ic
->conv_handles
[from
][to
] = smb_iconv_open_ex(ic
, n2
, n1
,
315 if (ic
->conv_handles
[from
][to
] == (smb_iconv_t
)-1) {
316 if ((from
== CH_DOS
|| to
== CH_DOS
) &&
317 strcasecmp(charset_name(ic
, CH_DOS
), "ASCII") != 0) {
318 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
319 charset_name(ic
, CH_DOS
)));
320 ic
->dos_charset
= "ASCII";
322 n1
= charset_name(ic
, from
);
323 n2
= charset_name(ic
, to
);
325 ic
->conv_handles
[from
][to
] =
326 smb_iconv_open_ex(ic
, n2
, n1
, ic
->native_iconv
);
330 return ic
->conv_handles
[from
][to
];
334 * Return the unicode codepoint for the next character in the input
335 * string in the given src_charset.
336 * The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
338 * Also return the number of bytes consumed (which tells the caller
339 * how many bytes to skip to get to the next src_charset-character).
341 * This is implemented (in the non-ascii-case) by first converting the
342 * next character in the input string to UTF16_LE and then calculating
343 * the unicode codepoint from that.
345 * Return INVALID_CODEPOINT if the next character cannot be converted.
347 _PUBLIC_ codepoint_t
next_codepoint_convenience_ext(
348 struct smb_iconv_convenience
*ic
,
349 const char *str
, charset_t src_charset
,
350 size_t *bytes_consumed
)
352 /* it cannot occupy more than 4 bytes in UTF16 format */
354 smb_iconv_t descriptor
;
360 if ((str
[0] & 0x80) == 0) {
362 return (codepoint_t
)str
[0];
366 * we assume that no multi-byte character can take more than 5 bytes.
367 * This is OK as we only support codepoints up to 1M (U+100000)
369 ilen_orig
= strnlen(str
, 5);
372 descriptor
= get_conv_handle(ic
, src_charset
, CH_UTF16
);
373 if (descriptor
== (smb_iconv_t
)-1) {
375 return INVALID_CODEPOINT
;
379 * this looks a little strange, but it is needed to cope with
380 * codepoints above 64k (U+10000) which are encoded as per RFC2781.
383 outbuf
= (char *)buf
;
384 smb_iconv(descriptor
, &str
, &ilen
, &outbuf
, &olen
);
387 outbuf
= (char *)buf
;
388 smb_iconv(descriptor
, &str
, &ilen
, &outbuf
, &olen
);
390 /* we didn't convert any bytes */
392 return INVALID_CODEPOINT
;
399 *bytes_consumed
= ilen_orig
- ilen
;
402 return (codepoint_t
)SVAL(buf
, 0);
405 /* decode a 4 byte UTF16 character manually */
406 return (codepoint_t
)0x10000 +
407 (buf
[2] | ((buf
[3] & 0x3)<<8) |
408 (buf
[0]<<10) | ((buf
[1] & 0x3)<<18));
411 /* no other length is valid */
412 return INVALID_CODEPOINT
;
416 return the unicode codepoint for the next multi-byte CH_UNIX character
419 also return the number of bytes consumed (which tells the caller
420 how many bytes to skip to get to the next CH_UNIX character)
422 return INVALID_CODEPOINT if the next character cannot be converted
424 _PUBLIC_ codepoint_t
next_codepoint_convenience(struct smb_iconv_convenience
*ic
,
425 const char *str
, size_t *size
)
427 return next_codepoint_convenience_ext(ic
, str
, CH_UNIX
, size
);
431 push a single codepoint into a CH_UNIX string. The target string must
432 be able to hold the full character, which is guaranteed if it is at
433 least 5 bytes in size. The caller may pass less than 5 bytes if they
434 are sure the character will fit (for example, you can assume that
435 uppercase/lowercase of a character will not add more than 1 byte)
437 return the number of bytes occupied by the CH_UNIX character, or
440 _PUBLIC_ ssize_t
push_codepoint_convenience(struct smb_iconv_convenience
*ic
,
441 char *str
, codepoint_t c
)
443 smb_iconv_t descriptor
;
453 descriptor
= get_conv_handle(ic
,
455 if (descriptor
== (smb_iconv_t
)-1) {
464 smb_iconv(descriptor
, &inbuf
, &ilen
, &str
, &olen
);
473 buf
[0] = (c
>>10) & 0xFF;
474 buf
[1] = (c
>>18) | 0xd8;
476 buf
[3] = ((c
>>8) & 0x3) | 0xdc;
482 smb_iconv(descriptor
, &inbuf
, &ilen
, &str
, &olen
);
489 _PUBLIC_ codepoint_t
next_codepoint_ext(const char *str
, charset_t src_charset
,
492 return next_codepoint_convenience_ext(get_iconv_convenience(), str
,
496 _PUBLIC_ codepoint_t
next_codepoint(const char *str
, size_t *size
)
498 return next_codepoint_convenience(get_iconv_convenience(), str
, size
);
501 _PUBLIC_ ssize_t
push_codepoint(char *str
, codepoint_t c
)
503 return push_codepoint_convenience(get_iconv_convenience(), str
, c
);