s3: Remove some unused code
[Samba/gebeck_regimport.git] / lib / util / charset / codepoints.c
blob0984164d4838dffd46a2a4d7d596b06eb0cf937c
1 /*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "includes.h"
24 #include "lib/util/charset/charset.h"
25 #include "system/locale.h"
26 #include "dynconfig/dynconfig.h"
28 #ifdef strcasecmp
29 #undef strcasecmp
30 #endif
/**
 * @file
 * @brief Unicode string manipulation
 */
/*
 * These two tables define the unicode case handling.  They are filled
 * in by load_case_tables_library() through map_file() from the
 * codepage directory.
 *
 * Each pointer is in one of three states:
 *   NULL        - no load has been attempted yet
 *   (void *)-1  - the load failed; only ASCII case rules are applied
 *   other       - the mapped table
 */
static void *upcase_table;
static void *lowcase_table;
43 /*******************************************************************
44 load the case handling tables
46 This is the function that should be called from library code.
47 ********************************************************************/
48 void load_case_tables_library(void)
50 TALLOC_CTX *mem_ctx;
52 mem_ctx = talloc_init("load_case_tables");
53 if (!mem_ctx) {
54 smb_panic("No memory for case_tables");
56 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
58 talloc_free(mem_ctx);
59 if (upcase_table == NULL) {
60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61 upcase_table = (void *)-1;
63 if (lowcase_table == NULL) {
64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65 lowcase_table = (void *)-1;
/*******************************************************************
 Load the case handling tables (application entry point).

 This MUST only be called from main() in application code, never from
 a library: it resets the process locale from the environment, and a
 library cannot know whether the calling program has already chosen a
 different locale with setlocale().
********************************************************************/
void load_case_tables(void)
{
#ifdef HAVE_SETLOCALE
	/*
	 * Global hook that makes sure the locale is taken from the
	 * environment, so that LOCALE can be used as a codepage.
	 */
	setlocale(LC_ALL, "");
#endif
	load_case_tables_library();
}
87 /**
88 Convert a codepoint_t to upper case.
89 **/
90 _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
92 if (val < 128) {
93 return toupper(val);
95 if (upcase_table == NULL) {
96 load_case_tables_library();
98 if (upcase_table == (void *)-1) {
99 return val;
101 if (val & 0xFFFF0000) {
102 return val;
104 return SVAL(upcase_table, val*2);
108 Convert a codepoint_t to lower case.
110 _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
112 if (val < 128) {
113 return tolower(val);
115 if (lowcase_table == NULL) {
116 load_case_tables_library();
118 if (lowcase_table == (void *)-1) {
119 return val;
121 if (val & 0xFFFF0000) {
122 return val;
124 return SVAL(lowcase_table, val*2);
128 If we upper cased this character, would we get the same character?
130 _PUBLIC_ bool islower_m(codepoint_t val)
132 return (toupper_m(val) != val);
136 If we lower cased this character, would we get the same character?
138 _PUBLIC_ bool isupper_m(codepoint_t val)
140 return (tolower_m(val) != val);
144 compare two codepoints case insensitively
146 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
148 if (c1 == c2 ||
149 toupper_m(c1) == toupper_m(c2)) {
150 return 0;
152 return c1 - c2;
156 struct smb_iconv_handle {
157 TALLOC_CTX *child_ctx;
158 const char *unix_charset;
159 const char *dos_charset;
160 const char *display_charset;
161 bool use_builtin_handlers;
162 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
/* process-wide default conversion handle, created on first use */
struct smb_iconv_handle *global_iconv_handle = NULL;

/*
 * Return the process-wide conversion handle.
 *
 * On first use the handle is created on talloc_autofree_context() with
 * "ASCII" as dos charset, "UTF-8" as unix charset and the builtin
 * handlers enabled.  If that creation fails the result is NULL.
 */
struct smb_iconv_handle *get_iconv_handle(void)
{
	if (global_iconv_handle != NULL) {
		return global_iconv_handle;
	}

	global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
						      "ASCII", "UTF-8", true, NULL);
	return global_iconv_handle;
}
175 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx,
176 const char *dos_charset,
177 const char *unix_charset,
178 bool use_builtin_handlers)
180 return smb_iconv_handle_reinit(mem_ctx,
181 dos_charset, unix_charset, use_builtin_handlers, NULL);
185 * Return the name of a charset to give to iconv().
187 const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
189 switch (ch) {
190 case CH_UTF16: return "UTF-16LE";
191 case CH_UNIX: return ic->unix_charset;
192 case CH_DOS: return ic->dos_charset;
193 case CH_UTF8: return "UTF8";
194 case CH_UTF16BE: return "UTF-16BE";
195 case CH_UTF16MUNGED: return "UTF16_MUNGED";
196 default:
197 return "ASCII";
202 re-initialize iconv conversion descriptors
204 static int close_iconv_handle(struct smb_iconv_handle *data)
206 unsigned c1, c2;
207 for (c1=0;c1<NUM_CHARSETS;c1++) {
208 for (c2=0;c2<NUM_CHARSETS;c2++) {
209 if (data->conv_handles[c1][c2] != NULL) {
210 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
211 smb_iconv_close(data->conv_handles[c1][c2]);
213 data->conv_handles[c1][c2] = NULL;
218 return 0;
222 the old_ic is passed in here as the smb_iconv_handle structure
223 is used as a global pointer in some places (eg. python modules). We
224 don't want to invalidate those global pointers, but we do want to
225 update them with the right charset information when loadparm
226 runs. To do that we need to re-use the structure pointer, but
227 re-fill the elements in the structure with the updated values
229 _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
230 const char *dos_charset,
231 const char *unix_charset,
232 bool use_builtin_handlers,
233 struct smb_iconv_handle *old_ic)
235 struct smb_iconv_handle *ret;
237 if (old_ic != NULL) {
238 ret = old_ic;
239 close_iconv_handle(ret);
240 talloc_free(ret->child_ctx);
241 ZERO_STRUCTP(ret);
242 } else {
243 ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
245 if (ret == NULL) {
246 return NULL;
249 /* we use a child context to allow us to free all ptrs without
250 freeing the structure itself */
251 ret->child_ctx = talloc_new(ret);
252 if (ret->child_ctx == NULL) {
253 return NULL;
256 talloc_set_destructor(ret, close_iconv_handle);
258 if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
259 DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
260 dos_charset = "CP850";
263 ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
264 ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
265 ret->use_builtin_handlers = use_builtin_handlers;
267 return ret;
271 on-demand initialisation of conversion handles
273 smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
274 charset_t from, charset_t to)
276 const char *n1, *n2;
278 if (ic->conv_handles[from][to]) {
279 return ic->conv_handles[from][to];
282 n1 = charset_name(ic, from);
283 n2 = charset_name(ic, to);
285 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
286 ic->use_builtin_handlers);
288 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
289 if ((from == CH_DOS || to == CH_DOS) &&
290 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
291 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
292 charset_name(ic, CH_DOS)));
293 ic->dos_charset = "ASCII";
295 n1 = charset_name(ic, from);
296 n2 = charset_name(ic, to);
298 ic->conv_handles[from][to] =
299 smb_iconv_open_ex(ic, n2, n1, ic->use_builtin_handlers);
303 return ic->conv_handles[from][to];
307 * Return the unicode codepoint for the next character in the input
308 * string in the given src_charset.
309 * The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
311 * Also return the number of bytes consumed (which tells the caller
312 * how many bytes to skip to get to the next src_charset-character).
314 * This is implemented (in the non-ascii-case) by first converting the
315 * next character in the input string to UTF16_LE and then calculating
316 * the unicode codepoint from that.
318 * Return INVALID_CODEPOINT if the next character cannot be converted.
320 _PUBLIC_ codepoint_t next_codepoint_handle_ext(
321 struct smb_iconv_handle *ic,
322 const char *str, charset_t src_charset,
323 size_t *bytes_consumed)
325 /* it cannot occupy more than 4 bytes in UTF16 format */
326 uint8_t buf[4];
327 smb_iconv_t descriptor;
328 size_t ilen_orig;
329 size_t ilen;
330 size_t olen;
331 char *outbuf;
333 if ((str[0] & 0x80) == 0) {
334 *bytes_consumed = 1;
335 return (codepoint_t)str[0];
339 * we assume that no multi-byte character can take more than 5 bytes.
340 * This is OK as we only support codepoints up to 1M (U+100000)
342 ilen_orig = strnlen(str, 5);
343 ilen = ilen_orig;
345 descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
346 if (descriptor == (smb_iconv_t)-1) {
347 *bytes_consumed = 1;
348 return INVALID_CODEPOINT;
352 * this looks a little strange, but it is needed to cope with
353 * codepoints above 64k (U+1000) which are encoded as per RFC2781.
355 olen = 2;
356 outbuf = (char *)buf;
357 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
358 if (olen == 2) {
359 olen = 4;
360 outbuf = (char *)buf;
361 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
362 if (olen == 4) {
363 /* we didn't convert any bytes */
364 *bytes_consumed = 1;
365 return INVALID_CODEPOINT;
367 olen = 4 - olen;
368 } else {
369 olen = 2 - olen;
372 *bytes_consumed = ilen_orig - ilen;
374 if (olen == 2) {
375 return (codepoint_t)SVAL(buf, 0);
377 if (olen == 4) {
378 /* decode a 4 byte UTF16 character manually */
379 return (codepoint_t)0x10000 +
380 (buf[2] | ((buf[3] & 0x3)<<8) |
381 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
384 /* no other length is valid */
385 return INVALID_CODEPOINT;
389 return the unicode codepoint for the next multi-byte CH_UNIX character
390 in the string
392 also return the number of bytes consumed (which tells the caller
393 how many bytes to skip to get to the next CH_UNIX character)
395 return INVALID_CODEPOINT if the next character cannot be converted
397 _PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
398 const char *str, size_t *size)
400 return next_codepoint_handle_ext(ic, str, CH_UNIX, size);
404 push a single codepoint into a CH_UNIX string the target string must
405 be able to hold the full character, which is guaranteed if it is at
406 least 5 bytes in size. The caller may pass less than 5 bytes if they
407 are sure the character will fit (for example, you can assume that
408 uppercase/lowercase of a character will not add more than 1 byte)
410 return the number of bytes occupied by the CH_UNIX character, or
411 -1 on failure
413 _PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
414 char *str, codepoint_t c)
416 smb_iconv_t descriptor;
417 uint8_t buf[4];
418 size_t ilen, olen;
419 const char *inbuf;
421 if (c < 128) {
422 *str = c;
423 return 1;
426 descriptor = get_conv_handle(ic,
427 CH_UTF16, CH_UNIX);
428 if (descriptor == (smb_iconv_t)-1) {
429 return -1;
432 if (c < 0x10000) {
433 ilen = 2;
434 olen = 5;
435 inbuf = (char *)buf;
436 SSVAL(buf, 0, c);
437 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
438 if (ilen != 0) {
439 return -1;
441 return 5 - olen;
444 c -= 0x10000;
446 buf[0] = (c>>10) & 0xFF;
447 buf[1] = (c>>18) | 0xd8;
448 buf[2] = c & 0xFF;
449 buf[3] = ((c>>8) & 0x3) | 0xdc;
451 ilen = 4;
452 olen = 5;
453 inbuf = (char *)buf;
455 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
456 if (ilen != 0) {
457 return -1;
459 return 5 - olen;
462 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
463 size_t *size)
465 return next_codepoint_handle_ext(get_iconv_handle(), str,
466 src_charset, size);
469 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
471 return next_codepoint_handle(get_iconv_handle(), str, size);
474 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
476 return push_codepoint_handle(get_iconv_handle(), str, c);