s3: Fix some nonempty line endings
[Samba/gebeck_regimport.git] / lib / util / charset / convert_string.c
blob2e666802c9e06b6f8ced38f1fcdd6443b06a7fd0
1 /*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001-2011
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Simo Sorce 2001
8 Copyright (C) Martin Pool 2003
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "includes.h"
25 #include "system/iconv.h"
27 /**
28 * @file
30 * @brief Character-set conversion routines built on our iconv.
32 * @note Samba's internal character set (at least in the 3.0 series)
33 * is always the same as the one for the Unix filesystem. It is
34 * <b>not</b> necessarily UTF-8 and may be different on machines that
35 * need i18n filenames to be compatible with Unix software. It does
36 * have to be a superset of ASCII. All multibyte sequences must start
37 * with a byte with the high bit set.
39 * @sa lib/iconv.c
43 /**
44 * Convert string from one encoding to another, making error checking etc
45 * Slow path version - uses (slow) iconv.
47 * @param src pointer to source string (multibyte or singlebyte)
48 * @param srclen length of the source string in bytes
49 * @param dest pointer to destination string (multibyte or singlebyte)
50 * @param destlen maximal length allowed for string
51 * @param converted size is the number of bytes occupied in the destination
53 * @returns false and sets errno on fail, true on success.
55 * Ensure the srclen contains the terminating zero.
57 **/
59 static bool convert_string_internal(struct smb_iconv_handle *ic,
60 charset_t from, charset_t to,
61 void const *src, size_t srclen,
62 void *dest, size_t destlen, size_t *converted_size)
64 size_t i_len, o_len;
65 size_t retval;
66 const char* inbuf = (const char*)src;
67 char* outbuf = (char*)dest;
68 smb_iconv_t descriptor;
70 descriptor = get_conv_handle(ic, from, to);
72 if (srclen == (size_t)-1) {
73 if (from == CH_UTF16LE || from == CH_UTF16BE) {
74 srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2;
75 } else {
76 srclen = strlen((const char *)src)+1;
81 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
82 errno = EINVAL;
83 return false;
86 i_len=srclen;
87 o_len=destlen;
89 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
90 *converted_size = destlen-o_len;
92 return (retval != (size_t)-1);
95 /**
96 * Convert string from one encoding to another, making error checking etc
97 * Fast path version - handles ASCII first.
99 * @param src pointer to source string (multibyte or singlebyte)
100 * @param srclen length of the source string in bytes, or -1 for nul terminated.
101 * @param dest pointer to destination string (multibyte or singlebyte)
102 * @param destlen maximal length allowed for string - *NEVER* -1.
103 * @param converted size is the number of bytes occupied in the destination
105 * @returns false and sets errno on fail, true on success.
107 * Ensure the srclen contains the terminating zero.
109 * This function has been hand-tuned to provide a fast path.
110 * Don't change unless you really know what you are doing. JRA.
113 bool convert_string_error_handle(struct smb_iconv_handle *ic,
114 charset_t from, charset_t to,
115 void const *src, size_t srclen,
116 void *dest, size_t destlen,
117 size_t *converted_size)
120 * NB. We deliberately don't do a strlen here if srclen == -1.
121 * This is very expensive over millions of calls and is taken
122 * care of in the slow path in convert_string_internal. JRA.
125 #ifdef DEVELOPER
126 SMB_ASSERT(destlen != (size_t)-1);
127 #endif
129 if (srclen == 0) {
130 *converted_size = 0;
131 return true;
134 if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
135 const unsigned char *p = (const unsigned char *)src;
136 unsigned char *q = (unsigned char *)dest;
137 size_t slen = srclen;
138 size_t dlen = destlen;
139 unsigned char lastp = '\0';
140 size_t retval = 0;
142 /* If all characters are ascii, fast path here. */
143 while (slen && dlen) {
144 if ((lastp = *p) <= 0x7f) {
145 *q++ = *p++;
146 if (slen != (size_t)-1) {
147 slen--;
149 dlen--;
150 retval++;
151 if (!lastp)
152 break;
153 } else {
154 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
155 goto general_case;
156 #else
157 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
158 *converted_size += retval;
159 return ret;
160 #endif
164 *converted_size = retval;
166 if (!dlen) {
167 /* Even if we fast path we should note if we ran out of room. */
168 if (((slen != (size_t)-1) && slen) ||
169 ((slen == (size_t)-1) && lastp)) {
170 errno = E2BIG;
171 return false;
174 return true;
175 } else if (from == CH_UTF16LE && to != CH_UTF16LE) {
176 const unsigned char *p = (const unsigned char *)src;
177 unsigned char *q = (unsigned char *)dest;
178 size_t retval = 0;
179 size_t slen = srclen;
180 size_t dlen = destlen;
181 unsigned char lastp = '\0';
182 #ifndef BROKEN_UNICODE_COMPOSE_CHARACTERS
183 bool ret;
184 #endif
186 if (slen == (size_t)-1) {
187 while (dlen &&
188 ((lastp = *p) <= 0x7f) && (p[1] == 0)) {
189 *q++ = *p;
190 p += 2;
191 dlen--;
192 retval++;
193 if (!lastp)
194 break;
196 if (lastp != 0) goto slow_path;
197 } else {
198 while (slen >= 2 && dlen &&
199 (*p <= 0x7f) && (p[1] == 0)) {
200 *q++ = *p;
201 slen -= 2;
202 p += 2;
203 dlen--;
204 retval++;
206 if (slen != 0) goto slow_path;
209 *converted_size = retval;
211 if (!dlen) {
212 /* Even if we fast path we should note if we ran out of room. */
213 if (((slen != (size_t)-1) && slen) ||
214 ((slen == (size_t)-1) && lastp)) {
215 errno = E2BIG;
216 return false;
219 return true;
221 slow_path:
222 /* come here when we hit a character we can't deal
223 * with in the fast path
225 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
226 goto general_case;
227 #else
228 ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
229 *converted_size += retval;
230 return ret;
231 #endif
233 } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
234 const unsigned char *p = (const unsigned char *)src;
235 unsigned char *q = (unsigned char *)dest;
236 size_t retval = 0;
237 size_t slen = srclen;
238 size_t dlen = destlen;
239 unsigned char lastp = '\0';
241 /* If all characters are ascii, fast path here. */
242 while (slen && (dlen >= 1)) {
243 if (dlen >=2 && (lastp = *p) <= 0x7F) {
244 *q++ = *p++;
245 *q++ = '\0';
246 if (slen != (size_t)-1) {
247 slen--;
249 dlen -= 2;
250 retval += 2;
251 if (!lastp)
252 break;
253 } else {
254 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
255 goto general_case;
256 #else
257 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
258 *converted_size += retval;
259 return ret;
260 #endif
264 *converted_size = retval;
266 if (!dlen) {
267 /* Even if we fast path we should note if we ran out of room. */
268 if (((slen != (size_t)-1) && slen) ||
269 ((slen == (size_t)-1) && lastp)) {
270 errno = E2BIG;
271 return false;
274 return true;
277 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
278 general_case:
279 #endif
280 return convert_string_internal(ic, from, to, src, srclen, dest, destlen, converted_size);
283 bool convert_string_handle(struct smb_iconv_handle *ic,
284 charset_t from, charset_t to,
285 void const *src, size_t srclen,
286 void *dest, size_t destlen,
287 size_t *converted_size)
289 bool ret = convert_string_error_handle(ic, from, to, src, srclen, dest, destlen, converted_size);
291 if(ret==false) {
292 const char *reason="unknown error";
293 switch(errno) {
294 case EINVAL:
295 reason="Incomplete multibyte sequence";
296 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
297 reason, (const char *)src));
298 break;
299 case E2BIG:
301 reason="No more room";
302 if (from == CH_UNIX) {
303 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n",
304 charset_name(ic, from), charset_name(ic, to),
305 (unsigned int)srclen, (unsigned int)destlen, (const char *)src));
306 } else {
307 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n",
308 charset_name(ic, from), charset_name(ic, to),
309 (unsigned int)srclen, (unsigned int)destlen));
311 break;
313 case EILSEQ:
314 reason="Illegal multibyte sequence";
315 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
316 reason, (const char *)src));
317 break;
318 default:
319 DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",
320 reason, (const char *)src));
321 break;
323 /* smb_panic(reason); */
325 return ret;
330 * Convert between character sets, allocating a new buffer using talloc for the result.
332 * @param srclen length of source buffer.
333 * @param dest always set at least to NULL
334 * @parm converted_size set to the number of bytes occupied by the string in
335 * the destination on success.
336 * @note -1 is not accepted for srclen.
338 * @return true if new buffer was correctly allocated, and string was
339 * converted.
341 * Ensure the srclen contains the terminating zero.
343 * I hate the goto's in this function. It's emberrassing.....
344 * There has to be a cleaner way to do this. JRA.
346 bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
347 charset_t from, charset_t to,
348 void const *src, size_t srclen, void *dst,
349 size_t *converted_size)
352 size_t i_len, o_len, destlen = (srclen * 3) / 2;
353 size_t retval;
354 const char *inbuf = (const char *)src;
355 char *outbuf = NULL, *ob = NULL;
356 smb_iconv_t descriptor;
357 void **dest = (void **)dst;
359 *dest = NULL;
361 if (src == NULL || srclen == (size_t)-1) {
362 errno = EINVAL;
363 return false;
366 if (srclen == 0) {
367 /* We really should treat this as an error, but
368 there are too many callers that need this to
369 return a NULL terminated string in the correct
370 character set. */
371 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
372 destlen = 2;
373 } else {
374 destlen = 1;
376 ob = talloc_zero_array(ctx, char, destlen);
377 if (ob == NULL) {
378 errno = ENOMEM;
379 return false;
381 if (converted_size != NULL) {
382 *converted_size = destlen;
384 *dest = ob;
385 return true;
388 descriptor = get_conv_handle(ic, from, to);
390 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
391 DEBUG(0,("convert_string_talloc: Conversion not supported.\n"));
392 errno = EOPNOTSUPP;
393 return false;
396 convert:
398 /* +2 is for ucs2 null termination. */
399 if ((destlen*2)+2 < destlen) {
400 /* wrapped ! abort. */
401 DEBUG(0, ("convert_string_talloc: destlen wrapped !\n"));
402 TALLOC_FREE(outbuf);
403 errno = EOPNOTSUPP;
404 return false;
405 } else {
406 destlen = destlen * 2;
409 /* +2 is for ucs2 null termination. */
410 ob = talloc_realloc(ctx, ob, char, destlen + 2);
412 if (!ob) {
413 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
414 errno = ENOMEM;
415 return false;
417 outbuf = ob;
418 i_len = srclen;
419 o_len = destlen;
421 retval = smb_iconv(descriptor,
422 &inbuf, &i_len,
423 &outbuf, &o_len);
424 if(retval == (size_t)-1) {
425 const char *reason="unknown error";
426 switch(errno) {
427 case EINVAL:
428 reason="Incomplete multibyte sequence";
429 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf));
430 break;
431 case E2BIG:
432 goto convert;
433 case EILSEQ:
434 reason="Illegal multibyte sequence";
435 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf));
436 break;
438 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
439 /* smb_panic(reason); */
440 TALLOC_FREE(ob);
441 return false;
444 destlen = destlen - o_len;
445 /* Don't shrink unless we're reclaiming a lot of
446 * space. This is in the hot codepath and these
447 * reallocs *cost*. JRA.
449 if (o_len > 1024) {
450 /* We're shrinking here so we know the +2 is safe from wrap. */
451 ob = talloc_realloc(ctx,ob, char, destlen + 2);
454 if (destlen && !ob) {
455 DEBUG(0, ("convert_string_talloc: out of memory!\n"));
456 errno = ENOMEM;
457 return false;
460 *dest = ob;
462 /* Must ucs2 null terminate in the extra space we allocated. */
463 ob[destlen] = '\0';
464 ob[destlen+1] = '\0';
466 /* Ensure we can never return a *converted_size of zero. */
467 if (destlen == 0) {
468 /* As we're now returning false on a bad smb_iconv call,
469 this should never happen. But be safe anyway. */
470 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
471 destlen = 2;
472 } else {
473 destlen = 1;
477 if (converted_size != NULL) {
478 *converted_size = destlen;
480 return true;
484 * Convert string from one encoding to another, making error checking etc
486 * @param src pointer to source string (multibyte or singlebyte)
487 * @param srclen length of the source string in bytes
488 * @param dest pointer to destination string (multibyte or singlebyte)
489 * @param destlen maximal length allowed for string
490 * @param converted_size the number of bytes occupied in the destination
492 * @returns true on success, false on fail.
494 _PUBLIC_ bool convert_string(charset_t from, charset_t to,
495 void const *src, size_t srclen,
496 void *dest, size_t destlen,
497 size_t *converted_size)
499 return convert_string_handle(get_iconv_handle(), from, to,
500 src, srclen,
501 dest, destlen, converted_size);
505 * Convert string from one encoding to another, making error checking etc
507 * @param src pointer to source string (multibyte or singlebyte)
508 * @param srclen length of the source string in bytes
509 * @param dest pointer to destination string (multibyte or singlebyte)
510 * @param destlen maximal length allowed for string
511 * @param converted_size the number of bytes occupied in the destination
513 * @returns true on success, false on fail.
515 _PUBLIC_ bool convert_string_error(charset_t from, charset_t to,
516 void const *src, size_t srclen,
517 void *dest, size_t destlen,
518 size_t *converted_size)
520 return convert_string_error_handle(get_iconv_handle(), from, to,
521 src, srclen,
522 dest, destlen, converted_size);
526 * Convert between character sets, allocating a new buffer using talloc for the result.
528 * @param srclen length of source buffer.
529 * @param dest always set at least to NULL
530 * @param converted_size Size in bytes of the converted string
531 * @note -1 is not accepted for srclen.
533 * @returns boolean indication whether the conversion succeeded
536 _PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx,
537 charset_t from, charset_t to,
538 void const *src, size_t srclen,
539 void *dest, size_t *converted_size)
541 return convert_string_talloc_handle(ctx, get_iconv_handle(),
542 from, to, src, srclen, dest,
543 converted_size);