file_server: Clarify code by avoiding a goto
[Samba/gebeck_regimport.git] / lib / util / charset / iconv.c
blob1c507b4b13707d1fd3142d09ef79d58593c1ed69
1 /*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #include "charset_proto.h"
27 #ifdef strcasecmp
28 #undef strcasecmp
29 #endif
31 /**
32 * @file
34 * @brief Samba wrapper/stub for iconv character set conversion.
36 * iconv is the XPG2 interface for converting between character
37 * encodings. This file provides a Samba wrapper around it, and also
38 * a simple reimplementation that is used if the system does not
39 * implement iconv.
41 * Samba only works with encodings that are supersets of ASCII: ascii
42 * characters like whitespace can be tested for directly, multibyte
43 * sequences start with a byte with the high bit set, and strings are
44 * terminated by a nul byte.
46 * Note that the only function provided by iconv is conversion between
47 * characters. It doesn't directly support operations like
48 * uppercasing or comparison. We have to convert to UTF-16LE and
49 * compare there.
51 * @sa Samba Developers Guide
52 **/
/*
 * Forward declarations of the converters implemented in this file.
 * They all take the same argument list as smb_iconv(), except that the
 * first argument is an opaque descriptor pointer.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf,
				size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
66 static const struct charset_functions builtin_functions[] = {
67 /* windows is closest to UTF-16 */
68 {"UCS-2LE", iconv_copy, iconv_copy},
69 {"UTF-16LE", iconv_copy, iconv_copy},
70 {"UCS-2BE", iconv_swab, iconv_swab},
71 {"UTF-16BE", iconv_swab, iconv_swab},
73 /* we include the UTF-8 alias to cope with differing locale settings */
74 {"UTF8", utf8_pull, utf8_push},
75 {"UTF-8", utf8_pull, utf8_push},
77 /* this handles the munging needed for String2Key */
78 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy, true},
80 {"ASCII", ascii_pull, ascii_push},
81 {"646", ascii_pull, ascii_push},
82 {"ISO-8859-1", latin1_pull, latin1_push},
83 #ifdef DEVELOPER
84 {"WEIRD", weird_pull, weird_push, true},
85 #endif
86 #ifdef DARWINOS
87 {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
88 #endif
89 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
#ifdef HAVE_NATIVE_ICONV
/**
 * Wrapper around the system iconv().
 *
 * If there was an error then the internal state of the descriptor is
 * reset; this ensures that we don't have a shift state remaining for
 * character sets like SJIS.  The errno value of the failed conversion
 * is kept across that reset call, because callers look at it to tell
 * the different failures apart.
 *
 * @param cd the iconv_t descriptor, passed as an opaque pointer
 * @return whatever iconv() returned; (size_t)-1 on error with errno set
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);

	if (ret == (size_t)-1) {
		/* the reset call may change errno even when it succeeds,
		   so remember the value that describes the real failure */
		int saved_errno = errno;

		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
110 * This is a simple portable iconv() implementaion.
112 * It only knows about a very small number of character sets - just
113 * enough that Samba works on systems that don't have iconv.
115 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
116 const char **inbuf, size_t *inbytesleft,
117 char **outbuf, size_t *outbytesleft)
119 /* in many cases we can go direct */
120 if (cd->direct) {
121 return cd->direct(cd->cd_direct,
122 inbuf, inbytesleft, outbuf, outbytesleft);
125 /* otherwise we have to do it chunks at a time */
127 #ifndef SMB_ICONV_BUFSIZE
128 #define SMB_ICONV_BUFSIZE 2048
129 #endif
130 TALLOC_CTX *mem_ctx;
131 size_t bufsize;
132 char *cvtbuf;
134 #if _SAMBA_BUILD_ == 3
135 mem_ctx = talloc_tos();
136 #else
137 mem_ctx = cd;
138 #endif
139 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
141 if (!cvtbuf) {
142 return (size_t)-1;
145 while (*inbytesleft > 0) {
146 char *bufp1 = cvtbuf;
147 const char *bufp2 = cvtbuf;
148 int saved_errno = errno;
149 bool pull_failed = false;
150 bufsize = SMB_ICONV_BUFSIZE;
152 if (cd->pull(cd->cd_pull,
153 inbuf, inbytesleft, &bufp1, &bufsize) == -1
154 && errno != E2BIG) {
155 saved_errno = errno;
156 pull_failed = true;
159 bufsize = SMB_ICONV_BUFSIZE - bufsize;
161 if (cd->push(cd->cd_push,
162 &bufp2, &bufsize,
163 outbuf, outbytesleft) == -1) {
164 talloc_free(cvtbuf);
165 return -1;
166 } else if (pull_failed) {
167 /* We want the pull errno if possible */
168 errno = saved_errno;
169 return -1;
172 talloc_free(cvtbuf);
175 return 0;
/*
 * Tell whether a charset name is one of the two names treated here as
 * little-endian UTF-16.  The comparison ignores case.
 */
static bool is_utf16(const char *name)
{
	if (strcasecmp(name, "UCS-2LE") == 0) {
		return true;
	}
	if (strcasecmp(name, "UTF-16LE") == 0) {
		return true;
	}
	return false;
}
184 static int smb_iconv_t_destructor(smb_iconv_t hwd)
186 #ifdef HAVE_NATIVE_ICONV
187 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
188 iconv_close(hwd->cd_pull);
189 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
190 iconv_close(hwd->cd_push);
191 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
192 iconv_close(hwd->cd_direct);
193 #endif
195 return 0;
198 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
199 const char *fromcode, bool use_builtin_handlers)
201 smb_iconv_t ret;
202 const struct charset_functions *from=NULL, *to=NULL;
203 int i;
205 ret = (smb_iconv_t)talloc_named(mem_ctx,
206 sizeof(*ret),
207 "iconv(%s,%s)", tocode, fromcode);
208 if (!ret) {
209 errno = ENOMEM;
210 return (smb_iconv_t)-1;
212 memset(ret, 0, sizeof(*ret));
213 talloc_set_destructor(ret, smb_iconv_t_destructor);
215 /* check for the simplest null conversion */
216 if (strcmp(fromcode, tocode) == 0) {
217 ret->direct = iconv_copy;
218 return ret;
221 /* check if we have a builtin function for this conversion */
222 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
223 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
224 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
225 from = &builtin_functions[i];
228 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
229 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
230 to = &builtin_functions[i];
235 #ifdef HAVE_NATIVE_ICONV
236 /* the from and to varaibles indicate a samba module or
237 * internal conversion, ret->pull and ret->push are
238 * initialised only in this block for iconv based
239 * conversions */
241 if (from == NULL) {
242 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
243 if (ret->cd_pull == (iconv_t)-1)
244 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
245 if (ret->cd_pull != (iconv_t)-1) {
246 ret->pull = sys_iconv;
250 if (to == NULL) {
251 ret->cd_push = iconv_open(tocode, "UTF-16LE");
252 if (ret->cd_push == (iconv_t)-1)
253 ret->cd_push = iconv_open(tocode, "UCS-2LE");
254 if (ret->cd_push != (iconv_t)-1) {
255 ret->push = sys_iconv;
258 #endif
260 if (ret->pull == NULL && from == NULL) {
261 goto failed;
264 if (ret->push == NULL && to == NULL) {
265 goto failed;
268 /* check for conversion to/from ucs2 */
269 if (is_utf16(fromcode) && to) {
270 ret->direct = to->push;
271 return ret;
273 if (is_utf16(tocode) && from) {
274 ret->direct = from->pull;
275 return ret;
278 #ifdef HAVE_NATIVE_ICONV
279 if (is_utf16(fromcode)) {
280 ret->direct = sys_iconv;
281 ret->cd_direct = ret->cd_push;
282 ret->cd_push = NULL;
283 return ret;
285 if (is_utf16(tocode)) {
286 ret->direct = sys_iconv;
287 ret->cd_direct = ret->cd_pull;
288 ret->cd_pull = NULL;
289 return ret;
291 #endif
293 /* the general case has to go via a buffer */
294 if (!ret->pull) ret->pull = from->pull;
295 if (!ret->push) ret->push = to->push;
296 return ret;
298 failed:
299 talloc_free(ret);
300 errno = EINVAL;
301 return (smb_iconv_t)-1;
305 simple iconv_open() wrapper
307 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
309 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
313 simple iconv_close() wrapper
315 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
317 talloc_free(cd);
318 return 0;
322 /**********************************************************************
323 the following functions implement the builtin character sets in Samba
324 and also the "test" character sets that are designed to test
325 multi-byte character set support for english users
326 ***********************************************************************/
/*
  ASCII to UTF-16LE.

  Every input byte that has only its low seven bits set is written as
  a two byte unit: the byte itself followed by a zero byte.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno EILSEQ at the first byte that is not seven bit clean, or with
  errno E2BIG when the output has no room for the remaining input.
  The four in/out arguments describe the progress made in all cases.
*/
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	for (; src_left >= 1 && dst_left >= 2;
	     src += 1, src_left -= 1, dst += 2, dst_left -= 2) {
		const char ch = src[0];

		if ((ch & 0x7F) != ch) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			status = (size_t)-1;
			break;
		}
		dst[0] = ch;
		dst[1] = 0;
	}

	if (status == 0 && src_left > 0) {
		errno = E2BIG;
		status = (size_t)-1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	return status;
}
/*
  UTF-16LE to ASCII.

  A two byte unit is accepted when its second byte is zero and its
  first byte has only the low seven bits set; the first byte is then
  the output byte.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno EILSEQ for a unit that is not ASCII, EINVAL when a single odd
  byte is left over, or E2BIG when the output is full.
*/
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	for (; src_left >= 2 && dst_left >= 1;
	     src += 2, src_left -= 2, dst += 1, dst_left -= 1) {
		const char low = src[0];
		const char high = src[1];

		if (high != 0 || (low & 0x7F) != low) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			status = (size_t)-1;
			break;
		}
		dst[0] = low;
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	return status;
}
/*
  latin1/ISO-8859-1 to UTF-16LE.

  Each input byte becomes a two byte unit: the byte itself followed by
  a zero byte.  No input byte is rejected.

  Returns 0 when all input was consumed, or (size_t)-1 with errno
  E2BIG when the output has no room for the remaining input.
*/
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 1 && dst_left >= 2) {
		dst[0] = src[0];
		dst[1] = 0;
		src += 1;
		dst += 2;
		src_left -= 1;
		dst_left -= 2;
	}

	if (src_left > 0) {
		errno = E2BIG;
		status = (size_t)-1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	return status;
}
/*
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of latin1 matches the first 256 codepoints
  of unicode, and so can be read directly from the first byte of UTF16LE

  A unit whose second byte is not zero cannot be represented.  It is
  rejected before anything is written, so the output buffer is not
  touched by a rejected unit (this matches ascii_push).

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno EILSEQ for a unit above 0xFF, EINVAL when a single odd byte is
  left over, or E2BIG when the output is full.
*/
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if ((*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  value of one hexadecimal digit, or -1 if the byte is not a hex digit
*/
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
  this takes a UCS2-HEX sequence and produces a UTF16 sequence

  Any byte other than '@' is copied into the first byte of a two byte
  unit.  '@' introduces a unit written as exactly four hexadecimal
  digits, the form produced by ucs2hex_push.

  The digits are decoded by hand from the five bytes that were checked
  to be available.  The input is not required to be nul terminated, so
  string scanning functions must not be used on it; they could also
  skip white space and so look beyond those five bytes.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno EINVAL when fewer than five bytes are left at an '@', EILSEQ
  when '@' is not followed by four hex digits, or E2BIG when the
  output is full.
*/
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  UTF-16LE to UCS2-HEX.

  A unit is copied as a single byte when its second byte is zero and
  its first byte has the top bit clear and is not '@'.  Every other
  unit is written as '@' followed by the value SVAL() reads from the
  unit, formatted as four lower case hex digits.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno E2BIG when the output has no room, or EINVAL when a single odd
  byte is left over.
*/
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char low = (*inbuf)[0];
		const char high = (*inbuf)[1];
		size_t written;

		if (high == 0 && (low & 0x80) == 0 && low != '@') {
			(*outbuf)[0] = low;
			written = 1;
		} else {
			char buf[6];

			if (*outbytesleft < 5) {
				errno = E2BIG;
				return (size_t)-1;
			}
			snprintf(buf, sizeof(buf), "@%04x", SVAL(*inbuf, 0));
			memcpy(*outbuf, buf, 5);
			written = 5;
		}

		(*inbuf) += 2;
		(*inbytesleft) -= 2;
		(*outbuf) += written;
		(*outbytesleft) -= written;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return 0;
}
/*
  Convert between the little and big endian forms of a 16 bit encoding
  by exchanging the two bytes of every unit.

  As many bytes as fit in both buffers are processed.  If that count
  is odd, the last output byte is set to zero.

  The bytes are exchanged with a plain loop and the count is kept in a
  size_t, so lengths are never narrowed to a smaller type and each
  pair is read completely before it is written.

  Returns 0 when all input was consumed, or (size_t)-1 with errno
  E2BIG when input is left over.
*/
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = *inbytesleft;
	size_t i;

	if (*outbytesleft < n) {
		n = *outbytesleft;
	}

	for (i = 0; i + 1 < n; i += 2) {
		const char first = (*inbuf)[i];
		const char second = (*inbuf)[i + 1];

		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		/* half a unit at the end: there is no partner byte */
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  The null conversion: copy as many bytes as fit in both buffers.

  The count is kept in a size_t so that large lengths are never
  narrowed before they are handed to memmove().

  Returns 0 when all input was consumed, or (size_t)-1 with errno
  E2BIG when input is left over.
*/
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = *inbytesleft;

	if (*outbytesleft < n) {
		n = *outbytesleft;
	}

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte sequences are decoded.  Values below 0x10000 are
  written as one little endian unit, larger values as a surrogate
  pair.  A four byte sequence that encodes a value above 0x10FFFF is
  rejected, because such a value has no surrogate pair.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set:
    EILSEQ  truncated sequence, bad continuation byte, or a value
            above 0x10FFFF
    EINVAL  lead byte that starts none of the handled sequences
    E2BIG   the output is full
  In every case the four in/out arguments are updated to point just
  past the last sequence that was converted.
*/
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint > 0x10ffff) {
				/* beyond the range a surrogate pair can
				   hold: the bits above the 20 available
				   ones would corrupt the high surrogate */
				errno = EILSEQ;
				goto error;
			}
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate, then low surrogate */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return (size_t)-1;
}
/*
  UTF-16LE to UTF-8.

  Units below 0x80 give one byte, units below 0x800 two bytes, other
  units outside the surrogate range three bytes.  A high surrogate
  that is followed by a low surrogate gives four bytes.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set:
    E2BIG   the output has no room for the next sequence
    EINVAL  a single odd byte is left over, or a surrogate is found
            with fewer than four input bytes left
    EILSEQ  a low surrogate comes first, or a high surrogate is not
            followed by a low one
  In every case the four in/out arguments are updated to point just
  past the last unit that was converted.
*/
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 2 && dst_left >= 1) {
		const unsigned int unit = src[0] | (src[1] << 8);
		size_t consumed = 2;
		size_t produced = 0;
		int err = 0;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = unit;
			produced = 1;
		} else if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				err = E2BIG;
			} else {
				dst[0] = 0xc0 | (unit >> 6);
				dst[1] = 0x80 | (unit & 0x3f);
				produced = 2;
			}
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* the second half of a pair cannot come first */
			err = (src_left < 4) ? EINVAL : EILSEQ;
		} else if ((unit & 0xfc00) != 0xd800) {
			/* any other unit that is not a surrogate */
			if (dst_left < 3) {
				err = E2BIG;
			} else {
				dst[0] = 0xe0 | (unit >> 12);
				dst[1] = 0x80 | ((unit >> 6) & 0x3f);
				dst[2] = 0x80 | (unit & 0x3f);
				produced = 3;
			}
		} else if (src_left < 4) {
			/* first half of a pair, second half missing */
			err = EINVAL;
		} else if ((src[3] & 0xfc) != 0xdc) {
			/* first half of a pair, followed by something else */
			err = EILSEQ;
		} else if (dst_left < 4) {
			err = E2BIG;
		} else {
			const unsigned int low = src[2] | (src[3] << 8);
			const unsigned int codepoint =
				0x10000 + (((unit & 0x3ff) << 10) |
					   (low & 0x3ff));

			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			consumed = 4;
			produced = 4;
		}

		if (err != 0) {
			errno = err;
			status = (size_t)-1;
			break;
		}

		src += consumed;
		src_left -= consumed;
		dst += produced;
		dst_left -= produced;
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}
/*
  UTF-16LE "munged" to UTF-16LE, following the string2key rules.

  The rules are:

    1) a 0x0000 unit is replaced by 0x0001

    2) a high surrogate (0xD800 - 0xDBFF) that is not immediately
       followed by a low surrogate (0xDC00 - 0xDFFF) is replaced by
       U+FFFD (REPLACEMENT CHARACTER)

    3) the same is done for a low surrogate that does not follow a
       high surrogate

  A well formed surrogate pair is copied unchanged.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with
  errno E2BIG when the output is full, or EINVAL when a single odd
  byte is left over.  In every case the four in/out arguments are
  updated to point just past the last unit that was converted.
*/
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);
		bool whole_pair = false;

		if (unit == 0) {
			unit = 1;
		}

		if ((unit & 0xfc00) == 0xd800) {
			/* a high surrogate: look for its partner */
			if (src_left >= 4 &&
			    ((src[2] | (src[3] << 8)) & 0xfc00) == 0xdc00) {
				whole_pair = true;
			} else {
				unit = 0xfffd;
			}
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* a low surrogate with no high surrogate before it */
			unit = 0xfffd;
		}

		if (whole_pair) {
			if (dst_left < 4) {
				errno = E2BIG;
				status = (size_t)-1;
				break;
			}
			memcpy(dst, src, 4);
			src += 4;
			src_left -= 4;
			dst += 4;
			dst_left -= 4;
		} else {
			dst[0] = unit & 0xFF;
			dst[1] = (unit >> 8) & 0xFF;
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
		}
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}