lib: Fix a typo
[Samba.git] / lib / util / charset / iconv.c
blobf4815f1717ea46e6c6b6929506484081457d6ef2
1 /*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #include "charset_proto.h"
27 #ifdef strcasecmp
28 #undef strcasecmp
29 #endif
31 /**
32 * @file
34 * @brief Samba wrapper/stub for iconv character set conversion.
36 * iconv is the XPG2 interface for converting between character
37 * encodings. This file provides a Samba wrapper around it, and also
38 * a simple reimplementation that is used if the system does not
39 * implement iconv.
41 * Samba only works with encodings that are supersets of ASCII: ascii
42 * characters like whitespace can be tested for directly, multibyte
43 * sequences start with a byte with the high bit set, and strings are
44 * terminated by a nul byte.
46 * Note that the only function provided by iconv is conversion between
47 * characters. It doesn't directly support operations like
48 * uppercasing or comparison. We have to convert to UTF-16LE and
49 * compare there.
51 * @sa Samba Developers Guide
52 **/
/*
 * Forward declarations of the converters defined in this file.  They
 * all share one iconv()-style signature so that they can be stored in
 * the builtin_functions table.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf,
				size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
66 static const struct charset_functions builtin_functions[] = {
67 /* windows is closest to UTF-16 */
68 {"UCS-2LE", iconv_copy, iconv_copy},
69 {"UTF-16LE", iconv_copy, iconv_copy},
70 {"UCS-2BE", iconv_swab, iconv_swab},
71 {"UTF-16BE", iconv_swab, iconv_swab},
73 /* we include the UTF-8 alias to cope with differing locale settings */
74 {"UTF8", utf8_pull, utf8_push},
75 {"UTF-8", utf8_pull, utf8_push},
77 /* this handles the munging needed for String2Key */
78 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy, true},
80 {"ASCII", ascii_pull, ascii_push},
81 {"646", ascii_pull, ascii_push},
82 {"ISO-8859-1", latin1_pull, latin1_push},
83 #ifdef DEVELOPER
84 {"WEIRD", weird_pull, weird_push, true},
85 #endif
86 #ifdef DARWINOS
87 {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
88 #endif
89 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
#ifdef HAVE_NATIVE_ICONV
/*
 * Wrapper around the system iconv().
 *
 * If there was an error then reset the internal state; this ensures
 * that we don't have a shift state remaining for character sets like
 * SJIS.
 *
 * cd is the iconv_t handle stored as a void pointer.  The return value
 * is whatever iconv() returned; on failure ((size_t)-1) errno still
 * holds the value set by the failing conversion, because the reset call
 * is not allowed to overwrite it.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	iconv_t handle = (iconv_t)cd;
	size_t ret = iconv(handle,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);

	if (ret == (size_t)-1) {
		/* callers inspect errno (e.g. for E2BIG), so keep it intact */
		int saved_errno = errno;

		iconv(handle, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
110 * This is a simple portable iconv() implementaion.
112 * It only knows about a very small number of character sets - just
113 * enough that Samba works on systems that don't have iconv.
115 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
116 const char **inbuf, size_t *inbytesleft,
117 char **outbuf, size_t *outbytesleft)
119 /* in many cases we can go direct */
120 if (cd->direct) {
121 return cd->direct(cd->cd_direct,
122 inbuf, inbytesleft, outbuf, outbytesleft);
125 /* otherwise we have to do it chunks at a time */
127 #ifndef SMB_ICONV_BUFSIZE
128 #define SMB_ICONV_BUFSIZE 2048
129 #endif
130 size_t bufsize;
131 char cvtbuf[SMB_ICONV_BUFSIZE];
133 while (*inbytesleft > 0) {
134 char *bufp1 = cvtbuf;
135 const char *bufp2 = cvtbuf;
136 int saved_errno = errno;
137 bool pull_failed = false;
138 bufsize = SMB_ICONV_BUFSIZE;
140 if (cd->pull(cd->cd_pull,
141 inbuf, inbytesleft, &bufp1, &bufsize) == -1
142 && errno != E2BIG) {
143 saved_errno = errno;
144 pull_failed = true;
147 bufsize = SMB_ICONV_BUFSIZE - bufsize;
149 if (cd->push(cd->cd_push,
150 &bufp2, &bufsize,
151 outbuf, outbytesleft) == -1) {
152 return -1;
153 } else if (pull_failed) {
154 /* We want the pull errno if possible */
155 errno = saved_errno;
156 return -1;
161 return 0;
/*
 * Return true when name is, ignoring case, one of the two names this
 * file treats as the native UTF-16LE representation.
 */
static bool is_utf16(const char *name)
{
	static const char *const aliases[] = { "UCS-2LE", "UTF-16LE" };
	size_t idx;

	for (idx = 0; idx < sizeof(aliases) / sizeof(aliases[0]); idx++) {
		if (strcasecmp(name, aliases[idx]) == 0) {
			return true;
		}
	}
	return false;
}
170 static int smb_iconv_t_destructor(smb_iconv_t hwd)
172 #ifdef HAVE_NATIVE_ICONV
173 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
174 iconv_close(hwd->cd_pull);
175 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
176 iconv_close(hwd->cd_push);
177 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
178 iconv_close(hwd->cd_direct);
179 #endif
181 return 0;
184 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
185 const char *fromcode, bool use_builtin_handlers)
187 smb_iconv_t ret;
188 const struct charset_functions *from=NULL, *to=NULL;
189 int i;
191 ret = (smb_iconv_t)talloc_named(mem_ctx,
192 sizeof(*ret),
193 "iconv(%s,%s)", tocode, fromcode);
194 if (!ret) {
195 errno = ENOMEM;
196 return (smb_iconv_t)-1;
198 memset(ret, 0, sizeof(*ret));
199 talloc_set_destructor(ret, smb_iconv_t_destructor);
201 /* check for the simplest null conversion */
202 if (strcmp(fromcode, tocode) == 0) {
203 ret->direct = iconv_copy;
204 return ret;
207 /* check if we have a builtin function for this conversion */
208 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
209 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
210 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
211 from = &builtin_functions[i];
214 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
215 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
216 to = &builtin_functions[i];
221 #ifdef HAVE_NATIVE_ICONV
222 /* the from and to variables indicate a samba module or
223 * internal conversion, ret->pull and ret->push are
224 * initialised only in this block for iconv based
225 * conversions */
227 if (from == NULL) {
228 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
229 if (ret->cd_pull == (iconv_t)-1)
230 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
231 if (ret->cd_pull != (iconv_t)-1) {
232 ret->pull = sys_iconv;
236 if (to == NULL) {
237 ret->cd_push = iconv_open(tocode, "UTF-16LE");
238 if (ret->cd_push == (iconv_t)-1)
239 ret->cd_push = iconv_open(tocode, "UCS-2LE");
240 if (ret->cd_push != (iconv_t)-1) {
241 ret->push = sys_iconv;
244 #endif
246 if (ret->pull == NULL && from == NULL) {
247 goto failed;
250 if (ret->push == NULL && to == NULL) {
251 goto failed;
254 /* check for conversion to/from ucs2 */
255 if (is_utf16(fromcode) && to) {
256 ret->direct = to->push;
257 return ret;
259 if (is_utf16(tocode) && from) {
260 ret->direct = from->pull;
261 return ret;
264 #ifdef HAVE_NATIVE_ICONV
265 if (is_utf16(fromcode)) {
266 ret->direct = sys_iconv;
267 ret->cd_direct = ret->cd_push;
268 ret->cd_push = NULL;
269 return ret;
271 if (is_utf16(tocode)) {
272 ret->direct = sys_iconv;
273 ret->cd_direct = ret->cd_pull;
274 ret->cd_pull = NULL;
275 return ret;
277 #endif
279 /* the general case has to go via a buffer */
280 if (!ret->pull) ret->pull = from->pull;
281 if (!ret->push) ret->push = to->push;
282 return ret;
284 failed:
285 talloc_free(ret);
286 errno = EINVAL;
287 return (smb_iconv_t)-1;
291 simple iconv_open() wrapper
293 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
295 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
299 simple iconv_close() wrapper
301 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
303 talloc_free(cd);
304 return 0;
308 /**********************************************************************
309 the following functions implement the builtin character sets in Samba
310 and also the "test" character sets that are designed to test
311 multi-byte character set support for english users
312 ***********************************************************************/
/*
  this takes an ASCII sequence and produces a UTF16 sequence

  The 128 ASCII codepoints match the first 128 codepoints of unicode,
  so each input byte becomes the low byte of a UTF16LE unit whose high
  byte is zero.

  Fails with EILSEQ on a byte with the high bit set and with E2BIG when
  the output runs out of room before the input is used up.
*/
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		char ch;

		if (*inbytesleft < 1 || *outbytesleft < 2) {
			break;
		}

		ch = **inbuf;
		if ((ch & 0x7F) != ch) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		*(*outbuf)++ = ch;
		*(*outbuf)++ = 0;
		(*inbuf)++;
		*inbytesleft -= 1;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces an ASCII sequence

  The 128 ASCII codepoints match the first 128 codepoints of unicode,
  so they can be read directly from the low byte of each UTF16LE unit.

  Fails with EILSEQ on a unit outside the ASCII range, EINVAL on a
  dangling single input byte and E2BIG when the output is full.
*/
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		char lo;
		char hi;

		if (*inbytesleft < 2 || *outbytesleft < 1) {
			break;
		}

		lo = (*inbuf)[0];
		hi = (*inbuf)[1];
		if (hi != 0 || (lo & 0x7F) != lo) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		*(*outbuf)++ = lo;
		(*inbuf) += 2;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
/*
  this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so every input byte becomes the low byte of a UTF16LE unit
  whose high byte is zero.

  Fails with E2BIG when the output fills up before the input is
  exhausted.
*/
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		if (*inbytesleft < 1 || *outbytesleft < 2) {
			break;
		}

		*(*outbuf)++ = *(*inbuf)++;
		*(*outbuf)++ = 0;
		*inbytesleft -= 1;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so they can be read directly from the low byte of each
  UTF16LE unit.

  Fails with EILSEQ on a unit whose high byte is not zero, EINVAL on a
  dangling single input byte and E2BIG when the output is full.
*/
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		if (*inbytesleft < 2 || *outbytesleft < 1) {
			break;
		}

		/* the low byte is stored before the unit is validated */
		**outbuf = **inbuf;
		if ((*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return -1;
		}

		(*inbuf) += 2;
		(*outbuf) += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
/* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
  this takes a UCS2-HEX sequence and produces a UTF16 sequence

  Every input byte other than '@' is copied to the low byte of a
  UTF16LE unit.  '@' introduces exactly four hexadecimal digits (either
  case) that give the value of one UTF16 unit.

  The digits are parsed by hand rather than with sscanf(): the input is
  not guaranteed to be NUL terminated, and "%04x" would also accept
  leading white space, a sign, a "0x" prefix or fewer than four digits
  while five bytes are consumed regardless.

  Fails with EINVAL when fewer than five bytes are left after '@',
  EILSEQ when one of the four characters is not a hex digit and E2BIG
  when the output is full.
*/
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v&0xff;
		(*outbuf)[1] = v>>8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces a UCS2-HEX sequence

  A unit below 0x80 other than '@' is written as a single byte.  Every
  other unit is written as '@' followed by its value as four lower case
  hexadecimal digits.

  Fails with E2BIG when the output cannot take the next character and
  with EINVAL when a single dangling input byte is left over.
*/
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		char escaped[6];
		unsigned char lo;
		unsigned char hi;
		unsigned int unit;

		if (*inbytesleft < 2 || *outbytesleft < 1) {
			break;
		}

		lo = (unsigned char)(*inbuf)[0];
		hi = (unsigned char)(*inbuf)[1];

		if (hi == 0 && (lo & 0x80) == 0 && lo != '@') {
			/* plain seven bit character, passed through */
			*(*outbuf)++ = (*inbuf)[0];
			(*inbuf) += 2;
			*inbytesleft -= 2;
			*outbytesleft -= 1;
			continue;
		}

		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}

		/* little-endian 16 bit value of the current unit */
		unit = (unsigned int)lo | ((unsigned int)hi << 8);
		snprintf(escaped, 6, "@%04x", unit);
		memcpy(*outbuf, escaped, 5);

		(*inbuf) += 2;
		(*outbuf) += 5;
		*inbytesleft -= 2;
		*outbytesleft -= 5;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
/*
  this copies the input to the output with the two bytes of every
  16 bit unit exchanged

  As many bytes as fit are converted.  If that count is odd, the final
  output byte is set to zero and the odd input byte is consumed.

  The byte count is held in a size_t so that large lengths are not
  truncated the way they would be in an int.

  Fails with E2BIG when input remains after the output is full.
*/
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft
						  : *outbytesleft;

	/* swab() works on whole byte pairs only */
	swab(*inbuf, *outbuf, (ssize_t)(n & ~(size_t)1));
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this copies the input to the output unchanged

  As many bytes as fit are copied.  memmove() is used, so the two
  buffers may overlap.

  The byte count is held in a size_t so that large lengths are not
  truncated the way they would be in an int.

  Fails with E2BIG when input remains after the output is full.
*/
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft
						  : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte sequences are decoded.  Four byte sequences that
  decode to a value of 0x10000 or more are written as a surrogate pair.
  Values above U+10FFFF cannot be expressed in UTF16 and are rejected
  with EILSEQ instead of being turned into invalid surrogate units.

  On return the four in/out arguments describe how far the conversion
  got, on failure as well as on success.

  Fails with EILSEQ on a malformed or truncated sequence, EINVAL on a
  lead byte that starts none of the handled sequence lengths and E2BIG
  when the output is full.

  NOTE(review): non-minimal ("overlong") encodings are still accepted;
  the four byte branch does so deliberately and the two and three byte
  branches do not check for them at all.
*/
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = (size_t)-1;	/* only cleared once all input is used */

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint > 0x10FFFF) {
				/* beyond what a surrogate pair can hold */
				errno = EILSEQ;
				goto done;
			}
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto done;
			}

			/* high surrogate first, then low surrogate */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto done;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto done;
	}

	ret = 0;

done:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Each UTF16LE unit is classified by its value and written as a one,
  two or three byte sequence; a high surrogate followed by a low
  surrogate is written as one four byte sequence.

  On return the four in/out arguments describe how far the conversion
  got, on failure as well as on success.

  Fails with EILSEQ or EINVAL on an unpaired surrogate (EINVAL when
  fewer than four input bytes remain), EINVAL on a dangling single
  input byte and E2BIG when the output is full.
*/
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t status = (size_t)-1;

	while (src_left >= 2 && dst_left >= 1) {
		unsigned int unit = src[0] | (src[1] << 8);
		unsigned int codepoint;
		unsigned int low;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = unit;
			src += 2;
			src_left -= 2;
			dst += 1;
			dst_left -= 1;
			continue;
		}

		if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				errno = E2BIG;
				goto finish;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
			continue;
		}

		if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			goto finish;
		}

		if ((unit & 0xfc00) != 0xd800) {
			/* ordinary unit, three output bytes */
			if (dst_left < 3) {
				errno = E2BIG;
				goto finish;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 3;
			dst_left -= 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (src_left < 4) {
			errno = EINVAL;
			goto finish;
		}
		low = src[2] | (src[3] << 8);
		if ((low & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			goto finish;
		}
		codepoint = 0x10000 + (((unit & 0x3ff) << 10) | (low & 0x3ff));

		if (dst_left < 4) {
			errno = E2BIG;
			goto finish;
		}
		dst[0] = 0xf0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		dst[3] = 0x80 | (codepoint & 0x3f);
		src += 4;
		src_left -= 4;
		dst += 4;
		dst_left -= 4;
	}

	if (src_left == 1) {
		/* half a UTF16 unit is left over */
		errno = EINVAL;
	} else if (src_left > 1) {
		errno = E2BIG;
	} else {
		status = 0;
	}

finish:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

  The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate)
       to U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high
       surrogate.

  On return the four in/out arguments describe how far the conversion
  got, on failure as well as on success.

  Fails with EINVAL on a dangling single input byte and E2BIG when the
  output is full.
*/
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t status = (size_t)-1;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);

		if (unit == 0) {
			unit = 1;
		}

		if ((unit & 0xfc00) == 0xd800) {
			/* a high surrogate: valid only with a low one behind it */
			bool paired = src_left >= 4 &&
				((src[2] | (src[3] << 8)) & 0xfc00) == 0xdc00;

			if (paired) {
				if (dst_left < 4) {
					errno = E2BIG;
					goto finish;
				}
				/* a proper pair is passed through untouched */
				memcpy(dst, src, 4);
				src += 4;
				src_left -= 4;
				dst += 4;
				dst_left -= 4;
				continue;
			}

			/* high surrogate not followed by low
			   surrogate: convert to 0xfffd */
			unit = 0xfffd;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* low surrogate not preceded by high
			   surrogate: convert to 0xfffd */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		src_left -= 2;
		dst += 2;
		dst_left -= 2;
	}

	if (src_left == 1) {
		/* half a UTF16 unit is left over */
		errno = EINVAL;
	} else if (src_left > 1) {
		errno = E2BIG;
	} else {
		status = 0;
	}

finish:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}