Intrinsicify SpanHelpers.IndexOf(char) (dotnet/coreclr#22505)
[mono-project.git] / mono / eglib / giconv.c
blob5dee361e97aa5afdf26cb1074e560dabbade3657
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * Copyright (C) 2011 Jeffrey Stedfast
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use, copy,
9 * modify, merge, publish, distribute, sublicense, and/or sell copies
10 * of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
25 #include <config.h>
26 #include <glib.h>
27 #include <string.h>
28 #ifdef HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31 #include <errno.h>
32 #include "../utils/mono-errno.h"
34 #ifdef _MSC_VER
35 #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
36 #else
37 #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
38 #endif
41 #define UNROLL_DECODE_UTF8 0
42 #define UNROLL_ENCODE_UTF8 0
44 typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
45 typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
47 struct _GIConv {
48 Decoder decode;
49 Encoder encode;
50 gunichar c;
51 #ifdef HAVE_LIBICONV
52 iconv_t cd;
53 #endif
56 static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
57 static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
59 static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
60 static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
62 static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
63 static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
65 static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
66 static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
68 static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
69 static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
71 static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
72 static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
74 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
75 #define decode_utf32 decode_utf32le
76 #define encode_utf32 encode_utf32le
77 #define decode_utf16 decode_utf16le
78 #define encode_utf16 encode_utf16le
79 #else
80 #define decode_utf32 decode_utf32be
81 #define encode_utf32 encode_utf32be
82 #define decode_utf16 decode_utf16be
83 #define encode_utf16 encode_utf16be
84 #endif
86 static struct {
87 const char *name;
88 Decoder decoder;
89 Encoder encoder;
90 } charsets[] = {
91 { "ISO-8859-1", decode_latin1, encode_latin1 },
92 { "ISO8859-1", decode_latin1, encode_latin1 },
93 { "UTF-32BE", decode_utf32be, encode_utf32be },
94 { "UTF-32LE", decode_utf32le, encode_utf32le },
95 { "UTF-16BE", decode_utf16be, encode_utf16be },
96 { "UTF-16LE", decode_utf16le, encode_utf16le },
97 { "UTF-32", decode_utf32, encode_utf32 },
98 { "UTF-16", decode_utf16, encode_utf16 },
99 { "UTF-8", decode_utf8, encode_utf8 },
100 { "US-ASCII", decode_latin1, encode_latin1 },
101 { "Latin1", decode_latin1, encode_latin1 },
102 { "ASCII", decode_latin1, encode_latin1 },
103 { "UTF32", decode_utf32, encode_utf32 },
104 { "UTF16", decode_utf16, encode_utf16 },
105 { "UTF8", decode_utf8, encode_utf8 },
109 GIConv
110 g_iconv_open (const char *to_charset, const char *from_charset)
112 #ifdef HAVE_LIBICONV
113 iconv_t icd = (iconv_t) -1;
114 #endif
115 Decoder decoder = NULL;
116 Encoder encoder = NULL;
117 GIConv cd;
118 guint i;
120 if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
121 mono_set_errno (EINVAL);
123 return (GIConv) -1;
126 for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
127 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
128 decoder = charsets[i].decoder;
130 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
131 encoder = charsets[i].encoder;
134 if (!encoder || !decoder) {
135 #ifdef HAVE_LIBICONV
136 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
137 return (GIConv) -1;
138 #else
139 mono_set_errno (EINVAL);
141 return (GIConv) -1;
142 #endif
145 cd = (GIConv) g_malloc (sizeof (struct _GIConv));
146 cd->decode = decoder;
147 cd->encode = encoder;
148 cd->c = -1;
150 #ifdef HAVE_LIBICONV
151 cd->cd = icd;
152 #endif
154 return cd;
158 g_iconv_close (GIConv cd)
160 #ifdef HAVE_LIBICONV
161 if (cd->cd != (iconv_t) -1)
162 iconv_close (cd->cd);
163 #endif
165 g_free (cd);
167 return 0;
170 gsize
171 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
172 gchar **outbytes, gsize *outbytesleft)
174 gsize inleft, outleft;
175 char *inptr, *outptr;
176 gunichar c;
177 int rc = 0;
179 #ifdef HAVE_LIBICONV
180 if (cd->cd != (iconv_t) -1) {
181 /* Note: gsize may have a different size than size_t, so we need to
182 remap inbytesleft and outbytesleft to size_t's. */
183 size_t *outleftptr, *inleftptr;
184 size_t n_outleft, n_inleft;
186 if (inbytesleft) {
187 n_inleft = *inbytesleft;
188 inleftptr = &n_inleft;
189 } else {
190 inleftptr = NULL;
193 if (outbytesleft) {
194 n_outleft = *outbytesleft;
195 outleftptr = &n_outleft;
196 } else {
197 outleftptr = NULL;
199 #if defined(__NetBSD__)
200 return iconv (cd->cd, (const gchar **)inbytes, inleftptr, outbytes, outleftptr);
201 #else
202 return iconv (cd->cd, inbytes, inleftptr, outbytes, outleftptr);
203 #endif
205 #endif
207 if (outbytes == NULL || outbytesleft == NULL) {
208 /* reset converter */
209 cd->c = -1;
210 return 0;
213 inleft = inbytesleft ? *inbytesleft : 0;
214 inptr = inbytes ? *inbytes : NULL;
215 outleft = *outbytesleft;
216 outptr = *outbytes;
218 if ((c = cd->c) != (gunichar) -1)
219 goto encode;
221 while (inleft > 0) {
222 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
223 break;
225 inleft -= rc;
226 inptr += rc;
228 encode:
229 if ((rc = cd->encode (c, outptr, outleft)) < 0)
230 break;
232 c = (gunichar) -1;
233 outleft -= rc;
234 outptr += rc;
237 if (inbytesleft)
238 *inbytesleft = inleft;
240 if (inbytes)
241 *inbytes = inptr;
243 *outbytesleft = outleft;
244 *outbytes = outptr;
245 cd->c = c;
247 return rc < 0 ? -1 : 0;
/*
 * Unicode encoders and decoders
 */
254 static FORCE_INLINE (uint32_t)
255 read_uint32_endian (unsigned char *inptr, unsigned endian)
257 if (endian == G_LITTLE_ENDIAN)
258 return (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
259 return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
262 static int
263 decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
265 unsigned char *inptr = (unsigned char *) inbuf;
266 gunichar c;
268 if (inleft < 4) {
269 mono_set_errno (EINVAL);
270 return -1;
273 c = read_uint32_endian (inptr, endian);
275 if (c >= 0xd800 && c < 0xe000) {
276 mono_set_errno (EILSEQ);
277 return -1;
278 } else if (c >= 0x110000) {
279 mono_set_errno (EILSEQ);
280 return -1;
283 *outchar = c;
285 return 4;
288 static int
289 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
291 return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
294 static int
295 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
297 return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
300 static int
301 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
303 unsigned char *outptr = (unsigned char *) outbuf;
305 if (outleft < 4) {
306 mono_set_errno (E2BIG);
307 return -1;
310 outptr[0] = (c >> 24) & 0xff;
311 outptr[1] = (c >> 16) & 0xff;
312 outptr[2] = (c >> 8) & 0xff;
313 outptr[3] = c & 0xff;
315 return 4;
318 static int
319 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
321 unsigned char *outptr = (unsigned char *) outbuf;
323 if (outleft < 4) {
324 mono_set_errno (E2BIG);
325 return -1;
328 outptr[0] = c & 0xff;
329 outptr[1] = (c >> 8) & 0xff;
330 outptr[2] = (c >> 16) & 0xff;
331 outptr[3] = (c >> 24) & 0xff;
333 return 4;
336 static FORCE_INLINE (uint16_t)
337 read_uint16_endian (unsigned char *inptr, unsigned endian)
339 if (endian == G_LITTLE_ENDIAN)
340 return (inptr[1] << 8) | inptr[0];
341 return (inptr[0] << 8) | inptr[1];
344 static FORCE_INLINE (int)
345 decode_utf16_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
347 unsigned char *inptr = (unsigned char *) inbuf;
348 gunichar2 c;
349 gunichar u;
351 if (inleft < 2) {
352 mono_set_errno (E2BIG);
353 return -1;
356 u = read_uint16_endian (inptr, endian);
358 if (u < 0xd800) {
359 /* 0x0000 -> 0xd7ff */
360 *outchar = u;
361 return 2;
362 } else if (u < 0xdc00) {
363 /* 0xd800 -> 0xdbff */
364 if (inleft < 4) {
365 mono_set_errno (EINVAL);
366 return -2;
369 c = read_uint16_endian (inptr + 2, endian);
371 if (c < 0xdc00 || c > 0xdfff) {
372 mono_set_errno (EILSEQ);
373 return -2;
376 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
377 *outchar = u;
379 return 4;
380 } else if (u < 0xe000) {
381 /* 0xdc00 -> 0xdfff */
382 mono_set_errno (EILSEQ);
383 return -1;
384 } else {
385 /* 0xe000 -> 0xffff */
386 *outchar = u;
387 return 2;
391 static int
392 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
394 return decode_utf16_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
397 static int
398 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
400 return decode_utf16_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
403 static FORCE_INLINE (void)
404 write_uint16_endian (unsigned char *outptr, uint16_t c, unsigned endian)
406 if (endian == G_LITTLE_ENDIAN) {
407 outptr[0] = c & 0xff;
408 outptr[1] = (c >> 8) & 0xff;
409 return;
411 outptr[0] = (c >> 8) & 0xff;
412 outptr[1] = c & 0xff;
415 static FORCE_INLINE (int)
416 encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian)
418 unsigned char *outptr = (unsigned char *) outbuf;
419 gunichar2 ch;
420 gunichar c2;
422 if (c < 0x10000) {
423 if (outleft < 2) {
424 mono_set_errno (E2BIG);
425 return -1;
428 write_uint16_endian (outptr, c, endian);
429 return 2;
430 } else {
431 if (outleft < 4) {
432 mono_set_errno (E2BIG);
433 return -1;
436 c2 = c - 0x10000;
438 ch = (gunichar2) ((c2 >> 10) + 0xd800);
439 write_uint16_endian (outptr, ch, endian);
441 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
442 write_uint16_endian (outptr + 2, ch, endian);
443 return 4;
447 static int
448 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
450 return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN);
453 static int
454 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
456 return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN);
459 static FORCE_INLINE (int)
460 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
462 unsigned char *inptr = (unsigned char *) inbuf;
463 gunichar u;
464 int n, i;
466 u = *inptr;
468 if (u < 0x80) {
469 /* simple ascii case */
470 *outchar = u;
471 return 1;
472 } else if (u < 0xc2) {
473 mono_set_errno (EILSEQ);
474 return -1;
475 } else if (u < 0xe0) {
476 u &= 0x1f;
477 n = 2;
478 } else if (u < 0xf0) {
479 u &= 0x0f;
480 n = 3;
481 } else if (u < 0xf8) {
482 u &= 0x07;
483 n = 4;
484 } else if (u < 0xfc) {
485 u &= 0x03;
486 n = 5;
487 } else if (u < 0xfe) {
488 u &= 0x01;
489 n = 6;
490 } else {
491 mono_set_errno (EILSEQ);
492 return -1;
495 if (n > inleft) {
496 mono_set_errno (EINVAL);
497 return -1;
500 #if UNROLL_DECODE_UTF8
501 switch (n) {
502 case 6: u = (u << 6) | (*++inptr ^ 0x80);
503 case 5: u = (u << 6) | (*++inptr ^ 0x80);
504 case 4: u = (u << 6) | (*++inptr ^ 0x80);
505 case 3: u = (u << 6) | (*++inptr ^ 0x80);
506 case 2: u = (u << 6) | (*++inptr ^ 0x80);
508 #else
509 for (i = 1; i < n; i++)
510 u = (u << 6) | (*++inptr ^ 0x80);
511 #endif
513 *outchar = u;
515 return n;
518 static int
519 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
521 unsigned char *outptr = (unsigned char *) outbuf;
522 int base, n, i;
524 if (c < 0x80) {
525 outptr[0] = c;
526 return 1;
527 } else if (c < 0x800) {
528 base = 192;
529 n = 2;
530 } else if (c < 0x10000) {
531 base = 224;
532 n = 3;
533 } else if (c < 0x200000) {
534 base = 240;
535 n = 4;
536 } else if (c < 0x4000000) {
537 base = 248;
538 n = 5;
539 } else {
540 base = 252;
541 n = 6;
544 if (outleft < n) {
545 mono_set_errno (E2BIG);
546 return -1;
549 #if UNROLL_ENCODE_UTF8
550 switch (n) {
551 case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
552 case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
553 case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
554 case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
555 case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
556 case 1: outptr[0] = c | base;
558 #else
559 for (i = n - 1; i > 0; i--) {
560 outptr[i] = (c & 0x3f) | 0x80;
561 c >>= 6;
564 outptr[0] = c | base;
565 #endif
567 return n;
570 static int
571 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
573 *outchar = (unsigned char) *inbuf;
574 return 1;
577 static int
578 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
580 if (outleft < 1) {
581 mono_set_errno (E2BIG);
582 return -1;
585 if (c > 0xff) {
586 mono_set_errno (EILSEQ);
587 return -1;
590 *outbuf = (char) c;
592 return 1;
/*
 * Simple conversion API
 */
600 static gpointer error_quark = (gpointer)"ConvertError";
602 gpointer
603 g_convert_error_quark (void)
605 return error_quark;
608 gchar *
609 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
610 gsize *bytes_read, gsize *bytes_written, GError **err)
612 gsize outsize, outused, outleft, inleft, grow, rc;
613 char *result, *outbuf, *inbuf;
614 gboolean flush = FALSE;
615 gboolean done = FALSE;
616 GIConv cd;
618 g_return_val_if_fail (str != NULL, NULL);
619 g_return_val_if_fail (to_charset != NULL, NULL);
620 g_return_val_if_fail (from_charset != NULL, NULL);
622 if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
623 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
624 "Conversion from %s to %s not supported.",
625 from_charset, to_charset);
627 if (bytes_written)
628 *bytes_written = 0;
630 if (bytes_read)
631 *bytes_read = 0;
633 return NULL;
636 inleft = len < 0 ? strlen (str) : len;
637 inbuf = (char *) str;
639 outleft = outsize = MAX (inleft, 8);
640 outbuf = result = g_malloc (outsize + 4);
642 do {
643 if (!flush)
644 rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
645 else
646 rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
648 if (rc == (gsize) -1) {
649 switch (errno) {
650 case E2BIG:
651 /* grow our result buffer */
652 grow = MAX (inleft, 8) << 1;
653 outused = outbuf - result;
654 outsize += grow;
655 outleft += grow;
656 result = g_realloc (result, outsize + 4);
657 outbuf = result + outused;
658 break;
659 case EINVAL:
660 /* incomplete input, stop converting and terminate here */
661 if (flush)
662 done = TRUE;
663 else
664 flush = TRUE;
665 break;
666 case EILSEQ:
667 /* illegal sequence in the input */
668 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
670 if (bytes_read) {
671 /* save offset of the illegal input sequence */
672 *bytes_read = (inbuf - str);
675 if (bytes_written)
676 *bytes_written = 0;
678 g_iconv_close (cd);
679 g_free (result);
680 return NULL;
681 default:
682 /* unknown errno */
683 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
685 if (bytes_written)
686 *bytes_written = 0;
688 if (bytes_read)
689 *bytes_read = 0;
691 g_iconv_close (cd);
692 g_free (result);
693 return NULL;
695 } else if (flush) {
696 /* input has been converted and output has been flushed */
697 break;
698 } else {
699 /* input has been converted, need to flush the output */
700 flush = TRUE;
702 } while (!done);
704 g_iconv_close (cd);
706 /* Note: not all charsets can be null-terminated with a single
707 null byte. UCS2, for example, needs 2 null bytes and UCS4
708 needs 4. I hope that 4 null bytes is enough to terminate all
709 multibyte charsets? */
711 /* null-terminate the result */
712 memset (outbuf, 0, 4);
714 if (bytes_written)
715 *bytes_written = outbuf - result;
717 if (bytes_read)
718 *bytes_read = inbuf - str;
720 return result;
/*
 * Unicode conversion
 */

/*
 * An explanation of the conversion can be found at:
 * http://home.tiscali.nl/t876506/utf8tbl.html
 */
733 gint
734 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
736 int base, n, i;
738 if (c < 0x80) {
739 base = 0;
740 n = 1;
741 } else if (c < 0x800) {
742 base = 192;
743 n = 2;
744 } else if (c < 0x10000) {
745 base = 224;
746 n = 3;
747 } else if (c < 0x200000) {
748 base = 240;
749 n = 4;
750 } else if (c < 0x4000000) {
751 base = 248;
752 n = 5;
753 } else if (c < 0x80000000) {
754 base = 252;
755 n = 6;
756 } else {
757 return -1;
760 if (outbuf != NULL) {
761 for (i = n - 1; i > 0; i--) {
762 /* mask off 6 bits worth and add 128 */
763 outbuf[i] = (c & 0x3f) | 0x80;
764 c >>= 6;
767 /* first character has a different base */
768 outbuf[0] = c | base;
771 return n;
774 static FORCE_INLINE (int)
775 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
777 gunichar c2;
779 if (c < 0xd800) {
780 if (outbuf)
781 *outbuf = (gunichar2) c;
783 return 1;
784 } else if (c < 0xe000) {
785 return -1;
786 } else if (c < 0x10000) {
787 if (outbuf)
788 *outbuf = (gunichar2) c;
790 return 1;
791 } else if (c < 0x110000) {
792 if (outbuf) {
793 c2 = c - 0x10000;
795 outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
796 outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
799 return 2;
800 } else {
801 return -1;
805 gunichar *
806 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
808 gunichar *outbuf, *outptr;
809 char *inptr;
810 glong n, i;
812 g_return_val_if_fail (str != NULL, NULL);
814 n = g_utf8_strlen (str, len);
816 if (items_written)
817 *items_written = n;
819 outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
820 inptr = (char *) str;
822 for (i = 0; i < n; i++) {
823 *outptr++ = g_utf8_get_char (inptr);
824 inptr = g_utf8_next_char (inptr);
827 *outptr = 0;
829 return outbuf;
832 static gunichar2 *
833 eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GError **err)
835 gunichar2 *outbuf, *outptr;
836 size_t outlen = 0;
837 size_t inleft;
838 char *inptr;
839 gunichar c;
840 int u, n;
842 g_return_val_if_fail (str != NULL, NULL);
844 if (len < 0) {
845 if (include_nuls) {
846 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
847 return NULL;
850 len = strlen (str);
853 inptr = (char *) str;
854 inleft = len;
856 while (inleft > 0) {
857 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
858 goto error;
860 if (c == 0 && !include_nuls)
861 break;
863 if ((u = g_unichar_to_utf16 (c, NULL)) < 0) {
864 if (replace_invalid_codepoints) {
865 u = 2;
866 } else {
867 mono_set_errno (EILSEQ);
868 goto error;
872 outlen += u;
873 inleft -= n;
874 inptr += n;
877 if (items_read)
878 *items_read = inptr - str;
880 if (items_written)
881 *items_written = outlen;
883 outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
884 inptr = (char *) str;
885 inleft = len;
887 while (inleft > 0) {
888 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
889 break;
891 if (c == 0 && !include_nuls)
892 break;
894 u = g_unichar_to_utf16 (c, outptr);
895 if ((u < 0) && replace_invalid_codepoints) {
896 outptr[0] = 0xFFFD;
897 outptr[1] = 0xFFFD;
898 u = 2;
901 outptr += u;
902 inleft -= n;
903 inptr += n;
906 *outptr = '\0';
908 return outbuf;
910 error:
911 if (errno == EILSEQ) {
912 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
913 "Illegal byte sequence encounted in the input.");
914 } else if (items_read) {
915 /* partial input is ok if we can let our caller know... */
916 } else {
917 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
918 "Partial byte sequence encountered in the input.");
921 if (items_read)
922 *items_read = inptr - str;
924 if (items_written)
925 *items_written = 0;
927 return NULL;
930 gunichar2 *
931 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
933 return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, err);
936 gunichar2 *
937 eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
939 return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, err);
942 gunichar2 *
943 eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
945 return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, err);
948 gunichar *
949 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
951 gunichar *outbuf, *outptr;
952 size_t outlen = 0;
953 size_t inleft;
954 char *inptr;
955 gunichar c;
956 int n;
958 g_return_val_if_fail (str != NULL, NULL);
960 if (len < 0)
961 len = strlen (str);
963 inptr = (char *) str;
964 inleft = len;
966 while (inleft > 0) {
967 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
968 if (errno == EILSEQ) {
969 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
970 "Illegal byte sequence encounted in the input.");
971 } else if (items_read) {
972 /* partial input is ok if we can let our caller know... */
973 break;
974 } else {
975 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
976 "Partial byte sequence encountered in the input.");
979 if (items_read)
980 *items_read = inptr - str;
982 if (items_written)
983 *items_written = 0;
985 return NULL;
986 } else if (c == 0)
987 break;
989 outlen += 4;
990 inleft -= n;
991 inptr += n;
994 if (items_written)
995 *items_written = outlen / 4;
997 if (items_read)
998 *items_read = inptr - str;
1000 outptr = outbuf = g_malloc (outlen + 4);
1001 inptr = (char *) str;
1002 inleft = len;
1004 while (inleft > 0) {
1005 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1006 break;
1007 else if (c == 0)
1008 break;
1010 *outptr++ = c;
1011 inleft -= n;
1012 inptr += n;
1015 *outptr = 0;
1017 return outbuf;
1020 gchar *
1021 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1023 char *inptr, *outbuf, *outptr;
1024 size_t outlen = 0;
1025 size_t inleft;
1026 gunichar c;
1027 int n;
1029 g_return_val_if_fail (str != NULL, NULL);
1031 if (len < 0) {
1032 len = 0;
1033 while (str[len])
1034 len++;
1037 inptr = (char *) str;
1038 inleft = len * 2;
1040 while (inleft > 0) {
1041 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1042 if (n == -2 && inleft > 2) {
1043 /* This means that the first UTF-16 char was read, but second failed */
1044 inleft -= 2;
1045 inptr += 2;
1048 if (errno == EILSEQ) {
1049 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1050 "Illegal byte sequence encounted in the input.");
1051 } else if (items_read) {
1052 /* partial input is ok if we can let our caller know... */
1053 break;
1054 } else {
1055 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1056 "Partial byte sequence encountered in the input.");
1059 if (items_read)
1060 *items_read = (inptr - (char *) str) / 2;
1062 if (items_written)
1063 *items_written = 0;
1065 return NULL;
1066 } else if (c == 0)
1067 break;
1069 outlen += g_unichar_to_utf8 (c, NULL);
1070 inleft -= n;
1071 inptr += n;
1074 if (items_read)
1075 *items_read = (inptr - (char *) str) / 2;
1077 if (items_written)
1078 *items_written = outlen;
1080 outptr = outbuf = g_malloc (outlen + 1);
1081 inptr = (char *) str;
1082 inleft = len * 2;
1084 while (inleft > 0) {
1085 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1086 break;
1087 else if (c == 0)
1088 break;
1090 outptr += g_unichar_to_utf8 (c, outptr);
1091 inleft -= n;
1092 inptr += n;
1095 *outptr = '\0';
1097 return outbuf;
1100 gunichar *
1101 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1103 gunichar *outbuf, *outptr;
1104 size_t outlen = 0;
1105 size_t inleft;
1106 char *inptr;
1107 gunichar c;
1108 int n;
1110 g_return_val_if_fail (str != NULL, NULL);
1112 if (len < 0) {
1113 len = 0;
1114 while (str[len])
1115 len++;
1118 inptr = (char *) str;
1119 inleft = len * 2;
1121 while (inleft > 0) {
1122 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1123 if (n == -2 && inleft > 2) {
1124 /* This means that the first UTF-16 char was read, but second failed */
1125 inleft -= 2;
1126 inptr += 2;
1129 if (errno == EILSEQ) {
1130 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1131 "Illegal byte sequence encounted in the input.");
1132 } else if (items_read) {
1133 /* partial input is ok if we can let our caller know... */
1134 break;
1135 } else {
1136 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1137 "Partial byte sequence encountered in the input.");
1140 if (items_read)
1141 *items_read = (inptr - (char *) str) / 2;
1143 if (items_written)
1144 *items_written = 0;
1146 return NULL;
1147 } else if (c == 0)
1148 break;
1150 outlen += 4;
1151 inleft -= n;
1152 inptr += n;
1155 if (items_read)
1156 *items_read = (inptr - (char *) str) / 2;
1158 if (items_written)
1159 *items_written = outlen / 4;
1161 outptr = outbuf = g_malloc (outlen + 4);
1162 inptr = (char *) str;
1163 inleft = len * 2;
1165 while (inleft > 0) {
1166 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1167 break;
1168 else if (c == 0)
1169 break;
1171 *outptr++ = c;
1172 inleft -= n;
1173 inptr += n;
1176 *outptr = 0;
1178 return outbuf;
1181 gchar *
1182 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1184 char *outbuf, *outptr;
1185 size_t outlen = 0;
1186 glong i;
1187 int n;
1189 g_return_val_if_fail (str != NULL, NULL);
1191 if (len < 0) {
1192 for (i = 0; str[i] != 0; i++) {
1193 if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1194 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1195 "Illegal byte sequence encounted in the input.");
1197 if (items_written)
1198 *items_written = 0;
1200 if (items_read)
1201 *items_read = i;
1203 return NULL;
1206 outlen += n;
1208 } else {
1209 for (i = 0; i < len && str[i] != 0; i++) {
1210 if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1211 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1212 "Illegal byte sequence encounted in the input.");
1214 if (items_written)
1215 *items_written = 0;
1217 if (items_read)
1218 *items_read = i;
1220 return NULL;
1223 outlen += n;
1227 len = i;
1229 outptr = outbuf = g_malloc (outlen + 1);
1230 for (i = 0; i < len; i++)
1231 outptr += g_unichar_to_utf8 (str[i], outptr);
1232 *outptr = 0;
1234 if (items_written)
1235 *items_written = outlen;
1237 if (items_read)
1238 *items_read = i;
1240 return outbuf;
1243 gunichar2 *
1244 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1246 gunichar2 *outbuf, *outptr;
1247 size_t outlen = 0;
1248 glong i;
1249 int n;
1251 g_return_val_if_fail (str != NULL, NULL);
1253 if (len < 0) {
1254 for (i = 0; str[i] != 0; i++) {
1255 if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1256 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1257 "Illegal byte sequence encounted in the input.");
1259 if (items_written)
1260 *items_written = 0;
1262 if (items_read)
1263 *items_read = i;
1265 return NULL;
1268 outlen += n;
1270 } else {
1271 for (i = 0; i < len && str[i] != 0; i++) {
1272 if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1273 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1274 "Illegal byte sequence encounted in the input.");
1276 if (items_written)
1277 *items_written = 0;
1279 if (items_read)
1280 *items_read = i;
1282 return NULL;
1285 outlen += n;
1289 len = i;
1291 outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1292 for (i = 0; i < len; i++)
1293 outptr += g_unichar_to_utf16 (str[i], outptr);
1294 *outptr = 0;
1296 if (items_written)
1297 *items_written = outlen;
1299 if (items_read)
1300 *items_read = i;
1302 return outbuf;