mono/eglib/giconv.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Copyright (C) 2011 Jeffrey Stedfast
   4  *
   5  *  Permission is hereby granted, free of charge, to any person
   6  *  obtaining a copy of this software and associated documentation
   7  *  files (the "Software"), to deal in the Software without
   8  *  restriction, including without limitation the rights to use, copy,
   9  *  modify, merge, publish, distribute, sublicense, and/or sell copies
  10  *  of the Software, and to permit persons to whom the Software is
  11  *  furnished to do so, subject to the following conditions:
  12  *
  13  *  The above copyright notice and this permission notice shall be
  14  *  included in all copies or substantial portions of the Software.
  15  *
  16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  20  *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  21  *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  *  DEALINGS IN THE SOFTWARE.
  24  */
  25 #include <config.h>
  26 #include <glib.h>
  27 #include <string.h>
  28 #ifdef HAVE_ICONV_H
  29 #include <iconv.h>
  30 #endif
  31 #include <errno.h>
  32 #include "../utils/mono-errno.h"
  33
/*
 * FORCE_INLINE (RET_TYPE) spells "function returning RET_TYPE that must be
 * inlined" for the current compiler: __forceinline under MSVC, otherwise
 * inline combined with the always_inline attribute.
 */
#ifdef _MSC_VER
#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
#else
#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
#endif
  39
  40
/* Compile-time switches: non-zero selects the hand-unrolled switch statements
 * guarded by these macros in the UTF-8 codecs instead of the loop forms. */
#define UNROLL_DECODE_UTF8 0
#define UNROLL_ENCODE_UTF8 0

/* Decoder: reads one character from inbuf (inleft bytes available) into
 * *outchar; returns the number of bytes consumed, or a negative value with
 * errno set on failure. */
typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
/* Encoder: writes character c into outbuf (outleft bytes available); returns
 * the number of bytes written, or -1 with errno set on failure. */
typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
  46
/* Conversion descriptor handed out by g_iconv_open (). */
struct _GIConv {
        /* Built-in codec pair; either may be NULL when the charset is not in
         * the built-in table. */
        Decoder decode;
        Encoder encode;
        /* Character that was decoded but not yet encoded; (gunichar) -1
         * when nothing is pending. */
        gunichar c;
#ifdef HAVE_LIBICONV
        /* Native iconv descriptor, or (iconv_t) -1 when the built-in codecs
         * are used. */
        iconv_t cd;
#endif
};
  55
/* Built-in codecs. They are declared up front because the charsets table
 * refers to them before their definitions appear. */
static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);

static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);

static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);

static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);

/* The byte-order-less UTF-32/UTF-16 codecs are aliases for the variant that
 * matches the host byte order. */
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
#define decode_utf32 decode_utf32le
#define encode_utf32 encode_utf32le
#define decode_utf16 decode_utf16le
#define encode_utf16 encode_utf16le
#else
#define decode_utf32 decode_utf32be
#define encode_utf32 encode_utf32be
#define decode_utf16 decode_utf16be
#define encode_utf16 encode_utf16be
#endif
  85
  86 static struct {
  87         const char *name;
  88         Decoder decoder;
  89         Encoder encoder;
  90 } charsets[] = {
  91         { "ISO-8859-1", decode_latin1,  encode_latin1  },
  92         { "ISO8859-1",  decode_latin1,  encode_latin1  },
  93         { "UTF-32BE",   decode_utf32be, encode_utf32be },
  94         { "UTF-32LE",   decode_utf32le, encode_utf32le },
  95         { "UTF-16BE",   decode_utf16be, encode_utf16be },
  96         { "UTF-16LE",   decode_utf16le, encode_utf16le },
  97         { "UTF-32",     decode_utf32,   encode_utf32   },
  98         { "UTF-16",     decode_utf16,   encode_utf16   },
  99         { "UTF-8",      decode_utf8,    encode_utf8    },
 100         { "US-ASCII",   decode_latin1,  encode_latin1  },
 101         { "Latin1",     decode_latin1,  encode_latin1  },
 102         { "ASCII",      decode_latin1,  encode_latin1  },
 103         { "UTF32",      decode_utf32,   encode_utf32   },
 104         { "UTF16",      decode_utf16,   encode_utf16   },
 105         { "UTF8",       decode_utf8,    encode_utf8    },
 106 };
 107
 108
 109 GIConv
 110 g_iconv_open (const char *to_charset, const char *from_charset)
 111 {
 112 #ifdef HAVE_LIBICONV
 113         iconv_t icd = (iconv_t) -1;
 114 #endif
 115         Decoder decoder = NULL;
 116         Encoder encoder = NULL;
 117         GIConv cd;
 118         guint i;
 119
 120         if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
 121                 mono_set_errno (EINVAL);
 122
 123                 return (GIConv) -1;
 124         }
 125
 126         for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
 127                 if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
 128                         decoder = charsets[i].decoder;
 129
 130                 if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
 131                         encoder = charsets[i].encoder;
 132         }
 133
 134         if (!encoder || !decoder) {
 135 #ifdef HAVE_LIBICONV
 136                 if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
 137                         return (GIConv) -1;
 138 #else
 139                 mono_set_errno (EINVAL);
 140
 141                 return (GIConv) -1;
 142 #endif
 143         }
 144
 145         cd = (GIConv) g_malloc (sizeof (struct _GIConv));
 146         cd->decode = decoder;
 147         cd->encode = encoder;
 148         cd->c = -1;
 149
 150 #ifdef HAVE_LIBICONV
 151         cd->cd = icd;
 152 #endif
 153
 154         return cd;
 155 }
 156
 157 int
 158 g_iconv_close (GIConv cd)
 159 {
 160 #ifdef HAVE_LIBICONV
 161         if (cd->cd != (iconv_t) -1)
 162                 iconv_close (cd->cd);
 163 #endif
 164
 165         g_free (cd);
 166
 167         return 0;
 168 }
 169
 170 gsize
 171 g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
 172          gchar **outbytes, gsize *outbytesleft)
 173 {
 174         gsize inleft, outleft;
 175         char *inptr, *outptr;
 176         gunichar c;
 177         int rc = 0;
 178
 179 #ifdef HAVE_LIBICONV
 180         if (cd->cd != (iconv_t) -1) {
 181                 /* Note: gsize may have a different size than size_t, so we need to
 182                    remap inbytesleft and outbytesleft to size_t's. */
 183                 size_t *outleftptr, *inleftptr;
 184                 size_t n_outleft, n_inleft;
 185
 186                 if (inbytesleft) {
 187                         n_inleft = *inbytesleft;
 188                         inleftptr = &n_inleft;
 189                 } else {
 190                         inleftptr = NULL;
 191                 }
 192
 193                 if (outbytesleft) {
 194                         n_outleft = *outbytesleft;
 195                         outleftptr = &n_outleft;
 196                 } else {
 197                         outleftptr = NULL;
 198                 }
 199 #if defined(__NetBSD__)
 200                 return iconv (cd->cd, (const gchar **)inbytes, inleftptr, outbytes, outleftptr);
 201 #else
 202                 return iconv (cd->cd, inbytes, inleftptr, outbytes, outleftptr);
 203 #endif
 204         }
 205 #endif
 206
 207         if (outbytes == NULL || outbytesleft == NULL) {
 208                 /* reset converter */
 209                 cd->c = -1;
 210                 return 0;
 211         }
 212
 213         inleft = inbytesleft ? *inbytesleft : 0;
 214         inptr = inbytes ? *inbytes : NULL;
 215         outleft = *outbytesleft;
 216         outptr = *outbytes;
 217
 218         if ((c = cd->c) != (gunichar) -1)
 219                 goto encode;
 220
 221         while (inleft > 0) {
 222                 if ((rc = cd->decode (inptr, inleft, &c)) < 0)
 223                         break;
 224
 225                 inleft -= rc;
 226                 inptr += rc;
 227
 228         encode:
 229                 if ((rc = cd->encode (c, outptr, outleft)) < 0)
 230                         break;
 231
 232                 c = (gunichar) -1;
 233                 outleft -= rc;
 234                 outptr += rc;
 235         }
 236
 237         if (inbytesleft)
 238                 *inbytesleft = inleft;
 239
 240         if (inbytes)
 241                 *inbytes = inptr;
 242
 243         *outbytesleft = outleft;
 244         *outbytes = outptr;
 245         cd->c = c;
 246
 247         return rc < 0 ? -1 : 0;
 248 }
 249
 250 /*
 251  * Unicode encoders and decoders
 252  */
 253
 254 static FORCE_INLINE (uint32_t)
 255 read_uint32_endian (unsigned char *inptr, unsigned endian)
 256 {
 257         if (endian == G_LITTLE_ENDIAN)
 258                 return (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
 259         return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
 260 }
 261
 262 static int
 263 decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
 264 {
 265         unsigned char *inptr = (unsigned char *) inbuf;
 266         gunichar c;
 267
 268         if (inleft < 4) {
 269                 mono_set_errno (EINVAL);
 270                 return -1;
 271         }
 272
 273         c = read_uint32_endian (inptr, endian);
 274
 275         if (c >= 0xd800 && c < 0xe000) {
 276                 mono_set_errno (EILSEQ);
 277                 return -1;
 278         } else if (c >= 0x110000) {
 279                 mono_set_errno (EILSEQ);
 280                 return -1;
 281         }
 282
 283         *outchar = c;
 284
 285         return 4;
 286 }
 287
 288 static int
 289 decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
 290 {
 291         return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
 292 }
 293
 294 static int
 295 decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
 296 {
 297         return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
 298 }
 299
 300 static int
 301 encode_utf32be (gunichar c, char *outbuf, size_t outleft)
 302 {
 303         unsigned char *outptr = (unsigned char *) outbuf;
 304
 305         if (outleft < 4) {
 306                 mono_set_errno (E2BIG);
 307                 return -1;
 308         }
 309
 310         outptr[0] = (c >> 24) & 0xff;
 311         outptr[1] = (c >> 16) & 0xff;
 312         outptr[2] = (c >> 8) & 0xff;
 313         outptr[3] = c & 0xff;
 314
 315         return 4;
 316 }
 317
 318 static int
 319 encode_utf32le (gunichar c, char *outbuf, size_t outleft)
 320 {
 321         unsigned char *outptr = (unsigned char *) outbuf;
 322
 323         if (outleft < 4) {
 324                 mono_set_errno (E2BIG);
 325                 return -1;
 326         }
 327
 328         outptr[0] = c & 0xff;
 329         outptr[1] = (c >> 8) & 0xff;
 330         outptr[2] = (c >> 16) & 0xff;
 331         outptr[3] = (c >> 24) & 0xff;
 332
 333         return 4;
 334 }
 335
 336 static FORCE_INLINE (uint16_t)
 337 read_uint16_endian (unsigned char *inptr, unsigned endian)
 338 {
 339         if (endian == G_LITTLE_ENDIAN)
 340                 return (inptr[1] << 8) | inptr[0];
 341         return (inptr[0] << 8) | inptr[1];
 342 }
 343
 344 static FORCE_INLINE (int)
 345 decode_utf16_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
 346 {
 347         unsigned char *inptr = (unsigned char *) inbuf;
 348         gunichar2 c;
 349         gunichar u;
 350
 351         if (inleft < 2) {
 352                 mono_set_errno (E2BIG);
 353                 return -1;
 354         }
 355
 356         u = read_uint16_endian (inptr, endian);
 357
 358         if (u < 0xd800) {
 359                 /* 0x0000 -> 0xd7ff */
 360                 *outchar = u;
 361                 return 2;
 362         } else if (u < 0xdc00) {
 363                 /* 0xd800 -> 0xdbff */
 364                 if (inleft < 4) {
 365                         mono_set_errno (EINVAL);
 366                         return -2;
 367                 }
 368
 369                 c = read_uint16_endian (inptr + 2, endian);
 370
 371                 if (c < 0xdc00 || c > 0xdfff) {
 372                         mono_set_errno (EILSEQ);
 373                         return -2;
 374                 }
 375
 376                 u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
 377                 *outchar = u;
 378
 379                 return 4;
 380         } else if (u < 0xe000) {
 381                 /* 0xdc00 -> 0xdfff */
 382                 mono_set_errno (EILSEQ);
 383                 return -1;
 384         } else {
 385                 /* 0xe000 -> 0xffff */
 386                 *outchar = u;
 387                 return 2;
 388         }
 389 }
 390
 391 static int
 392 decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
 393 {
 394         return decode_utf16_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
 395 }
 396
 397 static int
 398 decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
 399 {
 400         return decode_utf16_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
 401 }
 402
 403 static FORCE_INLINE (void)
 404 write_uint16_endian (unsigned char *outptr, uint16_t c, unsigned endian)
 405 {
 406         if (endian == G_LITTLE_ENDIAN) {
 407                 outptr[0] = c & 0xff;
 408                 outptr[1] = (c >> 8) & 0xff;
 409                 return;
 410         }
 411         outptr[0] = (c >> 8) & 0xff;
 412         outptr[1] = c & 0xff;
 413 }
 414
 415 static FORCE_INLINE (int)
 416 encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian)
 417 {
 418         unsigned char *outptr = (unsigned char *) outbuf;
 419         gunichar2 ch;
 420         gunichar c2;
 421
 422         if (c < 0x10000) {
 423                 if (outleft < 2) {
 424                         mono_set_errno (E2BIG);
 425                         return -1;
 426                 }
 427
 428                 write_uint16_endian (outptr, c, endian);
 429                 return 2;
 430         } else {
 431                 if (outleft < 4) {
 432                         mono_set_errno (E2BIG);
 433                         return -1;
 434                 }
 435
 436                 c2 = c - 0x10000;
 437
 438                 ch = (gunichar2) ((c2 >> 10) + 0xd800);
 439                 write_uint16_endian (outptr, ch, endian);
 440
 441                 ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 442                 write_uint16_endian (outptr + 2, ch, endian);
 443                 return 4;
 444         }
 445 }
 446
 447 static int
 448 encode_utf16be (gunichar c, char *outbuf, size_t outleft)
 449 {
 450         return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN);
 451 }
 452
 453 static int
 454 encode_utf16le (gunichar c, char *outbuf, size_t outleft)
 455 {
 456         return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN);
 457 }
 458
 459 static FORCE_INLINE (int)
 460 decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
 461 {
 462         unsigned char *inptr = (unsigned char *) inbuf;
 463         gunichar u;
 464         int n, i;
 465
 466         u = *inptr;
 467
 468         if (u < 0x80) {
 469                 /* simple ascii case */
 470                 *outchar = u;
 471                 return 1;
 472         } else if (u < 0xc2) {
 473                 mono_set_errno (EILSEQ);
 474                 return -1;
 475         } else if (u < 0xe0) {
 476                 u &= 0x1f;
 477                 n = 2;
 478         } else if (u < 0xf0) {
 479                 u &= 0x0f;
 480                 n = 3;
 481         } else if (u < 0xf8) {
 482                 u &= 0x07;
 483                 n = 4;
 484         } else if (u < 0xfc) {
 485                 u &= 0x03;
 486                 n = 5;
 487         } else if (u < 0xfe) {
 488                 u &= 0x01;
 489                 n = 6;
 490         } else {
 491                 mono_set_errno (EILSEQ);
 492                 return -1;
 493         }
 494
 495         if (n > inleft) {
 496                 mono_set_errno (EINVAL);
 497                 return -1;
 498         }
 499
 500 #if UNROLL_DECODE_UTF8
 501         switch (n) {
 502         case 6: u = (u << 6) | (*++inptr ^ 0x80);
 503         case 5: u = (u << 6) | (*++inptr ^ 0x80);
 504         case 4: u = (u << 6) | (*++inptr ^ 0x80);
 505         case 3: u = (u << 6) | (*++inptr ^ 0x80);
 506         case 2: u = (u << 6) | (*++inptr ^ 0x80);
 507         }
 508 #else
 509         for (i = 1; i < n; i++)
 510                 u = (u << 6) | (*++inptr ^ 0x80);
 511 #endif
 512
 513         *outchar = u;
 514
 515         return n;
 516 }
 517
 518 static int
 519 encode_utf8 (gunichar c, char *outbuf, size_t outleft)
 520 {
 521         unsigned char *outptr = (unsigned char *) outbuf;
 522         int base, n, i;
 523
 524         if (c < 0x80) {
 525                 outptr[0] = c;
 526                 return 1;
 527         } else if (c < 0x800) {
 528                 base = 192;
 529                 n = 2;
 530         } else if (c < 0x10000) {
 531                 base = 224;
 532                 n = 3;
 533         } else if (c < 0x200000) {
 534                 base = 240;
 535                 n = 4;
 536         } else if (c < 0x4000000) {
 537                 base = 248;
 538                 n = 5;
 539         } else {
 540                 base = 252;
 541                 n = 6;
 542         }
 543
 544         if (outleft < n) {
 545                 mono_set_errno (E2BIG);
 546                 return -1;
 547         }
 548
 549 #if UNROLL_ENCODE_UTF8
 550         switch (n) {
 551         case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
 552         case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
 553         case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
 554         case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
 555         case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
 556         case 1: outptr[0] = c | base;
 557         }
 558 #else
 559         for (i = n - 1; i > 0; i--) {
 560                 outptr[i] = (c & 0x3f) | 0x80;
 561                 c >>= 6;
 562         }
 563
 564         outptr[0] = c | base;
 565 #endif
 566
 567         return n;
 568 }
 569
 570 static int
 571 decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
 572 {
 573         *outchar = (unsigned char) *inbuf;
 574         return 1;
 575 }
 576
 577 static int
 578 encode_latin1 (gunichar c, char *outbuf, size_t outleft)
 579 {
 580         if (outleft < 1) {
 581                 mono_set_errno (E2BIG);
 582                 return -1;
 583         }
 584
 585         if (c > 0xff) {
 586                 mono_set_errno (EILSEQ);
 587                 return -1;
 588         }
 589
 590         *outbuf = (char) c;
 591
 592         return 1;
 593 }
 594
 595
 596 /*
 597  * Simple conversion API
 598  */
 599
 600 static gpointer error_quark = (gpointer)"ConvertError";
 601
 602 gpointer
 603 g_convert_error_quark (void)
 604 {
 605         return error_quark;
 606 }
 607
 608 gchar *
 609 g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
 610            gsize *bytes_read, gsize *bytes_written, GError **err)
 611 {
 612         gsize outsize, outused, outleft, inleft, grow, rc;
 613         char *result, *outbuf, *inbuf;
 614         gboolean flush = FALSE;
 615         gboolean done = FALSE;
 616         GIConv cd;
 617
 618         g_return_val_if_fail (str != NULL, NULL);
 619         g_return_val_if_fail (to_charset != NULL, NULL);
 620         g_return_val_if_fail (from_charset != NULL, NULL);
 621
 622         if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
 623                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 624                              "Conversion from %s to %s not supported.",
 625                              from_charset, to_charset);
 626
 627                 if (bytes_written)
 628                         *bytes_written = 0;
 629
 630                 if (bytes_read)
 631                         *bytes_read = 0;
 632
 633                 return NULL;
 634         }
 635
 636         inleft = len < 0 ? strlen (str) : len;
 637         inbuf = (char *) str;
 638
 639         outleft = outsize = MAX (inleft, 8);
 640         outbuf = result = g_malloc (outsize + 4);
 641
 642         do {
 643                 if (!flush)
 644                         rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
 645                 else
 646                         rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
 647
 648                 if (rc == (gsize) -1) {
 649                         switch (errno) {
 650                         case E2BIG:
 651                                 /* grow our result buffer */
 652                                 grow = MAX (inleft, 8) << 1;
 653                                 outused = outbuf - result;
 654                                 outsize += grow;
 655                                 outleft += grow;
 656                                 result = g_realloc (result, outsize + 4);
 657                                 outbuf = result + outused;
 658                                 break;
 659                         case EINVAL:
 660                                 /* incomplete input, stop converting and terminate here */
 661                                 if (flush)
 662                                         done = TRUE;
 663                                 else
 664                                         flush = TRUE;
 665                                 break;
 666                         case EILSEQ:
 667                                 /* illegal sequence in the input */
 668                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
 669
 670                                 if (bytes_read) {
 671                                         /* save offset of the illegal input sequence */
 672                                         *bytes_read = (inbuf - str);
 673                                 }
 674
 675                                 if (bytes_written)
 676                                         *bytes_written = 0;
 677
 678                                 g_iconv_close (cd);
 679                                 g_free (result);
 680                                 return NULL;
 681                         default:
 682                                 /* unknown errno */
 683                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
 684
 685                                 if (bytes_written)
 686                                         *bytes_written = 0;
 687
 688                                 if (bytes_read)
 689                                         *bytes_read = 0;
 690
 691                                 g_iconv_close (cd);
 692                                 g_free (result);
 693                                 return NULL;
 694                         }
 695                 } else if (flush) {
 696                         /* input has been converted and output has been flushed */
 697                         break;
 698                 } else {
 699                         /* input has been converted, need to flush the output */
 700                         flush = TRUE;
 701                 }
 702         } while (!done);
 703
 704         g_iconv_close (cd);
 705
 706         /* Note: not all charsets can be null-terminated with a single
 707            null byte. UCS2, for example, needs 2 null bytes and UCS4
 708            needs 4. I hope that 4 null bytes is enough to terminate all
 709            multibyte charsets? */
 710
 711         /* null-terminate the result */
 712         memset (outbuf, 0, 4);
 713
 714         if (bytes_written)
 715                 *bytes_written = outbuf - result;
 716
 717         if (bytes_read)
 718                 *bytes_read = inbuf - str;
 719
 720         return result;
 721 }
 722
 723
 724 /*
 725  * Unicode conversion
 726  */
 727
 728 /**
 729  * An explanation of the conversion can be found at:
 730  * http://home.tiscali.nl/t876506/utf8tbl.html
 731  *
 732  **/
 733 gint
 734 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 735 {
 736         int base, n, i;
 737
 738         if (c < 0x80) {
 739                 base = 0;
 740                 n = 1;
 741         } else if (c < 0x800) {
 742                 base = 192;
 743                 n = 2;
 744         } else if (c < 0x10000) {
 745                 base = 224;
 746                 n = 3;
 747         } else if (c < 0x200000) {
 748                 base = 240;
 749                 n = 4;
 750         } else if (c < 0x4000000) {
 751                 base = 248;
 752                 n = 5;
 753         } else if (c < 0x80000000) {
 754                 base = 252;
 755                 n = 6;
 756         } else {
 757                 return -1;
 758         }
 759
 760         if (outbuf != NULL) {
 761                 for (i = n - 1; i > 0; i--) {
 762                         /* mask off 6 bits worth and add 128 */
 763                         outbuf[i] = (c & 0x3f) | 0x80;
 764                         c >>= 6;
 765                 }
 766
 767                 /* first character has a different base */
 768                 outbuf[0] = c | base;
 769         }
 770
 771         return n;
 772 }
 773
 774 static FORCE_INLINE (int)
 775 g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
 776 {
 777         gunichar c2;
 778
 779         if (c < 0xd800) {
 780                 if (outbuf)
 781                         *outbuf = (gunichar2) c;
 782
 783                 return 1;
 784         } else if (c < 0xe000) {
 785                 return -1;
 786         } else if (c < 0x10000) {
 787                 if (outbuf)
 788                         *outbuf = (gunichar2) c;
 789
 790                 return 1;
 791         } else if (c < 0x110000) {
 792                 if (outbuf) {
 793                         c2 = c - 0x10000;
 794
 795                         outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
 796                         outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
 797                 }
 798
 799                 return 2;
 800         } else {
 801                 return -1;
 802         }
 803 }
 804
 805 gunichar *
 806 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 807 {
 808         gunichar *outbuf, *outptr;
 809         char *inptr;
 810         glong n, i;
 811
 812         g_return_val_if_fail (str != NULL, NULL);
 813
 814         n = g_utf8_strlen (str, len);
 815
 816         if (items_written)
 817                 *items_written = n;
 818
 819         outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
 820         inptr = (char *) str;
 821
 822         for (i = 0; i < n; i++) {
 823                 *outptr++ = g_utf8_get_char (inptr);
 824                 inptr = g_utf8_next_char (inptr);
 825         }
 826
 827         *outptr = 0;
 828
 829         return outbuf;
 830 }
 831
 832 static gunichar2 *
 833 eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints,     GError **err)
 834 {
 835         gunichar2 *outbuf, *outptr;
 836         size_t outlen = 0;
 837         size_t inleft;
 838         char *inptr;
 839         gunichar c;
 840         int u, n;
 841
 842         g_return_val_if_fail (str != NULL, NULL);
 843
 844         if (len < 0) {
 845                 if (include_nuls) {
 846                         g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
 847                         return NULL;
 848                 }
 849
 850                 len = strlen (str);
 851         }
 852
 853         inptr = (char *) str;
 854         inleft = len;
 855
 856         while (inleft > 0) {
 857                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 858                         goto error;
 859
 860                 if (c == 0 && !include_nuls)
 861                         break;
 862
 863                 if ((u = g_unichar_to_utf16 (c, NULL)) < 0) {
 864                         if (replace_invalid_codepoints) {
 865                                 u = 2;
 866                         } else {
 867                                 mono_set_errno (EILSEQ);
 868                                 goto error;
 869                         }
 870                 }
 871
 872                 outlen += u;
 873                 inleft -= n;
 874                 inptr += n;
 875         }
 876
 877         if (items_read)
 878                 *items_read = inptr - str;
 879
 880         if (items_written)
 881                 *items_written = outlen;
 882
 883         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
 884         inptr = (char *) str;
 885         inleft = len;
 886
 887         while (inleft > 0) {
 888                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
 889                         break;
 890
 891                 if (c == 0 && !include_nuls)
 892                         break;
 893
 894                 u = g_unichar_to_utf16 (c, outptr);
 895                 if ((u < 0) && replace_invalid_codepoints) {
 896                         outptr[0] = 0xFFFD;
 897                         outptr[1] = 0xFFFD;
 898                         u = 2;
 899                 }
 900
 901                 outptr += u;
 902                 inleft -= n;
 903                 inptr += n;
 904         }
 905
 906         *outptr = '\0';
 907
 908         return outbuf;
 909
 910  error:
 911         if (errno == EILSEQ) {
 912                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 913                              "Illegal byte sequence encounted in the input.");
 914         } else if (items_read) {
 915                 /* partial input is ok if we can let our caller know... */
 916         } else {
 917                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 918                              "Partial byte sequence encountered in the input.");
 919         }
 920
 921         if (items_read)
 922                 *items_read = inptr - str;
 923
 924         if (items_written)
 925                 *items_written = 0;
 926
 927         return NULL;
 928 }
 929
 930 gunichar2 *
 931 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 932 {
 933         return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, err);
 934 }
 935
 936 gunichar2 *
 937 eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 938 {
 939         return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, err);
 940 }
 941
 942 gunichar2 *
 943 eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 944 {
 945         return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, err);
 946 }
 947
 948 gunichar *
 949 g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
 950 {
 951         gunichar *outbuf, *outptr;
 952         size_t outlen = 0;
 953         size_t inleft;
 954         char *inptr;
 955         gunichar c;
 956         int n;
 957
 958         g_return_val_if_fail (str != NULL, NULL);
 959
 960         if (len < 0)
 961                 len = strlen (str);
 962
 963         inptr = (char *) str;
 964         inleft = len;
 965
 966         while (inleft > 0) {
 967                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
 968                         if (errno == EILSEQ) {
 969                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 970                                              "Illegal byte sequence encounted in the input.");
 971                         } else if (items_read) {
 972                                 /* partial input is ok if we can let our caller know... */
 973                                 break;
 974                         } else {
 975                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 976                                              "Partial byte sequence encountered in the input.");
 977                         }
 978
 979                         if (items_read)
 980                                 *items_read = inptr - str;
 981
 982                         if (items_written)
 983                                 *items_written = 0;
 984
 985                         return NULL;
 986                 } else if (c == 0)
 987                         break;
 988
 989                 outlen += 4;
 990                 inleft -= n;
 991                 inptr += n;
 992         }
 993
 994         if (items_written)
 995                 *items_written = outlen / 4;
 996
 997         if (items_read)
 998                 *items_read = inptr - str;
 999
1000         outptr = outbuf = g_malloc (outlen + 4);
1001         inptr = (char *) str;
1002         inleft = len;
1003
1004         while (inleft > 0) {
1005                 if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
1006                         break;
1007                 else if (c == 0)
1008                         break;
1009
1010                 *outptr++ = c;
1011                 inleft -= n;
1012                 inptr += n;
1013         }
1014
1015         *outptr = 0;
1016
1017         return outbuf;
1018 }
1019
1020 gchar *
1021 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1022 {
1023         char *inptr, *outbuf, *outptr;
1024         size_t outlen = 0;
1025         size_t inleft;
1026         gunichar c;
1027         int n;
1028
1029         g_return_val_if_fail (str != NULL, NULL);
1030
1031         if (len < 0) {
1032                 len = 0;
1033                 while (str[len])
1034                         len++;
1035         }
1036
1037         inptr = (char *) str;
1038         inleft = len * 2;
1039
1040         while (inleft > 0) {
1041                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1042                         if (n == -2 && inleft > 2) {
1043                                 /* This means that the first UTF-16 char was read, but second failed */
1044                                 inleft -= 2;
1045                                 inptr += 2;
1046                         }
1047
1048                         if (errno == EILSEQ) {
1049                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1050                                              "Illegal byte sequence encounted in the input.");
1051                         } else if (items_read) {
1052                                 /* partial input is ok if we can let our caller know... */
1053                                 break;
1054                         } else {
1055                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1056                                              "Partial byte sequence encountered in the input.");
1057                         }
1058
1059                         if (items_read)
1060                                 *items_read = (inptr - (char *) str) / 2;
1061
1062                         if (items_written)
1063                                 *items_written = 0;
1064
1065                         return NULL;
1066                 } else if (c == 0)
1067                         break;
1068
1069                 outlen += g_unichar_to_utf8 (c, NULL);
1070                 inleft -= n;
1071                 inptr += n;
1072         }
1073
1074         if (items_read)
1075                 *items_read = (inptr - (char *) str) / 2;
1076
1077         if (items_written)
1078                 *items_written = outlen;
1079
1080         outptr = outbuf = g_malloc (outlen + 1);
1081         inptr = (char *) str;
1082         inleft = len * 2;
1083
1084         while (inleft > 0) {
1085                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1086                         break;
1087                 else if (c == 0)
1088                         break;
1089
1090                 outptr += g_unichar_to_utf8 (c, outptr);
1091                 inleft -= n;
1092                 inptr += n;
1093         }
1094
1095         *outptr = '\0';
1096
1097         return outbuf;
1098 }
1099
1100 gunichar *
1101 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
1102 {
1103         gunichar *outbuf, *outptr;
1104         size_t outlen = 0;
1105         size_t inleft;
1106         char *inptr;
1107         gunichar c;
1108         int n;
1109
1110         g_return_val_if_fail (str != NULL, NULL);
1111
1112         if (len < 0) {
1113                 len = 0;
1114                 while (str[len])
1115                         len++;
1116         }
1117
1118         inptr = (char *) str;
1119         inleft = len * 2;
1120
1121         while (inleft > 0) {
1122                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
1123                         if (n == -2 && inleft > 2) {
1124                                 /* This means that the first UTF-16 char was read, but second failed */
1125                                 inleft -= 2;
1126                                 inptr += 2;
1127                         }
1128
1129                         if (errno == EILSEQ) {
1130                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1131                                              "Illegal byte sequence encounted in the input.");
1132                         } else if (items_read) {
1133                                 /* partial input is ok if we can let our caller know... */
1134                                 break;
1135                         } else {
1136                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1137                                              "Partial byte sequence encountered in the input.");
1138                         }
1139
1140                         if (items_read)
1141                                 *items_read = (inptr - (char *) str) / 2;
1142
1143                         if (items_written)
1144                                 *items_written = 0;
1145
1146                         return NULL;
1147                 } else if (c == 0)
1148                         break;
1149
1150                 outlen += 4;
1151                 inleft -= n;
1152                 inptr += n;
1153         }
1154
1155         if (items_read)
1156                 *items_read = (inptr - (char *) str) / 2;
1157
1158         if (items_written)
1159                 *items_written = outlen / 4;
1160
1161         outptr = outbuf = g_malloc (outlen + 4);
1162         inptr = (char *) str;
1163         inleft = len * 2;
1164
1165         while (inleft > 0) {
1166                 if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
1167                         break;
1168                 else if (c == 0)
1169                         break;
1170
1171                 *outptr++ = c;
1172                 inleft -= n;
1173                 inptr += n;
1174         }
1175
1176         *outptr = 0;
1177
1178         return outbuf;
1179 }
1180
1181 gchar *
1182 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1183 {
1184         char *outbuf, *outptr;
1185         size_t outlen = 0;
1186         glong i;
1187         int n;
1188
1189         g_return_val_if_fail (str != NULL, NULL);
1190
1191         if (len < 0) {
1192                 for (i = 0; str[i] != 0; i++) {
1193                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1194                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1195                                              "Illegal byte sequence encounted in the input.");
1196
1197                                 if (items_written)
1198                                         *items_written = 0;
1199
1200                                 if (items_read)
1201                                         *items_read = i;
1202
1203                                 return NULL;
1204                         }
1205
1206                         outlen += n;
1207                 }
1208         } else {
1209                 for (i = 0; i < len && str[i] != 0; i++) {
1210                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
1211                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1212                                              "Illegal byte sequence encounted in the input.");
1213
1214                                 if (items_written)
1215                                         *items_written = 0;
1216
1217                                 if (items_read)
1218                                         *items_read = i;
1219
1220                                 return NULL;
1221                         }
1222
1223                         outlen += n;
1224                 }
1225         }
1226
1227         len = i;
1228
1229         outptr = outbuf = g_malloc (outlen + 1);
1230         for (i = 0; i < len; i++)
1231                 outptr += g_unichar_to_utf8 (str[i], outptr);
1232         *outptr = 0;
1233
1234         if (items_written)
1235                 *items_written = outlen;
1236
1237         if (items_read)
1238                 *items_read = i;
1239
1240         return outbuf;
1241 }
1242
1243 gunichar2 *
1244 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
1245 {
1246         gunichar2 *outbuf, *outptr;
1247         size_t outlen = 0;
1248         glong i;
1249         int n;
1250
1251         g_return_val_if_fail (str != NULL, NULL);
1252
1253         if (len < 0) {
1254                 for (i = 0; str[i] != 0; i++) {
1255                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1256                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1257                                              "Illegal byte sequence encounted in the input.");
1258
1259                                 if (items_written)
1260                                         *items_written = 0;
1261
1262                                 if (items_read)
1263                                         *items_read = i;
1264
1265                                 return NULL;
1266                         }
1267
1268                         outlen += n;
1269                 }
1270         } else {
1271                 for (i = 0; i < len && str[i] != 0; i++) {
1272                         if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
1273                                 g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1274                                              "Illegal byte sequence encounted in the input.");
1275
1276                                 if (items_written)
1277                                         *items_written = 0;
1278
1279                                 if (items_read)
1280                                         *items_read = i;
1281
1282                                 return NULL;
1283                         }
1284
1285                         outlen += n;
1286                 }
1287         }
1288
1289         len = i;
1290
1291         outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
1292         for (i = 0; i < len; i++)
1293                 outptr += g_unichar_to_utf16 (str[i], outptr);
1294         *outptr = 0;
1295
1296         if (items_written)
1297                 *items_written = outlen;
1298
1299         if (items_read)
1300                 *items_read = i;
1301
1302         return outbuf;
1303 }