eglib/src/gutf8.c

   1 /*
   2  * gutf8.c: UTF-8 conversion
   3  *
   4  * Author:
   5  *   Atsushi Enomoto  <atsushi@ximian.com>
   6  *
   7  * (C) 2006 Novell, Inc.
   8  */
   9
  10 #include <stdio.h>
  11 #include <glib.h>
  12
  13 gpointer error_quark = "ERROR";
  14
  15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
  16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
  17
  18 gpointer
  19 g_convert_error_quark ()
  20 {
  21         return error_quark;
  22 }
  23
  24 static gunichar*
  25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
  26 {
  27         glong i, u16len, u32len;
  28         gunichar2 *u16str;
  29         gunichar *u32str;
  30         gchar *u8str;
  31         GError **err = NULL;
  32
  33         u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
  34         u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
  35         for (i = 0; i < u32len; i++) {
  36                 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
  37         }
  38         g_free (u16str);
  39         u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
  40         u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
  41         g_free (u32str);
  42         g_free (u16str);
  43         return (gunichar*)u8str;
  44 }
  45
  46 gchar*
  47 g_utf8_strup (const gchar *str, gssize len)
  48 {
  49         return (gchar*)utf8_case_conv (str, len, TRUE);
  50 }
  51
  52 gchar*
  53 g_utf8_strdown (const gchar *str, gssize len)
  54 {
  55         return (gchar*)utf8_case_conv (str, len, FALSE);
  56 }
  57
  58 static glong
  59 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
  60 {
  61         /* It is almost identical to UTF8Encoding.GetCharCount() */
  62         guchar ch, mb_size, mb_remain;
  63         gboolean overlong;
  64         guint32 codepoint;
  65         glong in_pos, ret;
  66
  67         if (len < 0)
  68                 len = (glong) strlen (str);
  69
  70         in_pos = 0;
  71         ret = 0;
  72
  73         /* Common case */
  74         for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
  75                 ret ++;
  76
  77         if (in_pos == len) {
  78                 if (items_read)
  79                         *items_read = in_pos;
  80                 return ret;
  81         }
  82
  83         mb_size = 0;
  84         mb_remain = 0;
  85         overlong = 0;
  86
  87         for (; in_pos < len; in_pos++) {
  88                 ch = str [in_pos];
  89                 if (mb_size == 0) {
  90                         if (ch < 0x80)
  91                                 ret++;
  92                         else if ((ch & 0xE0) == 0xC0) {
  93                                 codepoint = ch & 0x1F;
  94                                 mb_size = 2;
  95                         } else if ((ch & 0xF0) == 0xE0) {
  96                                 codepoint = ch & 0x0F;
  97                                 mb_size = 3;
  98                         } else if ((ch & 0xF8) == 0xF0) {
  99                                 codepoint = ch & 7;
 100                                 mb_size = 4;
 101                         } else if ((ch & 0xFC) == 0xF8) {
 102                                 codepoint = ch & 3;
 103                                 mb_size = 5;
 104                         } else if ((ch & 0xFE) == 0xFC) {
 105                                 codepoint = ch & 3;
 106                                 mb_size = 6;
 107                         } else {
 108                                 /* invalid utf-8 sequence */
 109                                 if (error) {
 110                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
 111                                         if (items_read)
 112                                                 *items_read = in_pos;
 113                                         return -1;
 114                                 } else {
 115                                         codepoint = 0;
 116                                         mb_remain = mb_size = 0;
 117                                 }
 118                         }
 119                         if (mb_size > 1)
 120                                 mb_remain = mb_size - 1;
 121                 } else {
 122                         if ((ch & 0xC0) == 0x80) {
 123                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 124                                 if (--mb_remain == 0) {
 125                                         /* multi byte character is fully consumed now. */
 126                                         if (codepoint < 0x10000) {
 127                                                 switch (mb_size) {
 128                                                 case 2:
 129                                                         overlong = codepoint < 0x7F;
 130                                                         break;
 131                                                 case 3:
 132                                                         overlong = codepoint < 0x7FF;
 133                                                         break;
 134                                                 case 4:
 135                                                         overlong = codepoint < 0xFFFF;
 136                                                         break;
 137                                                 case 5:
 138                                                         overlong = codepoint < 0x1FFFFF;
 139                                                         break;
 140                                                 case 6:
 141                                                         overlong = codepoint < 0x03FFFFFF;
 142                                                         break;
 143                                                 }
 144                                                 if (overlong) {
 145                                                         /* invalid utf-8 sequence (overlong) */
 146                                                         if (error) {
 147                                                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
 148                                                                 if (items_read)
 149                                                                         *items_read = in_pos;
 150                                                                 return -1;
 151                                                         } else {
 152                                                                 codepoint = 0;
 153                                                                 mb_remain = 0;
 154                                                                 overlong = FALSE;
 155                                                         }
 156                                                 }
 157                                                 else
 158                                                         ret++;
 159                                         } else if (codepoint < 0x110000) {
 160                                                 /* surrogate pair */
 161                                                 ret += 2;
 162                                         } else {
 163                                                 /* invalid utf-8 sequence (excess) */
 164                                                 if (error) {
 165                                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
 166                                                         if (items_read)
 167                                                                 *items_read = in_pos;
 168                                                         return -1;
 169                                                 } else {
 170                                                         codepoint = 0;
 171                                                         mb_remain = 0;
 172                                                 }
 173                                         }
 174                                         mb_size = 0;
 175                                 }
 176                         } else {
 177                                 /* invalid utf-8 sequence */
 178                                 if (error) {
 179                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
 180                                         if (items_read)
 181                                                 *items_read = in_pos;
 182                                         return -1;
 183                                 } else {
 184                                         codepoint = 0;
 185                                         mb_remain = mb_size = 0;
 186                                 }
 187                         }
 188                 }
 189         }
 190
 191         if (items_read)
 192                 *items_read = in_pos;
 193         return ret;
 194 }
 195
 196 gunichar2*
 197 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
 198 {
 199         /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
 200            but error check is always done at utf8_to_utf16_len() so that
 201            the conversion core below simply resets erroreous bits */
 202         glong utf16_len;
 203         gunichar2 *ret;
 204         guchar ch, mb_size, mb_remain;
 205         guint32 codepoint;
 206         glong in_pos, out_pos;
 207
 208         utf16_len = 0;
 209         mb_size = 0;
 210         mb_remain = 0;
 211         in_pos = 0;
 212         out_pos = 0;
 213
 214         if (error)
 215                 *error = NULL;
 216
 217         if (len < 0)
 218                 len = (glong) strlen (str);
 219
 220         if (items_read)
 221                 *items_read = 0;
 222         if (items_written)
 223                 *items_written = 0;
 224         utf16_len = utf8_to_utf16_len (str, len, items_read, error);
 225         if (error)
 226                 if (*error)
 227                         return NULL;
 228         if (utf16_len < 0)
 229                 return NULL;
 230
 231         ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
 232
 233         /* Common case */
 234         for (in_pos = 0; in_pos < len; in_pos++) {
 235                 ch = (guchar) str [in_pos];
 236
 237                 if (ch >= 0x80)
 238                         break;
 239                 ret [out_pos++] = ch;
 240         }
 241
 242         for (; in_pos < len; in_pos++) {
 243                 ch = (guchar) str [in_pos];
 244                 if (mb_size == 0) {
 245                         if (ch < 0x80)
 246                                 ret [out_pos++] = ch;
 247                         else if ((ch & 0xE0) == 0xC0) {
 248                                 codepoint = ch & 0x1F;
 249                                 mb_size = 2;
 250                         } else if ((ch & 0xF0) == 0xE0) {
 251                                 codepoint = ch & 0x0F;
 252                                 mb_size = 3;
 253                         } else if ((ch & 0xF8) == 0xF0) {
 254                                 codepoint = ch & 7;
 255                                 mb_size = 4;
 256                         } else if ((ch & 0xFC) == 0xF8) {
 257                                 codepoint = ch & 3;
 258                                 mb_size = 5;
 259                         } else if ((ch & 0xFE) == 0xFC) {
 260                                 codepoint = ch & 3;
 261                                 mb_size = 6;
 262                         } else {
 263                                 /* invalid utf-8 sequence */
 264                                 codepoint = 0;
 265                                 mb_remain = mb_size = 0;
 266                         }
 267                         if (mb_size > 1)
 268                                 mb_remain = mb_size - 1;
 269                 } else {
 270                         if ((ch & 0xC0) == 0x80) {
 271                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 272                                 if (--mb_remain == 0) {
 273                                         /* multi byte character is fully consumed now. */
 274                                         if (codepoint < 0x10000) {
 275                                                 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
 276                                         } else if (codepoint < 0x110000) {
 277                                                 /* surrogate pair */
 278                                                 codepoint -= 0x10000;
 279                                                 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
 280                                                 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
 281                                         } else {
 282                                                 /* invalid utf-8 sequence (excess) */
 283                                                 codepoint = 0;
 284                                                 mb_remain = 0;
 285                                         }
 286                                         mb_size = 0;
 287                                 }
 288                         } else {
 289                                 /* invalid utf-8 sequence */
 290                                 codepoint = 0;
 291                                 mb_remain = mb_size = 0;
 292                         }
 293                 }
 294         }
 295
 296         ret [out_pos] = 0;
 297         if (items_written)
 298                 *items_written = out_pos;
 299         return ret;
 300 }
 301
 302 gchar*
 303 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 304 {
 305         /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
 306            but error check is always done at utf16_to_utf8_len() so that
 307            the conversion core below simply resets erroreous bits */
 308         glong utf8_len;
 309         gchar *ret;
 310         glong in_pos, out_pos;
 311         gunichar2 ch;
 312         guint32 codepoint = 0;
 313         gboolean surrogate;
 314
 315         in_pos = 0;
 316         out_pos = 0;
 317         surrogate = FALSE;
 318
 319         if (items_read)
 320                 *items_read = 0;
 321         if (items_written)
 322                 *items_written = 0;
 323         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
 324         if (error)
 325                 if (*error)
 326                         return NULL;
 327         if (utf8_len < 0)
 328                 return NULL;
 329
 330         ret = g_malloc ((1+utf8_len) * sizeof (gchar));
 331
 332         while (len < 0 ? str [in_pos] : in_pos < len) {
 333                 ch = str [in_pos];
 334                 if (surrogate) {
 335                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 336                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
 337                                 surrogate = 0;
 338                         } else {
 339                                 surrogate = 0;
 340                                 /* invalid surrogate pair */
 341                                 ++in_pos;
 342                                 continue;
 343                         }
 344                 } else {
 345                         /* fast path optimization */
 346                         if (ch < 0x80) {
 347                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 348                                         if (str [in_pos] < 0x80)
 349                                                 ret [out_pos++] = (gchar)(str [in_pos]);
 350                                         else
 351                                                 break;
 352                                 }
 353                                 continue;
 354                         }
 355                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 356                                 surrogate = ch;
 357                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 358                                 ++in_pos;
 359                                 /* invalid surrogate pair */
 360                                 continue;
 361                         }
 362                         else
 363                                 codepoint = ch;
 364                 }
 365                 in_pos++;
 366
 367                 if (surrogate != 0)
 368                         continue;
 369                 if (codepoint < 0x80)
 370                         ret [out_pos++] = (gchar) codepoint;
 371                 else if (codepoint < 0x0800) {
 372                         ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
 373                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 374                 } else if (codepoint < 0x10000) {
 375                         ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
 376                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 377                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 378                 } else {
 379                         ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
 380                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
 381                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 382                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 383                 }
 384         }
 385         ret [out_pos] = 0;
 386
 387         if (items_written)
 388                 *items_written = out_pos;
 389         return ret;
 390 }
 391
 392 static glong
 393 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 394 {
 395         glong ret, in_pos;
 396         gunichar2 ch;
 397         gboolean surrogate;
 398
 399         ret = 0;
 400         in_pos = 0;
 401         surrogate = FALSE;
 402
 403         while (len < 0 ? str [in_pos] : in_pos < len) {
 404                 ch = str [in_pos];
 405                 if (surrogate) {
 406                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 407                                 ret += 4;
 408                         } else {
 409                                 /* invalid surrogate pair */
 410                                 if (error) {
 411                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
 412                                         if (items_read)
 413                                                 *items_read = in_pos;
 414                                         return -1;
 415                                 } /* otherwise just ignore. */
 416                         }
 417                         surrogate = FALSE;
 418                 } else {
 419                         /* fast path optimization */
 420                         if (ch < 0x80) {
 421                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 422                                         if (str [in_pos] < 0x80)
 423                                                 ++ret;
 424                                         else
 425                                                 break;
 426                                 }
 427                                 continue;
 428                         }
 429                         else if (ch < 0x0800)
 430                                 ret += 2;
 431                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 432                                 surrogate = TRUE;
 433                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 434                                 /* invalid surrogate pair */
 435                                 if (error) {
 436                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
 437                                         if (items_read)
 438                                                 *items_read = in_pos;
 439                                         return -1;
 440                                 } /* otherwise just ignore. */
 441                         }
 442                         else
 443                                 ret += 3;
 444                 }
 445                 in_pos++;
 446         }
 447
 448         if (items_read)
 449                 *items_read = in_pos;
 450         return ret;
 451 }
 452
 453 gchar *
 454 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
 455 {
 456         gchar *outbuf, *outptr;
 457         glong nwritten = 0;
 458         glong i;
 459         gint n;
 460
 461         if (len == -1) {
 462                 for (i = 0; str[i] != 0; i++) {
 463                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
 464                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 465                                              "Invalid sequence in conversion input");
 466
 467                                 if (items_read)
 468                                         *items_read = i;
 469
 470                                 return NULL;
 471                         }
 472
 473                         nwritten += n;
 474                 }
 475         } else {
 476                 for (i = 0; i < len; i++) {
 477                         if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
 478                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 479                                              "Invalid sequence in conversion input");
 480
 481                                 if (items_read)
 482                                         *items_read = i;
 483
 484                                 return NULL;
 485                         }
 486
 487                         nwritten += n;
 488                 }
 489         }
 490
 491         outptr = outbuf = g_malloc (nwritten + 1);
 492         if (len == -1) {
 493                 for (i = 0; str[i] != 0; i++)
 494                         outptr += g_unichar_to_utf8 (str[i], outptr);
 495         } else {
 496                 for (i = 0; i < len; i++)
 497                         outptr += g_unichar_to_utf8 (str[i], outptr);
 498         }
 499         *outptr = '\0';
 500
 501         if (items_written)
 502                 *items_written = nwritten;
 503
 504         if (items_read != 0)
 505                 *items_read = i;
 506
 507         return outbuf;
 508 }
 509
 510 static glong
 511 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
 512 {
 513         glong retlen = 0;
 514         glong errindex = 0;
 515         const gunichar *lstr = str;
 516
 517         if (!str)
 518                 return 0;
 519
 520         while (*lstr != '\0' && len--) {
 521                 gunichar ch;
 522                 ch = *lstr++;
 523                 if (ch <= 0x0000FFFF) {
 524                         if (ch >= 0xD800 && ch <= 0xDFFF) {
 525                                 errindex = (glong)(lstr - str)-1;
 526                                 if (error)
 527                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 528                                         "Invalid sequence in conversion input");
 529                                 if (items_read)
 530                                         *items_read = errindex;
 531                                 return 0;
 532                         } else {
 533                                 retlen++;
 534                         }
 535                 } else if (ch > 0x10FFFF) {
 536                         errindex = (glong)(lstr - str)-1;
 537                         if (error)
 538                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 539                                 "Character out of range for UTF-16");
 540                         if (items_read)
 541                                 *items_read = errindex;
 542                         return 0;
 543
 544                 } else {
 545                         retlen+=2;
 546                 }
 547         }
 548
 549         if (items_read)
 550                 *items_read = (glong)(lstr - str);
 551         return retlen;
 552 }
 553
 554 gunichar2*
 555 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
 556 {
 557         glong allocsz;
 558         gunichar2 *retstr = 0;
 559         gunichar2 *retch = 0;
 560         glong nwritten = 0;
 561         GError *lerror =0 ;
 562
 563         allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
 564
 565         if (!lerror) {
 566                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
 567                 retstr[allocsz] = '\0';
 568
 569                 while (*str != '\0' && len--) {
 570                         gunichar ch;
 571                         ch = *str++;
 572                         if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
 573                                 *retch++ = (gunichar2)ch;
 574                                 nwritten ++;
 575                         } else {
 576                                 ch -= 0x0010000UL;
 577                                 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
 578                                 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
 579                                 nwritten +=2;
 580                         }
 581                 }
 582         }
 583
 584         if (items_written)
 585                 *items_written = nwritten;
 586         if (error)
 587                 *error = lerror;
 588
 589         return retstr;
 590 }
 591
 592 static glong
 593 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 594 {
 595         glong retlen = 0;
 596         glong errindex = 0;
 597         const gunichar2 *lstr = str;
 598         gunichar2 ch,ch2;
 599
 600         if (!str)
 601                 return 0;
 602
 603         while (*lstr != '\0' && len--) {
 604                 ch = *lstr++;
 605                 if (ch >= 0xD800 && ch <= 0xDBFF) {
 606                         if (!len--) {
 607                                 lstr--;
 608                                 break;
 609                         }
 610                         ch2 = *lstr;
 611                         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 612                                 lstr++;
 613                         } else {
 614                                 errindex = (glong)(lstr - str);
 615                                 if (error)
 616                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 617                                         "Invalid sequence in conversion input");
 618                                 if (items_read)
 619                                         *items_read = errindex;
 620                                 return 0;
 621                         }
 622                 } else {
 623                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 624                                 errindex = (glong)(lstr - str)-1;
 625                                 if (error)
 626                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 627                                         "Invalid sequence in conversion input");
 628                                 if (items_read)
 629                                         *items_read = errindex;
 630                                 return 0;
 631                         }
 632                 }
 633                 retlen++;
 634         }
 635
 636         if (items_read)
 637                 *items_read = (glong)(lstr - str);
 638
 639         return retlen;
 640 }
 641
 642 gunichar*
 643 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 644 {
 645         glong allocsz;
 646         gunichar *retstr = 0;
 647         gunichar *retch = 0;
 648         glong nwritten = 0;
 649         GError *lerror =0 ;
 650         gunichar ch,ch2;
 651
 652         allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
 653
 654         if (!lerror) {
 655                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
 656                 retstr[allocsz] = '\0';
 657                 nwritten = allocsz;
 658
 659                 while (*str != '\0' && allocsz--) {
 660                         ch = *str++;
 661                         if (ch >= 0xD800 && ch <= 0xDBFF) {
 662                                 ch2 = *str++;
 663                                 ch = ((ch - (gunichar)0xD800) << 10)
 664                                       + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
 665                         }
 666                         *retch++ = ch;
 667                 }
 668         }
 669
 670         if (items_written)
 671                 *items_written = nwritten;
 672         if (error)
 673                 *error = lerror;
 674
 675         return retstr;
 676 }
 677
 678 gchar *
 679 g_utf8_offset_to_pointer (const gchar *str, glong offset)
 680 {
 681         if (offset == 0)
 682                 return str;
 683
 684         if (offset > 0) {
 685                 gchar *p = (gchar*)str;
 686                 do {
 687                         p = g_utf8_next_char (p);
 688                         offset --;
 689                 } while (offset > 0);
 690
 691                 return p;
 692         }
 693         else {
 694                 // MOONLIGHT_FIXME
 695                 g_assert_not_reached();
 696         }
 697 }
 698
 699 glong
 700 g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
 701 {
 702         const gchar *inptr, *inend;
 703         glong offset = 0;
 704         glong sign = 1;
 705
 706         if (pos == str)
 707                 return 0;
 708
 709         if (str < pos) {
 710                 inptr = str;
 711                 inend = pos;
 712         } else {
 713                 inptr = pos;
 714                 inend = str;
 715                 sign = -1;
 716         }
 717
 718         do {
 719                 inptr = g_utf8_next_char (inptr);
 720                 offset++;
 721         } while (inptr < inend);
 722
 723         return offset * sign;
 724 }
 725
 726 gunichar*
 727 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
 728 {
 729         gunichar* ucs4;
 730         int ucs4_index;
 731         const char *p;
 732         int mb_size;
 733         gunichar codepoint;
 734
 735         g_return_val_if_fail (str != NULL, NULL);
 736
 737         if (len < 0) {
 738                 /* we need to find the length of str, as len < 0 means it must be 0 terminated */
 739
 740                 len = 0;
 741                 p = str;
 742                 while (*p) {
 743                         len ++;
 744                         p = g_utf8_next_char(p);
 745                 }
 746         }
 747
 748         ucs4 = g_malloc (sizeof(gunichar)*len);
 749         if (items_written)
 750                 *items_written = len;
 751
 752         p = str;
 753         ucs4_index = 0;
 754         while (len) {
 755                 guint8 c = *p++;
 756
 757
 758                 if (c < 0x80) {
 759                         mb_size = 1;
 760                 }
 761                 else if ((c & 0xE0) == 0xC0) {
 762                         c &= 0x1f;
 763
 764                         mb_size = 2;
 765                 }
 766                 else if ((c & 0xF0) == 0xE0) {
 767                         c &= 0x0f;
 768                         mb_size = 3;
 769                 }
 770                 else if ((c & 0xF8) == 0xF0) {
 771                         c &= 0x07;
 772                         mb_size = 4;
 773                 }
 774                 else if ((c & 0xFC) == 0xF8) {
 775                         c &= 0x03;
 776                         mb_size = 5;
 777                 }
 778                 else if ((c & 0xFE) == 0xFC) {
 779                         c &= 0x01;
 780                         mb_size = 6;
 781                 }
 782
 783                 codepoint = c;
 784                 while (--mb_size) {
 785                         codepoint <<= 6 | ((*p)&0x3f);
 786                         p++;
 787                 }
 788
 789                 ucs4[ucs4_index++] = codepoint;
 790                 len --;
 791         }
 792
 793         return ucs4;
 794 }
 795
 796 /**
 797  * from http://home.tiscali.nl/t876506/utf8tbl.html
 798  *
 799  * From Unicode UCS-4 to UTF-8:
 800  * Start with the Unicode number expressed as a decimal number and call this ud.
 801  *
 802  * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
 803  *
 804  * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
 805  *    byte 1 = 192 + (ud div 64)
 806  *    byte 2 = 128 + (ud mod 64)
 807  *
 808  * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
 809  *    byte 1 = 224 + (ud div 4096)
 810  *    byte 2 = 128 + ((ud div 64) mod 64)
 811  *    byte 3 = 128 + (ud mod 64)
 812  *
 813  * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
 814  *    byte 1 = 240 + (ud div 262144)
 815  *    byte 2 = 128 + ((ud div 4096) mod 64)
 816  *    byte 3 = 128 + ((ud div 64) mod 64)
 817  *    byte 4 = 128 + (ud mod 64)
 818  *
 819  * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
 820  *    byte 1 = 248 + (ud div 16777216)
 821  *    byte 2 = 128 + ((ud div 262144) mod 64)
 822  *    byte 3 = 128 + ((ud div 4096) mod 64)
 823  *    byte 4 = 128 + ((ud div 64) mod 64)
 824  *    byte 5 = 128 + (ud mod 64)
 825  *
 826  * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
 827  *    byte 1 = 252 + (ud div 1073741824)
 828  *    byte 2 = 128 + ((ud div 16777216) mod 64)
 829  *    byte 3 = 128 + ((ud div 262144) mod 64)
 830  *    byte 4 = 128 + ((ud div 4096) mod 64)
 831  *    byte 5 = 128 + ((ud div 64) mod 64)
 832  *    byte 6 = 128 + (ud mod 64)
 833  **/
 834 gint
 835 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 836 {
 837         gint len, i;
 838         char base;
 839
 840         if (c < 128UL) {
 841                 base = 0;
 842                 len = 1;
 843         } else if (c < 2048UL) {
 844                 base = 192;
 845                 len = 2;
 846         } else if (c < 65536UL) {
 847                 base = 224;
 848                 len = 3;
 849         } else if (c < 2097152UL) {
 850                 base = 240;
 851                 len = 4;
 852         } else if (c < 67108864UL) {
 853                 base = 248;
 854                 len = 5;
 855         } else if (c < 2147483648UL) {
 856                 base = 252;
 857                 len = 6;
 858         } else
 859                 return -1;
 860
 861         if (outbuf != NULL) {
 862                 for (i = len - 1; i > 0; i--) {
 863                         /* mask off 6 bits worth and add 128 */
 864                         outbuf[i] = 128 + (c & 0x3f);
 865                         c >>= 6;
 866                 }
 867
 868                 /* first character has a different base */
 869                 outbuf[0] = base + (c & 0x3f);
 870         }
 871
 872         return len;
 873 }