eglib/src/gutf8.c

   1 /*
   2  * gutf8.c: UTF-8 conversion
   3  *
   4  * Author:
   5  *   Atsushi Enomoto  <atsushi@ximian.com>
   6  *
   7  * (C) 2006 Novell, Inc.
   8  */
   9
  10 #include <stdio.h>
  11 #include <glib.h>
  12
  13 gpointer error_quark = "ERROR";
  14
  15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
  16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
  17
  18 gpointer
  19 g_convert_error_quark ()
  20 {
  21         return error_quark;
  22 }
  23
  24 static gunichar*
  25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
  26 {
  27         glong i, u16len, u32len;
  28         gunichar2 *u16str;
  29         gunichar *u32str;
  30         gchar *u8str;
  31         GError **err = NULL;
  32
  33         u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
  34         u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
  35         for (i = 0; i < u32len; i++) {
  36                 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
  37         }
  38         g_free (u16str);
  39         u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
  40         u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
  41         g_free (u32str);
  42         g_free (u16str);
  43         return (gunichar*)u8str;
  44 }
  45
  46 gchar*
  47 g_utf8_strup (const gchar *str, gssize len)
  48 {
  49         return (gchar*)utf8_case_conv (str, len, TRUE);
  50 }
  51
  52 gchar*
  53 g_utf8_strdown (const gchar *str, gssize len)
  54 {
  55         return (gchar*)utf8_case_conv (str, len, FALSE);
  56 }
  57
  58 gunichar2*
  59 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
  60 {
  61         /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
  62            but error check is always done at utf8_to_utf16_len() so that
  63            the conversion core below simply resets erroreous bits */
  64         glong utf16_len;
  65         gunichar2 *ret;
  66         guchar ch, mb_size, mb_remain;
  67         guint32 codepoint;
  68         glong in_pos, out_pos;
  69
  70         utf16_len = 0;
  71         mb_size = 0;
  72         mb_remain = 0;
  73         in_pos = 0;
  74         out_pos = 0;
  75
  76         if (error)
  77                 *error = NULL;
  78
  79         if (items_written)
  80                 *items_written = 0;
  81         utf16_len = utf8_to_utf16_len (str, len, items_read, error);
  82         if (error)
  83                 if (*error)
  84                         return NULL;
  85         if (utf16_len < 0)
  86                 return NULL;
  87
  88         ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
  89
  90         for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
  91                 ch = (guchar) str [in_pos];
  92                 if (mb_size == 0) {
  93                         if (ch < 0x80)
  94                                 ret [out_pos++] = ch;
  95                         else if ((ch & 0xE0) == 0xC0) {
  96                                 codepoint = ch & 0x1F;
  97                                 mb_size = 2;
  98                         } else if ((ch & 0xF0) == 0xE0) {
  99                                 codepoint = ch & 0x0F;
 100                                 mb_size = 3;
 101                         } else if ((ch & 0xF8) == 0xF0) {
 102                                 codepoint = ch & 7;
 103                                 mb_size = 4;
 104                         } else if ((ch & 0xFC) == 0xF8) {
 105                                 codepoint = ch & 3;
 106                                 mb_size = 5;
 107                         } else if ((ch & 0xFE) == 0xFC) {
 108                                 codepoint = ch & 3;
 109                                 mb_size = 6;
 110                         } else {
 111                                 /* invalid utf-8 sequence */
 112                                 codepoint = 0;
 113                                 mb_remain = mb_size = 0;
 114                         }
 115                         if (mb_size > 1)
 116                                 mb_remain = mb_size - 1;
 117                 } else {
 118                         if ((ch & 0xC0) == 0x80) {
 119                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 120                                 if (--mb_remain == 0) {
 121                                         /* multi byte character is fully consumed now. */
 122                                         if (codepoint < 0x10000) {
 123                                                 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
 124                                         } else if (codepoint < 0x110000) {
 125                                                 /* surrogate pair */
 126                                                 codepoint -= 0x10000;
 127                                                 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
 128                                                 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
 129                                         } else {
 130                                                 /* invalid utf-8 sequence (excess) */
 131                                                 codepoint = 0;
 132                                                 mb_remain = 0;
 133                                         }
 134                                         mb_size = 0;
 135                                 }
 136                         } else {
 137                                 /* invalid utf-8 sequence */
 138                                 codepoint = 0;
 139                                 mb_remain = mb_size = 0;
 140                         }
 141                 }
 142         }
 143
 144         ret [out_pos] = 0;
 145         if (items_written)
 146                 *items_written = out_pos;
 147         return ret;
 148 }
 149
 150 static glong
 151 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
 152 {
 153         /* It is almost identical to UTF8Encoding.GetCharCount() */
 154         guchar ch, mb_size, mb_remain;
 155         gboolean overlong;
 156         guint32 codepoint;
 157         glong in_pos, ret;
 158
 159         mb_size = 0;
 160         mb_remain = 0;
 161         overlong = 0;
 162         in_pos = 0;
 163         ret = 0;
 164
 165         for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 166                 ch = str [in_pos];
 167                 if (mb_size == 0) {
 168                         if (ch < 0x80)
 169                                 ret++;
 170                         else if ((ch & 0xE0) == 0xC0) {
 171                                 codepoint = ch & 0x1F;
 172                                 mb_size = 2;
 173                         } else if ((ch & 0xF0) == 0xE0) {
 174                                 codepoint = ch & 0x0F;
 175                                 mb_size = 3;
 176                         } else if ((ch & 0xF8) == 0xF0) {
 177                                 codepoint = ch & 7;
 178                                 mb_size = 4;
 179                         } else if ((ch & 0xFC) == 0xF8) {
 180                                 codepoint = ch & 3;
 181                                 mb_size = 5;
 182                         } else if ((ch & 0xFE) == 0xFC) {
 183                                 codepoint = ch & 3;
 184                                 mb_size = 6;
 185                         } else {
 186                                 /* invalid utf-8 sequence */
 187                                 if (error) {
 188                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
 189                                         if (items_read)
 190                                                 *items_read = in_pos;
 191                                         return -1;
 192                                 } else {
 193                                         codepoint = 0;
 194                                         mb_remain = mb_size = 0;
 195                                 }
 196                         }
 197                         if (mb_size > 1)
 198                                 mb_remain = mb_size - 1;
 199                 } else {
 200                         if ((ch & 0xC0) == 0x80) {
 201                                 codepoint = (codepoint << 6) | (ch & 0x3F);
 202                                 if (--mb_remain == 0) {
 203                                         /* multi byte character is fully consumed now. */
 204                                         if (codepoint < 0x10000) {
 205                                                 switch (mb_size) {
 206                                                 case 2:
 207                                                         overlong = codepoint < 0x7F;
 208                                                         break;
 209                                                 case 3:
 210                                                         overlong = codepoint < 0x7FF;
 211                                                         break;
 212                                                 case 4:
 213                                                         overlong = codepoint < 0xFFFF;
 214                                                         break;
 215                                                 case 5:
 216                                                         overlong = codepoint < 0x1FFFFF;
 217                                                         break;
 218                                                 case 6:
 219                                                         overlong = codepoint < 0x03FFFFFF;
 220                                                         break;
 221                                                 }
 222                                                 if (overlong) {
 223                                                         /* invalid utf-8 sequence (overlong) */
 224                                                         if (error) {
 225                                                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
 226                                                                 if (items_read)
 227                                                                         *items_read = in_pos;
 228                                                                 return -1;
 229                                                         } else {
 230                                                                 codepoint = 0;
 231                                                                 mb_remain = 0;
 232                                                                 overlong = FALSE;
 233                                                         }
 234                                                 }
 235                                                 else
 236                                                         ret++;
 237                                         } else if (codepoint < 0x110000) {
 238                                                 /* surrogate pair */
 239                                                 ret += 2;
 240                                         } else {
 241                                                 /* invalid utf-8 sequence (excess) */
 242                                                 if (error) {
 243                                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
 244                                                         if (items_read)
 245                                                                 *items_read = in_pos;
 246                                                         return -1;
 247                                                 } else {
 248                                                         codepoint = 0;
 249                                                         mb_remain = 0;
 250                                                 }
 251                                         }
 252                                         mb_size = 0;
 253                                 }
 254                         } else {
 255                                 /* invalid utf-8 sequence */
 256                                 if (error) {
 257                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
 258                                         if (items_read)
 259                                                 *items_read = in_pos;
 260                                         return -1;
 261                                 } else {
 262                                         codepoint = 0;
 263                                         mb_remain = mb_size = 0;
 264                                 }
 265                         }
 266                 }
 267         }
 268
 269         if (items_read)
 270                 *items_read = in_pos;
 271         return ret;
 272 }
 273
 274 gchar*
 275 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 276 {
 277         /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
 278            but error check is always done at utf16_to_utf8_len() so that
 279            the conversion core below simply resets erroreous bits */
 280         glong utf8_len;
 281         gchar *ret;
 282         glong in_pos, out_pos;
 283         gunichar2 ch;
 284         guint32 codepoint = 0;
 285         gboolean surrogate;
 286
 287         in_pos = 0;
 288         out_pos = 0;
 289         surrogate = FALSE;
 290
 291         if (items_written)
 292                 *items_written = 0;
 293         utf8_len = utf16_to_utf8_len (str, len, items_read, error);
 294         if (error)
 295                 if (*error)
 296                         return NULL;
 297         if (utf8_len < 0)
 298                 return NULL;
 299
 300         ret = g_malloc ((1+utf8_len) * sizeof (gchar));
 301
 302         while (len < 0 ? str [in_pos] : in_pos < len) {
 303                 ch = str [in_pos];
 304                 if (surrogate) {
 305                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 306                                 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
 307                                 surrogate = 0;
 308                         } else {
 309                                 surrogate = 0;
 310                                 /* invalid surrogate pair */
 311                                 continue;
 312                         }
 313                 } else {
 314                         /* fast path optimization */
 315                         if (ch < 0x80) {
 316                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 317                                         if (str [in_pos] < 0x80)
 318                                                 ret [out_pos++] = (gchar)(str [in_pos]);
 319                                         else
 320                                                 break;
 321                                 }
 322                                 continue;
 323                         }
 324                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 325                                 surrogate = ch;
 326                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 327                                 /* invalid surrogate pair */
 328                                 continue;
 329                         }
 330                         else
 331                                 codepoint = ch;
 332                 }
 333                 in_pos++;
 334
 335                 if (surrogate != 0)
 336                         continue;
 337                 if (codepoint < 0x80)
 338                         ret [out_pos++] = (gchar) codepoint;
 339                 else if (codepoint < 0x0800) {
 340                         ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
 341                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 342                 } else if (codepoint < 0x10000) {
 343                         ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
 344                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 345                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 346                 } else {
 347                         ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
 348                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
 349                         ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
 350                         ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
 351                 }
 352         }
 353         ret [out_pos] = 0;
 354
 355         if (items_written)
 356                 *items_written = out_pos;
 357         return ret;
 358 }
 359
 360 static glong
 361 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 362 {
 363         glong ret, in_pos;
 364         gunichar2 ch;
 365         gboolean surrogate;
 366
 367         ret = 0;
 368         in_pos = 0;
 369         surrogate = FALSE;
 370
 371         while (len < 0 ? str [in_pos] : in_pos < len) {
 372                 ch = str [in_pos];
 373                 if (surrogate) {
 374                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 375                                 ret += 4;
 376                         } else {
 377                                 /* invalid surrogate pair */
 378                                 if (error) {
 379                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
 380                                         if (items_read)
 381                                                 *items_read = in_pos;
 382                                         return -1;
 383                                 } /* otherwise just ignore. */
 384                         }
 385                         surrogate = FALSE;
 386                 } else {
 387                         /* fast path optimization */
 388                         if (ch < 0x80) {
 389                                 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
 390                                         if (str [in_pos] < 0x80)
 391                                                 ++ret;
 392                                         else
 393                                                 break;
 394                                 }
 395                                 continue;
 396                         }
 397                         else if (ch < 0x0800)
 398                                 ret += 2;
 399                         else if (ch >= 0xD800 && ch <= 0xDBFF)
 400                                 surrogate = TRUE;
 401                         else if (ch >= 0xDC00 && ch <= 0xDFFF) {
 402                                 /* invalid surrogate pair */
 403                                 if (error) {
 404                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
 405                                         if (items_read)
 406                                                 *items_read = in_pos;
 407                                         return -1;
 408                                 } /* otherwise just ignore. */
 409                         }
 410                         else
 411                                 ret += 3;
 412                 }
 413                 in_pos++;
 414         }
 415
 416         if (items_read)
 417                 *items_read = in_pos;
 418         return ret;
 419 }
 420
 421 static glong
 422 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
 423 {
 424         glong retlen = 0;
 425         glong errindex = 0;
 426         const gunichar *lstr = str;
 427
 428         if (!str)
 429                 return 0;
 430
 431         while (*lstr != '\0' && len--) {
 432                 gunichar ch;
 433                 ch = *lstr++;
 434                 if (ch <= 0x0000FFFF) {
 435                         if (ch >= 0xD800 && ch <= 0xDFFF) {
 436                                 errindex = (glong)(lstr - str)-1;
 437                                 if (error)
 438                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 439                                         "Invalid sequence in conversion input");
 440                                 if (items_read)
 441                                         *items_read = errindex;
 442                                 return 0;
 443                         } else {
 444                                 retlen++;
 445                         }
 446                 } else if (ch > 0x10FFFF) {
 447                         errindex = (glong)(lstr - str)-1;
 448                         if (error)
 449                                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 450                                 "Character out of range for UTF-16");
 451                         if (items_read)
 452                                 *items_read = errindex;
 453                         return 0;
 454
 455                 } else {
 456                         retlen+=2;
 457                 }
 458         }
 459
 460         if (items_read)
 461                 *items_read = (glong)(lstr - str);
 462         return retlen;
 463 }
 464
 465 gunichar2*
 466 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
 467 {
 468         glong allocsz;
 469         gunichar2 *retstr = 0;
 470         gunichar2 *retch = 0;
 471         glong nwritten = 0;
 472         GError *lerror =0 ;
 473
 474         allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
 475
 476         if (!lerror) {
 477                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
 478                 retstr[allocsz] = '\0';
 479
 480                 while (*str != '\0' && len--) {
 481                         gunichar ch;
 482                         ch = *str++;
 483                         if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
 484                                 *retch++ = (gunichar2)ch;
 485                                 nwritten ++;
 486                         } else {
 487                                 ch -= 0x0010000UL;
 488                                 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
 489                                 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
 490                                 nwritten +=2;
 491                         }
 492                 }
 493         }
 494
 495         if (items_written)
 496                 *items_written = nwritten;
 497         if (error)
 498                 *error = lerror;
 499
 500         return retstr;
 501 }
 502
 503 static glong
 504 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
 505 {
 506         glong retlen = 0;
 507         glong errindex = 0;
 508         const gunichar2 *lstr = str;
 509         gunichar2 ch,ch2;
 510
 511         if (!str)
 512                 return 0;
 513
 514         while (*lstr != '\0' && len--) {
 515                 ch = *lstr++;
 516                 if (ch >= 0xD800 && ch <= 0xDBFF) {
 517                         if (!len--) {
 518                                 lstr--;
 519                                 break;
 520                         }
 521                         ch2 = *lstr;
 522                         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 523                                 lstr++;
 524                         } else {
 525                                 errindex = (glong)(lstr - str);
 526                                 if (error)
 527                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 528                                         "Invalid sequence in conversion input");
 529                                 if (items_read)
 530                                         *items_read = errindex;
 531                                 return 0;
 532                         }
 533                 } else {
 534                         if (ch >= 0xDC00 && ch <= 0xDFFF) {
 535                                 errindex = (glong)(lstr - str)-1;
 536                                 if (error)
 537                                         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 538                                         "Invalid sequence in conversion input");
 539                                 if (items_read)
 540                                         *items_read = errindex;
 541                                 return 0;
 542                         }
 543                 }
 544                 retlen++;
 545         }
 546
 547         if (items_read)
 548                 *items_read = (glong)(lstr - str);
 549
 550         return retlen;
 551 }
 552
 553 gunichar*
 554 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
 555 {
 556         glong allocsz;
 557         gunichar *retstr = 0;
 558         gunichar *retch = 0;
 559         glong nwritten = 0;
 560         GError *lerror =0 ;
 561         gunichar ch,ch2;
 562
 563         allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
 564
 565         if (!lerror) {
 566                 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
 567                 retstr[allocsz] = '\0';
 568                 nwritten = allocsz;
 569
 570                 while (*str != '\0' && allocsz--) {
 571                         ch = *str++;
 572                         if (ch >= 0xD800 && ch <= 0xDBFF) {
 573                                 ch2 = *str++;
 574                                 ch = ((ch - (gunichar)0xD800) << 10)
 575                                       + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
 576                         }
 577                         *retch++ = ch;
 578                 }
 579         }
 580
 581         if (items_written)
 582                 *items_written = nwritten;
 583         if (error)
 584                 *error = lerror;
 585
 586         return retstr;
 587 }