nfkc.c

   1 /* nfkc.c       unicode normalization utilities
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of libstringprep.
   5  *
   6  * Libstringprep is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libstringprep is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with libstringprep; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 /* This file contains functions from GLIB including gutf8.c and
  27  * gunidecomp.c, all with the following license.
  28  *
  29  *  Copyright (C) 1999, 2000 Tom Tromey
  30  *  Copyright 2000 Red Hat, Inc.
  31  *
  32  * The Gnome Library is free software; you can redistribute it and/or
  33  * modify it under the terms of the GNU Lesser General Public License as
  34  * published by the Free Software Foundation; either version 2 of the
  35  * License, or (at your option) any later version.
  36  *
  37  * The Gnome Library is distributed in the hope that it will be useful,
  38  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  39  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  40  * Lesser General Public License for more details.
  41  *
  42  * You should have received a copy of the GNU Lesser General Public
  43  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  44  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  45  *   Boston, MA 02111-1307, USA.
  46  */
  47
  48 typedef enum
  49 {
  50   G_NORMALIZE_DEFAULT,
  51   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  52   G_NORMALIZE_DEFAULT_COMPOSE,
  53   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  54   G_NORMALIZE_ALL,
  55   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  56   G_NORMALIZE_ALL_COMPOSE,
  57   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  58 }
  59 GNormalizeMode;
  60
  61 #include "gunidecomp.h"
  62 #include "gunicomp.h"
  63
  64 #include <stdlib.h>
  65
  66 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  67   if (Char < 128)                                                             \
  68     {                                                                         \
  69       Len = 1;                                                                \
  70       Mask = 0x7f;                                                            \
  71     }                                                                         \
  72   else if ((Char & 0xe0) == 0xc0)                                             \
  73     {                                                                         \
  74       Len = 2;                                                                \
  75       Mask = 0x1f;                                                            \
  76     }                                                                         \
  77   else if ((Char & 0xf0) == 0xe0)                                             \
  78     {                                                                         \
  79       Len = 3;                                                                \
  80       Mask = 0x0f;                                                            \
  81     }                                                                         \
  82   else if ((Char & 0xf8) == 0xf0)                                             \
  83     {                                                                         \
  84       Len = 4;                                                                \
  85       Mask = 0x07;                                                            \
  86     }                                                                         \
  87   else if ((Char & 0xfc) == 0xf8)                                             \
  88     {                                                                         \
  89       Len = 5;                                                                \
  90       Mask = 0x03;                                                            \
  91     }                                                                         \
  92   else if ((Char & 0xfe) == 0xfc)                                             \
  93     {                                                                         \
  94       Len = 6;                                                                \
  95       Mask = 0x01;                                                            \
  96     }                                                                         \
  97   else                                                                        \
  98     Len = -1;
  99
 100 #define UTF8_LENGTH(Char)              \
 101   ((Char) < 0x80 ? 1 :                 \
 102    ((Char) < 0x800 ? 2 :               \
 103     ((Char) < 0x10000 ? 3 :            \
 104      ((Char) < 0x200000 ? 4 :          \
 105       ((Char) < 0x4000000 ? 5 : 6)))))
 106
 107
 108 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
 109   (Result) = (Chars)[0] & (Mask);                                             \
 110   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
 111     {                                                                         \
 112       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
 113         {                                                                     \
 114           (Result) = -1;                                                      \
 115           break;                                                              \
 116         }                                                                     \
 117       (Result) <<= 6;                                                         \
 118       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
 119     }
 120
 121 #define UNICODE_VALID(Char)                   \
 122     ((Char) < 0x110000 &&                     \
 123      ((Char) < 0xD800 || (Char) >= 0xE000) && \
 124      (Char) != 0xFFFE && (Char) != 0xFFFF)
 125
 126 static const char utf8_skip_data[256] = {
 127   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 128   1, 1, 1, 1, 1, 1, 1,
 129   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 130   1, 1, 1, 1, 1, 1, 1,
 131   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 132   1, 1, 1, 1, 1, 1, 1,
 133   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 134   1, 1, 1, 1, 1, 1, 1,
 135   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 136   1, 1, 1, 1, 1, 1, 1,
 137   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 138   1, 1, 1, 1, 1, 1, 1,
 139   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 140   2, 2, 2, 2, 2, 2, 2,
 141   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 142   5, 5, 5, 6, 6, 1, 1
 143 };
 144 static const char *const g_utf8_skip = utf8_skip_data;
 145
 146 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
 147
 148 /**
 149  * g_utf8_get_char:
 150  * @p: a pointer to Unicode character encoded as UTF-8
 151  *
 152  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 153  * If @p does not point to a valid UTF-8 encoded character, results are
 154  * undefined. If you are not sure that the bytes are complete
 155  * valid Unicode characters, you should use g_utf8_get_char_validated()
 156  * instead.
 157  *
 158  * Return value: the resulting character
 159  **/
 160 static long
 161 g_utf8_get_char (const char * p)
 162 {
 163   int i, mask = 0, len;
 164   long result;
 165   unsigned char c = (unsigned char) *p;
 166
 167   UTF8_COMPUTE (c, mask, len);
 168   if (len == -1)
 169     return (long) - 1;
 170   UTF8_GET (result, p, i, mask, len);
 171
 172   return result;
 173 }
 174
 175 #define CC(Page, Char) \
 176   ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 177    ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 178    : (cclass_data[combining_class_table[Page]][Char]))
 179
 180 #define COMBINING_CLASS(Char) \
 181      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
 182
 183 /**
 184  * g_unicode_canonical_ordering:
 185  * @string: a UCS-4 encoded string.
 186  * @len: the maximum length of @string to use.
 187  *
 188  * Computes the canonical ordering of a string in-place.
 189  * This rearranges decomposed characters in the string
 190  * according to their combining classes.  See the Unicode
 191  * manual for more information.
 192  **/
 193 static void
 194 g_unicode_canonical_ordering (long * string, size_t len)
 195 {
 196   size_t i;
 197   int swap = 1;
 198
 199   while (swap)
 200     {
 201       int last;
 202       swap = 0;
 203       last = COMBINING_CLASS (string[0]);
 204       for (i = 0; i < len - 1; ++i)
 205         {
 206           int next = COMBINING_CLASS (string[i + 1]);
 207           if (next != 0 && last > next)
 208             {
 209               size_t j;
 210               /* Percolate item leftward through string.  */
 211               for (j = i; j > 0; --j)
 212                 {
 213                   long t;
 214                   if (COMBINING_CLASS (string[j]) <= next)
 215                     break;
 216                   t = string[j + 1];
 217                   string[j + 1] = string[j];
 218                   string[j] = t;
 219                   swap = 1;
 220                 }
 221               /* We're re-entering the loop looking at the old
 222                  character again.  */
 223               next = last;
 224             }
 225           last = next;
 226         }
 227     }
 228 }
 229
 230 static const unsigned char *
 231 find_decomposition (long ch, int compat)
 232 {
 233   int start = 0;
 234   int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
 235
 236   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 237     {
 238       while (1)
 239         {
 240           int half = (start + end) / 2;
 241           if (ch == decomp_table[half].ch)
 242             {
 243               int offset;
 244
 245               if (compat)
 246                 {
 247                   offset = decomp_table[half].compat_offset;
 248                   if (offset == 0xff)
 249                     offset = decomp_table[half].canon_offset;
 250                 }
 251               else
 252                 {
 253                   offset = decomp_table[half].canon_offset;
 254                   if (offset == 0xff)
 255                     return NULL;
 256                 }
 257
 258               return
 259                 &(decomp_expansion_string
 260                   [decomp_table[half].expansion_offset + offset]);
 261             }
 262           else if (half == start)
 263             break;
 264           else if (ch > decomp_table[half].ch)
 265             start = half;
 266           else
 267             end = half;
 268         }
 269     }
 270
 271   return NULL;
 272 }
 273
 274 #define CI(Page, Char) \
 275   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 276    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 277    : (compose_data[compose_table[Page]][Char]))
 278
 279 #define COMPOSE_INDEX(Char) \
 280      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 281
 282 static int
 283 combine (long a, long b, long * result)
 284 {
 285   int index_a, index_b;
 286
 287   index_a = COMPOSE_INDEX (a);
 288   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 289     {
 290       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 291         {
 292           *result =
 293             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 294           return 1;
 295         }
 296       else
 297         return 0;
 298     }
 299
 300   index_b = COMPOSE_INDEX (b);
 301   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 302     {
 303       if (a ==
 304           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 305         {
 306           *result =
 307             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 308           return 1;
 309         }
 310       else
 311         return 0;
 312     }
 313
 314   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 315       && index_b >= COMPOSE_SECOND_START
 316       && index_a < COMPOSE_SECOND_SINGLE_START)
 317     {
 318       long res =
 319         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 320                                                      COMPOSE_SECOND_START];
 321
 322       if (res)
 323         {
 324           *result = res;
 325           return 1;
 326         }
 327     }
 328
 329   return 0;
 330 }
 331
 332 static long *
 333 _g_utf8_normalize_wc (const char * str, int max_len, GNormalizeMode mode)
 334 {
 335   size_t n_wc;
 336   long *wc_buffer;
 337   const char *p;
 338   size_t last_start;
 339   int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
 340   int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
 341
 342   n_wc = 0;
 343   p = str;
 344   while ((max_len < 0 || p < str + max_len) && *p)
 345     {
 346       long wc = g_utf8_get_char (p);
 347
 348       const unsigned char *decomp = find_decomposition (wc, do_compat);
 349
 350       if (decomp)
 351         {
 352           int len;
 353           /* We store as a double-nul terminated string.  */
 354           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 355             ;
 356           n_wc += len / 2;
 357         }
 358       else
 359         n_wc++;
 360
 361       p = g_utf8_next_char (p);
 362     }
 363
 364   wc_buffer = malloc (sizeof(long) * (n_wc + 1));
 365
 366   last_start = 0;
 367   n_wc = 0;
 368   p = str;
 369   while ((max_len < 0 || p < str + max_len) && *p)
 370     {
 371       long wc = g_utf8_get_char (p);
 372       const unsigned char *decomp;
 373       int cc;
 374       size_t old_n_wc = n_wc;
 375
 376       decomp = find_decomposition (wc, do_compat);
 377
 378       if (decomp)
 379         {
 380           int len;
 381           /* We store as a double-nul terminated string.  */
 382           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 383             wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
 384         }
 385       else
 386         wc_buffer[n_wc++] = wc;
 387
 388       if (n_wc > 0)
 389         {
 390           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 391
 392           if (cc == 0)
 393             {
 394               g_unicode_canonical_ordering (wc_buffer + last_start,
 395                                             n_wc - last_start);
 396               last_start = old_n_wc;
 397             }
 398         }
 399
 400       p = g_utf8_next_char (p);
 401     }
 402
 403   if (n_wc > 0)
 404     {
 405       g_unicode_canonical_ordering (wc_buffer + last_start,
 406                                     n_wc - last_start);
 407       last_start = n_wc;
 408     }
 409
 410   wc_buffer[n_wc] = 0;
 411
 412   /* All decomposed and reordered */
 413
 414
 415   if (do_compose && n_wc > 0)
 416     {
 417       size_t i, j;
 418       int last_cc = 0;
 419       last_start = 0;
 420
 421       for (i = 0; i < n_wc; i++)
 422         {
 423           int cc = COMBINING_CLASS (wc_buffer[i]);
 424
 425           if (i > 0 &&
 426               (last_cc == 0 || last_cc != cc) &&
 427               combine (wc_buffer[last_start], wc_buffer[i],
 428                        &wc_buffer[last_start]))
 429             {
 430               for (j = i + 1; j < n_wc; j++)
 431                 wc_buffer[j - 1] = wc_buffer[j];
 432               n_wc--;
 433               i--;
 434
 435               if (i == last_start)
 436                 last_cc = 0;
 437               else
 438                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
 439
 440               continue;
 441             }
 442
 443           if (cc == 0)
 444             last_start = i;
 445
 446           last_cc = cc;
 447         }
 448     }
 449
 450   wc_buffer[n_wc] = 0;
 451
 452   return wc_buffer;
 453 }
 454
 455 /**
 456  * g_unichar_to_utf8:
 457  * @c: a ISO10646 character code
 458  * @outbuf: output buffer, must have at least 6 bytes of space.
 459  *       If %NULL, the length will be computed and returned
 460  *       and nothing will be written to @outbuf.
 461  *
 462  * Converts a single character to UTF-8.
 463  *
 464  * Return value: number of bytes written
 465  **/
 466 int
 467 stringprep_unichar_to_utf8 (long c, char * outbuf)
 468 {
 469   int len = 0;
 470   int first;
 471   int i;
 472
 473   if (c < 0x80)
 474     {
 475       first = 0;
 476       len = 1;
 477     }
 478   else if (c < 0x800)
 479     {
 480       first = 0xc0;
 481       len = 2;
 482     }
 483   else if (c < 0x10000)
 484     {
 485       first = 0xe0;
 486       len = 3;
 487     }
 488   else if (c < 0x200000)
 489     {
 490       first = 0xf0;
 491       len = 4;
 492     }
 493   else if (c < 0x4000000)
 494     {
 495       first = 0xf8;
 496       len = 5;
 497     }
 498   else
 499     {
 500       first = 0xfc;
 501       len = 6;
 502     }
 503
 504   if (outbuf)
 505     {
 506       for (i = len - 1; i > 0; --i)
 507         {
 508           outbuf[i] = (c & 0x3f) | 0x80;
 509           c >>= 6;
 510         }
 511       outbuf[0] = c | first;
 512     }
 513
 514   return len;
 515 }
 516
 517 /**
 518  * stringgprep_utf8_to_ucs4_fast:
 519  * @str: a UTF-8 encoded string
 520  * @len: the maximum length of @str to use. If @len < 0, then
 521  *       the string is nul-terminated.
 522  * @items_written: location to store the number of characters in the
 523  *                 result, or %NULL.
 524  *
 525  * Convert a string from UTF-8 to a 32-bit fixed width
 526  * representation as UCS-4, assuming valid UTF-8 input.
 527  * This function is roughly twice as fast as g_utf8_to_ucs4()
 528  * but does no error checking on the input.
 529  *
 530  * Return value: a pointer to a newly allocated UCS-4 string.
 531  *               This value must be freed with g_free().
 532  **/
 533 long *
 534 stringprep_utf8_to_ucs4_fast (const char *str,
 535                               int  len,
 536                               int *items_written)
 537 {
 538   int j, charlen;
 539   long *result;
 540   int n_chars, i;
 541   const char *p;
 542
 543   p = str;
 544   n_chars = 0;
 545   if (len < 0)
 546     {
 547       while (*p)
 548         {
 549           p = g_utf8_next_char (p);
 550           ++n_chars;
 551         }
 552     }
 553   else
 554     {
 555       while (p < str + len && *p)
 556         {
 557           p = g_utf8_next_char (p);
 558           ++n_chars;
 559         }
 560     }
 561
 562   result = malloc(sizeof(long) * (n_chars + 1));
 563
 564   p = str;
 565   for (i=0; i < n_chars; i++)
 566     {
 567       long wc = ((unsigned char *)p)[0];
 568
 569       if (wc < 0x80)
 570         {
 571           result[i] = wc;
 572           p++;
 573         }
 574       else
 575         {
 576           if (wc < 0xe0)
 577             {
 578               charlen = 2;
 579               wc &= 0x1f;
 580             }
 581           else if (wc < 0xf0)
 582             {
 583               charlen = 3;
 584               wc &= 0x0f;
 585             }
 586           else if (wc < 0xf8)
 587             {
 588               charlen = 4;
 589               wc &= 0x07;
 590             }
 591           else if (wc < 0xfc)
 592             {
 593               charlen = 5;
 594               wc &= 0x03;
 595             }
 596           else
 597             {
 598               charlen = 6;
 599               wc &= 0x01;
 600             }
 601
 602           for (j = 1; j < charlen; j++)
 603             {
 604               wc <<= 6;
 605               wc |= ((unsigned char *)p)[j] & 0x3f;
 606             }
 607
 608           result[i] = wc;
 609           p += charlen;
 610         }
 611     }
 612   result[i] = 0;
 613
 614   if (items_written)
 615     *items_written = i;
 616
 617   return result;
 618 }
 619
 620 /**
 621  * g_ucs4_to_utf8:
 622  * @str: a UCS-4 encoded string
 623  * @len: the maximum length of @str to use. If @len < 0, then
 624  *       the string is terminated with a 0 character.
 625  * @items_read: location to store number of characters read read, or %NULL.
 626  * @items_written: location to store number of bytes written or %NULL.
 627  *                 The value here stored does not include the trailing 0
 628  *                 byte.
 629  * @error: location to store the error occuring, or %NULL to ignore
 630  *         errors. Any of the errors in #GConvertError other than
 631  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 632  *
 633  * Convert a string from a 32-bit fixed width representation as UCS-4.
 634  * to UTF-8. The result will be terminated with a 0 byte.
 635  *
 636  * Return value: a pointer to a newly allocated UTF-8 string.
 637  *               This value must be freed with g_free(). If an
 638  *               error occurs, %NULL will be returned and
 639  *               @error set.
 640  **/
 641 char *
 642 stringprep_ucs4_to_utf8 (const long * str,
 643                          int len, int * items_read, int * items_written)
 644 {
 645   int result_length;
 646   char *result = NULL;
 647   char *p;
 648   int i;
 649
 650   result_length = 0;
 651   for (i = 0; len < 0 || i < len; i++)
 652     {
 653       if (!str[i])
 654         break;
 655
 656       if (str[i] >= 0x80000000)
 657         {
 658           if (items_read)
 659             *items_read = i;
 660
 661           goto err_out;
 662         }
 663
 664       result_length += UTF8_LENGTH (str[i]);
 665     }
 666
 667   result = malloc (result_length + 1);
 668   p = result;
 669
 670   i = 0;
 671   while (p < result + result_length)
 672     p += stringprep_unichar_to_utf8 (str[i++], p);
 673
 674   *p = '\0';
 675
 676   if (items_written)
 677     *items_written = p - result;
 678
 679 err_out:
 680   if (items_read)
 681     *items_read = i;
 682
 683   return result;
 684 }
 685
 686 /**
 687  * g_utf8_normalize:
 688  * @str: a UTF-8 encoded string.
 689  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 690  * @mode: the type of normalization to perform.
 691  *
 692  * Converts a string into canonical form, standardizing
 693  * such issues as whether a character with an accent
 694  * is represented as a base character and combining
 695  * accent or as a single precomposed character. You
 696  * should generally call g_utf8_normalize() before
 697  * comparing two Unicode strings.
 698  *
 699  * The normalization mode %G_NORMALIZE_DEFAULT only
 700  * standardizes differences that do not affect the
 701  * text content, such as the above-mentioned accent
 702  * representation. %G_NORMALIZE_ALL also standardizes
 703  * the "compatibility" characters in Unicode, such
 704  * as SUPERSCRIPT THREE to the standard forms
 705  * (in this case DIGIT THREE). Formatting information
 706  * may be lost but for most text operations such
 707  * characters should be considered the same.
 708  * For example, g_utf8_collate() normalizes
 709  * with %G_NORMALIZE_ALL as its first step.
 710  *
 711  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 712  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 713  * but returned a result with composed forms rather
 714  * than a maximally decomposed form. This is often
 715  * useful if you intend to convert the string to
 716  * a legacy encoding or pass it to a system with
 717  * less capable Unicode handling.
 718  *
 719  * Return value: a newly allocated string, that is the
 720  *   normalized form of @str.
 721  **/
 722 static char *
 723 g_utf8_normalize (const char * str, int len, GNormalizeMode mode)
 724 {
 725   long *result_wc = _g_utf8_normalize_wc (str, len, mode);
 726   char *result;
 727
 728   result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
 729   free (result_wc);
 730
 731   return result;
 732 }
 733
 734 char *
 735 stringprep_utf8_nfkc_normalize (const char *str, int len)
 736 {
 737   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
 738 }
 739
 740 long *
 741 stringprep_ucs4_nfkc_normalize (long *str, int len)
 742 {
 743   char *p;
 744   long *result_wc;
 745
 746   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
 747   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
 748   free(p);
 749
 750   return result_wc;
 751 }
 752