nfkc.c

   1 /* nfkc.c       Unicode normalization utilities.
   2  * Copyright (C) 2002, 2003  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include "internal.h"
  23
  24 /* This file contains functions from GLIB including gutf8.c and
  25  * gunidecomp.c, all with the following license.
  26  *
  27  *  Copyright (C) 1999, 2000 Tom Tromey
  28  *  Copyright 2000 Red Hat, Inc.
  29  *
  30  * The Gnome Library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Lesser General Public License as
  32  * published by the Free Software Foundation; either version 2 of the
  33  * License, or (at your option) any later version.
  34  *
  35  * The Gnome Library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38  * Lesser General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Lesser General Public
  41  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  42  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  43  *   Boston, MA 02111-1307, USA.
  44  */
  45
  46 typedef enum
  47 {
  48   G_NORMALIZE_DEFAULT,
  49   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  50   G_NORMALIZE_DEFAULT_COMPOSE,
  51   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  52   G_NORMALIZE_ALL,
  53   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  54   G_NORMALIZE_ALL_COMPOSE,
  55   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  56 }
  57 GNormalizeMode;
  58
  59 #include "gunidecomp.h"
  60 #include "gunicomp.h"
  61
  62 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  63   if (Char < 128)                                                             \
  64     {                                                                         \
  65       Len = 1;                                                                \
  66       Mask = 0x7f;                                                            \
  67     }                                                                         \
  68   else if ((Char & 0xe0) == 0xc0)                                             \
  69     {                                                                         \
  70       Len = 2;                                                                \
  71       Mask = 0x1f;                                                            \
  72     }                                                                         \
  73   else if ((Char & 0xf0) == 0xe0)                                             \
  74     {                                                                         \
  75       Len = 3;                                                                \
  76       Mask = 0x0f;                                                            \
  77     }                                                                         \
  78   else if ((Char & 0xf8) == 0xf0)                                             \
  79     {                                                                         \
  80       Len = 4;                                                                \
  81       Mask = 0x07;                                                            \
  82     }                                                                         \
  83   else if ((Char & 0xfc) == 0xf8)                                             \
  84     {                                                                         \
  85       Len = 5;                                                                \
  86       Mask = 0x03;                                                            \
  87     }                                                                         \
  88   else if ((Char & 0xfe) == 0xfc)                                             \
  89     {                                                                         \
  90       Len = 6;                                                                \
  91       Mask = 0x01;                                                            \
  92     }                                                                         \
  93   else                                                                        \
  94     Len = -1;
  95
  96 #define UTF8_LENGTH(Char)              \
  97   ((Char) < 0x80 ? 1 :                 \
  98    ((Char) < 0x800 ? 2 :               \
  99     ((Char) < 0x10000 ? 3 :            \
 100      ((Char) < 0x200000 ? 4 :          \
 101       ((Char) < 0x4000000 ? 5 : 6)))))
 102
 103
 104 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
 105   (Result) = (Chars)[0] & (Mask);                                             \
 106   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
 107     {                                                                         \
 108       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
 109         {                                                                     \
 110           (Result) = -1;                                                      \
 111           break;                                                              \
 112         }                                                                     \
 113       (Result) <<= 6;                                                         \
 114       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
 115     }
 116
 117 #define UNICODE_VALID(Char)                   \
 118     ((Char) < 0x110000 &&                     \
 119      ((Char) < 0xD800 || (Char) >= 0xE000) && \
 120      (Char) != 0xFFFE && (Char) != 0xFFFF)
 121
 122 static const char utf8_skip_data[256] = {
 123   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 124   1, 1, 1, 1, 1, 1, 1,
 125   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 126   1, 1, 1, 1, 1, 1, 1,
 127   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 128   1, 1, 1, 1, 1, 1, 1,
 129   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 130   1, 1, 1, 1, 1, 1, 1,
 131   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 132   1, 1, 1, 1, 1, 1, 1,
 133   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 134   1, 1, 1, 1, 1, 1, 1,
 135   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 136   2, 2, 2, 2, 2, 2, 2,
 137   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 138   5, 5, 5, 6, 6, 1, 1
 139 };
 140 static const char *const g_utf8_skip = utf8_skip_data;
 141
 142 #define g_utf8_next_char(p) (const char *)((p) + g_utf8_skip[*(const unsigned char *)(p)])
 143
 144 /**
 145  * stringprep_utf8_to_unichar:
 146  * @p: a pointer to Unicode character encoded as UTF-8
 147  *
 148  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 149  * If @p does not point to a valid UTF-8 encoded character, results are
 150  * undefined.
 151  *
 152  * Return value: the resulting character
 153  **/
 154 unsigned long
 155 stringprep_utf8_to_unichar (const char *p)
 156 {
 157   int i, mask = 0, len;
 158   unsigned long result;
 159   unsigned char c = (unsigned char) *p;
 160
 161   UTF8_COMPUTE (c, mask, len);
 162   if (len == -1)
 163     return (unsigned long) -1;
 164   UTF8_GET (result, p, i, mask, len);
 165
 166   return result;
 167 }
 168
 169 #define CC(Page, Char) \
 170   ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 171    ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 172    : (cclass_data[combining_class_table[Page]][Char]))
 173
 174 #define COMBINING_CLASS(Char) \
 175      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
 176
 177 /*
 178  * g_unicode_canonical_ordering:
 179  * @string: a UCS-4 encoded string.
 180  * @len: the maximum length of @string to use.
 181  *
 182  * Computes the canonical ordering of a string in-place.
 183  * This rearranges decomposed characters in the string
 184  * according to their combining classes.  See the Unicode
 185  * manual for more information.
 186  **/
 187 static void
 188 g_unicode_canonical_ordering (unsigned long *string, size_t len)
 189 {
 190   size_t i;
 191   int swap = 1;
 192
 193   while (swap)
 194     {
 195       int last;
 196       swap = 0;
 197       last = COMBINING_CLASS (string[0]);
 198       for (i = 0; i < len - 1; ++i)
 199         {
 200           int next = COMBINING_CLASS (string[i + 1]);
 201           if (next != 0 && last > next)
 202             {
 203               size_t j;
 204               /* Percolate item leftward through string.  */
 205               for (j = i; j > 0; --j)
 206                 {
 207                   unsigned long t;
 208                   if (COMBINING_CLASS (string[j]) <= next)
 209                     break;
 210                   t = string[j + 1];
 211                   string[j + 1] = string[j];
 212                   string[j] = t;
 213                   swap = 1;
 214                 }
 215               /* We're re-entering the loop looking at the old
 216                  character again.  */
 217               next = last;
 218             }
 219           last = next;
 220         }
 221     }
 222 }
 223
 224 static const unsigned char *
 225 find_decomposition (unsigned long ch, int compat)
 226 {
 227   int start = 0;
 228   int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
 229
 230   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 231     {
 232       while (1)
 233         {
 234           int half = (start + end) / 2;
 235           if (ch == decomp_table[half].ch)
 236             {
 237               int offset;
 238
 239               if (compat)
 240                 {
 241                   offset = decomp_table[half].compat_offset;
 242                   if (offset == 0xff)
 243                     offset = decomp_table[half].canon_offset;
 244                 }
 245               else
 246                 {
 247                   offset = decomp_table[half].canon_offset;
 248                   if (offset == 0xff)
 249                     return NULL;
 250                 }
 251
 252               return
 253                 &(decomp_expansion_string
 254                   [decomp_table[half].expansion_offset + offset]);
 255             }
 256           else if (half == start)
 257             break;
 258           else if (ch > decomp_table[half].ch)
 259             start = half;
 260           else
 261             end = half;
 262         }
 263     }
 264
 265   return NULL;
 266 }
 267
 268 #define CI(Page, Char) \
 269   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 270    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 271    : (compose_data[compose_table[Page]][Char]))
 272
 273 #define COMPOSE_INDEX(Char) \
 274      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 275
 276 static int
 277 combine (unsigned long a, unsigned long b, unsigned long *result)
 278 {
 279   int index_a, index_b;
 280
 281   index_a = COMPOSE_INDEX (a);
 282   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 283     {
 284       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 285         {
 286           *result =
 287             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 288           return 1;
 289         }
 290       else
 291         return 0;
 292     }
 293
 294   index_b = COMPOSE_INDEX (b);
 295   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 296     {
 297       if (a ==
 298           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 299         {
 300           *result =
 301             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 302           return 1;
 303         }
 304       else
 305         return 0;
 306     }
 307
 308   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 309       && index_b >= COMPOSE_SECOND_START
 310       && index_a < COMPOSE_SECOND_SINGLE_START)
 311     {
 312       unsigned long res =
 313         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 314                                                      COMPOSE_SECOND_START];
 315
 316       if (res)
 317         {
 318           *result = res;
 319           return 1;
 320         }
 321     }
 322
 323   return 0;
 324 }
 325
 326 static unsigned long *
 327 _g_utf8_normalize_wc (const char *str, ssize_t max_len, GNormalizeMode mode)
 328 {
 329   size_t n_wc;
 330   unsigned long *wc_buffer;
 331   const char *p;
 332   size_t last_start;
 333   int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
 334   int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
 335
 336   n_wc = 0;
 337   p = str;
 338   while ((max_len < 0 || p < str + max_len) && *p)
 339     {
 340       unsigned long wc = stringprep_utf8_to_unichar (p);
 341
 342       const unsigned char *decomp = find_decomposition (wc, do_compat);
 343
 344       if (decomp)
 345         {
 346           int len;
 347           /* We store as a double-nul terminated string.  */
 348           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 349             ;
 350           n_wc += len / 2;
 351         }
 352       else
 353         n_wc++;
 354
 355       p = g_utf8_next_char (p);
 356     }
 357
 358   wc_buffer = malloc (sizeof (unsigned long) * (n_wc + 1));
 359
 360   last_start = 0;
 361   n_wc = 0;
 362   p = str;
 363   while ((max_len < 0 || p < str + max_len) && *p)
 364     {
 365       unsigned long wc = stringprep_utf8_to_unichar (p);
 366       const unsigned char *decomp;
 367       int cc;
 368       size_t old_n_wc = n_wc;
 369
 370       decomp = find_decomposition (wc, do_compat);
 371
 372       if (decomp)
 373         {
 374           int len;
 375           /* We store as a double-nul terminated string.  */
 376           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 377             wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
 378         }
 379       else
 380         wc_buffer[n_wc++] = wc;
 381
 382       if (n_wc > 0)
 383         {
 384           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 385
 386           if (cc == 0)
 387             {
 388               g_unicode_canonical_ordering (wc_buffer + last_start,
 389                                             n_wc - last_start);
 390               last_start = old_n_wc;
 391             }
 392         }
 393
 394       p = g_utf8_next_char (p);
 395     }
 396
 397   if (n_wc > 0)
 398     {
 399       g_unicode_canonical_ordering (wc_buffer + last_start,
 400                                     n_wc - last_start);
 401       last_start = n_wc;
 402     }
 403
 404   wc_buffer[n_wc] = 0;
 405
 406   /* All decomposed and reordered */
 407
 408
 409   if (do_compose && n_wc > 0)
 410     {
 411       size_t i, j;
 412       int last_cc = 0;
 413       last_start = 0;
 414
 415       for (i = 0; i < n_wc; i++)
 416         {
 417           int cc = COMBINING_CLASS (wc_buffer[i]);
 418
 419           if (i > 0 &&
 420               (last_cc == 0 || last_cc != cc) &&
 421               combine (wc_buffer[last_start], wc_buffer[i],
 422                        &wc_buffer[last_start]))
 423             {
 424               for (j = i + 1; j < n_wc; j++)
 425                 wc_buffer[j - 1] = wc_buffer[j];
 426               n_wc--;
 427               i--;
 428
 429               if (i == last_start)
 430                 last_cc = 0;
 431               else
 432                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
 433
 434               continue;
 435             }
 436
 437           if (cc == 0)
 438             last_start = i;
 439
 440           last_cc = cc;
 441         }
 442     }
 443
 444   wc_buffer[n_wc] = 0;
 445
 446   return wc_buffer;
 447 }
 448
 449 /**
 450  * stringprep_unichar_to_utf8:
 451  * @c: a ISO10646 character code
 452  * @outbuf: output buffer, must have at least 6 bytes of space.
 453  *       If %NULL, the length will be computed and returned
 454  *       and nothing will be written to @outbuf.
 455  *
 456  * Converts a single character to UTF-8.
 457  *
 458  * Return value: number of bytes written
 459  **/
 460 int
 461 stringprep_unichar_to_utf8 (unsigned long c, char *outbuf)
 462 {
 463   int len = 0;
 464   int first;
 465   int i;
 466
 467   if (c < 0x80)
 468     {
 469       first = 0;
 470       len = 1;
 471     }
 472   else if (c < 0x800)
 473     {
 474       first = 0xc0;
 475       len = 2;
 476     }
 477   else if (c < 0x10000)
 478     {
 479       first = 0xe0;
 480       len = 3;
 481     }
 482   else if (c < 0x200000)
 483     {
 484       first = 0xf0;
 485       len = 4;
 486     }
 487   else if (c < 0x4000000)
 488     {
 489       first = 0xf8;
 490       len = 5;
 491     }
 492   else
 493     {
 494       first = 0xfc;
 495       len = 6;
 496     }
 497
 498   if (outbuf)
 499     {
 500       for (i = len - 1; i > 0; --i)
 501         {
 502           outbuf[i] = (c & 0x3f) | 0x80;
 503           c >>= 6;
 504         }
 505       outbuf[0] = c | first;
 506     }
 507
 508   return len;
 509 }
 510
 511 /**
 512  * stringprep_utf8_to_ucs4:
 513  * @str: a UTF-8 encoded string
 514  * @len: the maximum length of @str to use. If @len < 0, then
 515  *       the string is nul-terminated.
 516  * @items_written: location to store the number of characters in the
 517  *                 result, or %NULL.
 518  *
 519  * Convert a string from UTF-8 to a 32-bit fixed width
 520  * representation as UCS-4, assuming valid UTF-8 input.
 521  * This function does no error checking on the input.
 522  *
 523  * Return value: a pointer to a newly allocated UCS-4 string.
 524  *               This value must be freed with free().
 525  **/
 526 unsigned long *
 527 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
 528 {
 529   int j, charlen;
 530   unsigned long *result;
 531   int n_chars, i;
 532   const char *p;
 533
 534   p = str;
 535   n_chars = 0;
 536   if (len < 0)
 537     {
 538       while (*p)
 539         {
 540           p = g_utf8_next_char (p);
 541           ++n_chars;
 542         }
 543     }
 544   else
 545     {
 546       while (p < str + len && *p)
 547         {
 548           p = g_utf8_next_char (p);
 549           ++n_chars;
 550         }
 551     }
 552
 553   result = malloc (sizeof (unsigned long) * (n_chars + 1));
 554
 555   p = str;
 556   for (i = 0; i < n_chars; i++)
 557     {
 558       unsigned long wc = ((const unsigned char *) p)[0];
 559
 560       if (wc < 0x80)
 561         {
 562           result[i] = wc;
 563           p++;
 564         }
 565       else
 566         {
 567           if (wc < 0xe0)
 568             {
 569               charlen = 2;
 570               wc &= 0x1f;
 571             }
 572           else if (wc < 0xf0)
 573             {
 574               charlen = 3;
 575               wc &= 0x0f;
 576             }
 577           else if (wc < 0xf8)
 578             {
 579               charlen = 4;
 580               wc &= 0x07;
 581             }
 582           else if (wc < 0xfc)
 583             {
 584               charlen = 5;
 585               wc &= 0x03;
 586             }
 587           else
 588             {
 589               charlen = 6;
 590               wc &= 0x01;
 591             }
 592
 593           for (j = 1; j < charlen; j++)
 594             {
 595               wc <<= 6;
 596               wc |= ((const unsigned char *) p)[j] & 0x3f;
 597             }
 598
 599           result[i] = wc;
 600           p += charlen;
 601         }
 602     }
 603   result[i] = 0;
 604
 605   if (items_written)
 606     *items_written = i;
 607
 608   return result;
 609 }
 610
 611 /**
 612  * stringprep_ucs4_to_utf8:
 613  * @str: a UCS-4 encoded string
 614  * @len: the maximum length of @str to use. If @len < 0, then
 615  *       the string is terminated with a 0 character.
 616  * @items_read: location to store number of characters read read, or %NULL.
 617  * @items_written: location to store number of bytes written or %NULL.
 618  *                 The value here stored does not include the trailing 0
 619  *                 byte.
 620  *
 621  * Convert a string from a 32-bit fixed width representation as UCS-4.
 622  * to UTF-8. The result will be terminated with a 0 byte.
 623  *
 624  * Return value: a pointer to a newly allocated UTF-8 string.
 625  *               This value must be freed with free(). If an
 626  *               error occurs, %NULL will be returned and
 627  *               @error set.
 628  **/
 629 char *
 630 stringprep_ucs4_to_utf8 (const unsigned long *str, ssize_t len,
 631                          size_t *items_read, size_t *items_written)
 632 {
 633   int result_length;
 634   char *result = NULL;
 635   char *p;
 636   int i;
 637
 638   result_length = 0;
 639   for (i = 0; len < 0 || i < len; i++)
 640     {
 641       if (!str[i])
 642         break;
 643
 644       if (str[i] >= 0x80000000)
 645         {
 646           if (items_read)
 647             *items_read = i;
 648
 649           goto err_out;
 650         }
 651
 652       result_length += UTF8_LENGTH (str[i]);
 653     }
 654
 655   result = malloc (result_length + 1);
 656   p = result;
 657
 658   i = 0;
 659   while (p < result + result_length)
 660     p += stringprep_unichar_to_utf8 (str[i++], p);
 661
 662   *p = '\0';
 663
 664   if (items_written)
 665     *items_written = p - result;
 666
 667 err_out:
 668   if (items_read)
 669     *items_read = i;
 670
 671   return result;
 672 }
 673
 674 /*
 675  * g_utf8_normalize:
 676  * @str: a UTF-8 encoded string.
 677  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 678  * @mode: the type of normalization to perform.
 679  *
 680  * Converts a string into canonical form, standardizing
 681  * such issues as whether a character with an accent
 682  * is represented as a base character and combining
 683  * accent or as a single precomposed character. You
 684  * should generally call g_utf8_normalize() before
 685  * comparing two Unicode strings.
 686  *
 687  * The normalization mode %G_NORMALIZE_DEFAULT only
 688  * standardizes differences that do not affect the
 689  * text content, such as the above-mentioned accent
 690  * representation. %G_NORMALIZE_ALL also standardizes
 691  * the "compatibility" characters in Unicode, such
 692  * as SUPERSCRIPT THREE to the standard forms
 693  * (in this case DIGIT THREE). Formatting information
 694  * may be lost but for most text operations such
 695  * characters should be considered the same.
 696  * For example, g_utf8_collate() normalizes
 697  * with %G_NORMALIZE_ALL as its first step.
 698  *
 699  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 700  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 701  * but returned a result with composed forms rather
 702  * than a maximally decomposed form. This is often
 703  * useful if you intend to convert the string to
 704  * a legacy encoding or pass it to a system with
 705  * less capable Unicode handling.
 706  *
 707  * Return value: a newly allocated string, that is the
 708  *   normalized form of @str.
 709  **/
 710 static char *
 711 g_utf8_normalize (const char *str, ssize_t len, GNormalizeMode mode)
 712 {
 713   unsigned long *result_wc = _g_utf8_normalize_wc (str, len, mode);
 714   char *result;
 715
 716   result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
 717   free (result_wc);
 718
 719   return result;
 720 }
 721
 722 /**
 723  * stringprep_utf8_nfkc_normalize:
 724  * @str: a UTF-8 encoded string.
 725  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 726  *
 727  * Converts a string into canonical form, standardizing
 728  * such issues as whether a character with an accent
 729  * is represented as a base character and combining
 730  * accent or as a single precomposed character. You
 731  * should generally call g_utf8_normalize() before
 732  * comparing two Unicode strings.
 733  *
 734  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
 735  * differences that do not affect the text content, such as the
 736  * above-mentioned accent representation. It standardizes the
 737  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
 738  * the standard forms (in this case DIGIT THREE). Formatting
 739  * information may be lost but for most text operations such
 740  * characters should be considered the same. It returns a result with
 741  * composed forms rather than a maximally decomposed form.
 742  *
 743  * Return value: a newly allocated string, that is the
 744  *   NFKC normalized form of @str.
 745  **/
 746 char *
 747 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
 748 {
 749   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
 750 }
 751
 752 /**
 753  * stringprep_ucs4_nfkc_normalize:
 754  * @str: a Unicode string.
 755  * @len: length of @str array, or -1 if @str is nul-terminated.
 756  *
 757  * Converts UCS4 string into UTF-8 and runs
 758  * stringprep_utf8_nfkc_normalize().
 759  *
 760  * Return value: a newly allocated Unicode string, that is the NFKC
 761  *   normalized form of @str.
 762  **/
 763 unsigned long *
 764 stringprep_ucs4_nfkc_normalize (unsigned long *str, ssize_t len)
 765 {
 766   char *p;
 767   unsigned long *result_wc;
 768
 769   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
 770   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
 771   free (p);
 772
 773   return result_wc;
 774 }