nfkc.c

   1 /* nfkc.c       unicode normalization utilities
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of libstringprep.
   5  *
   6  * Libstringprep is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libstringprep is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with libstringprep; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 /* This file contains functions from GLIB including gutf8.c and
  27  * gunidecomp.c, all with the following license.
  28  *
  29  *  Copyright (C) 1999, 2000 Tom Tromey
  30  *  Copyright 2000 Red Hat, Inc.
  31  *
  32  * The Gnome Library is free software; you can redistribute it and/or
  33  * modify it under the terms of the GNU Lesser General Public License as
  34  * published by the Free Software Foundation; either version 2 of the
  35  * License, or (at your option) any later version.
  36  *
  37  * The Gnome Library is distributed in the hope that it will be useful,
  38  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  39  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  40  * Lesser General Public License for more details.
  41  *
  42  * You should have received a copy of the GNU Lesser General Public
  43  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  44  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  45  *   Boston, MA 02111-1307, USA.
  46  */
  47
  48 typedef enum
  49 {
  50   G_NORMALIZE_DEFAULT,
  51   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  52   G_NORMALIZE_DEFAULT_COMPOSE,
  53   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  54   G_NORMALIZE_ALL,
  55   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  56   G_NORMALIZE_ALL_COMPOSE,
  57   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  58 }
  59 GNormalizeMode;
  60
  61 #include "gunidecomp.h"
  62 #include "gunicomp.h"
  63
  64 #include <stdlib.h>
  65
  66 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  67   if (Char < 128)                                                             \
  68     {                                                                         \
  69       Len = 1;                                                                \
  70       Mask = 0x7f;                                                            \
  71     }                                                                         \
  72   else if ((Char & 0xe0) == 0xc0)                                             \
  73     {                                                                         \
  74       Len = 2;                                                                \
  75       Mask = 0x1f;                                                            \
  76     }                                                                         \
  77   else if ((Char & 0xf0) == 0xe0)                                             \
  78     {                                                                         \
  79       Len = 3;                                                                \
  80       Mask = 0x0f;                                                            \
  81     }                                                                         \
  82   else if ((Char & 0xf8) == 0xf0)                                             \
  83     {                                                                         \
  84       Len = 4;                                                                \
  85       Mask = 0x07;                                                            \
  86     }                                                                         \
  87   else if ((Char & 0xfc) == 0xf8)                                             \
  88     {                                                                         \
  89       Len = 5;                                                                \
  90       Mask = 0x03;                                                            \
  91     }                                                                         \
  92   else if ((Char & 0xfe) == 0xfc)                                             \
  93     {                                                                         \
  94       Len = 6;                                                                \
  95       Mask = 0x01;                                                            \
  96     }                                                                         \
  97   else                                                                        \
  98     Len = -1;
  99
 100 #define UTF8_LENGTH(Char)              \
 101   ((Char) < 0x80 ? 1 :                 \
 102    ((Char) < 0x800 ? 2 :               \
 103     ((Char) < 0x10000 ? 3 :            \
 104      ((Char) < 0x200000 ? 4 :          \
 105       ((Char) < 0x4000000 ? 5 : 6)))))
 106
 107
 108 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
 109   (Result) = (Chars)[0] & (Mask);                                             \
 110   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
 111     {                                                                         \
 112       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
 113         {                                                                     \
 114           (Result) = -1;                                                      \
 115           break;                                                              \
 116         }                                                                     \
 117       (Result) <<= 6;                                                         \
 118       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
 119     }
 120
 121 #define UNICODE_VALID(Char)                   \
 122     ((Char) < 0x110000 &&                     \
 123      ((Char) < 0xD800 || (Char) >= 0xE000) && \
 124      (Char) != 0xFFFE && (Char) != 0xFFFF)
 125
 126 static const char utf8_skip_data[256] = {
 127   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 128   1, 1, 1, 1, 1, 1, 1,
 129   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 130   1, 1, 1, 1, 1, 1, 1,
 131   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 132   1, 1, 1, 1, 1, 1, 1,
 133   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 134   1, 1, 1, 1, 1, 1, 1,
 135   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 136   1, 1, 1, 1, 1, 1, 1,
 137   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 138   1, 1, 1, 1, 1, 1, 1,
 139   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 140   2, 2, 2, 2, 2, 2, 2,
 141   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 142   5, 5, 5, 6, 6, 1, 1
 143 };
 144 static const char *const g_utf8_skip = utf8_skip_data;
 145
 146 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
 147
 148 /**
 149  * stringprep_utf8_to_unichar:
 150  * @p: a pointer to Unicode character encoded as UTF-8
 151  *
 152  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 153  * If @p does not point to a valid UTF-8 encoded character, results are
 154  * undefined.
 155  *
 156  * Return value: the resulting character
 157  **/
 158 long
 159 stringprep_utf8_to_unichar (const char *p)
 160 {
 161   int i, mask = 0, len;
 162   long result;
 163   unsigned char c = (unsigned char) *p;
 164
 165   UTF8_COMPUTE (c, mask, len);
 166   if (len == -1)
 167     return (long) -1;
 168   UTF8_GET (result, p, i, mask, len);
 169
 170   return result;
 171 }
 172
 173 #define CC(Page, Char) \
 174   ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 175    ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 176    : (cclass_data[combining_class_table[Page]][Char]))
 177
 178 #define COMBINING_CLASS(Char) \
 179      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
 180
 181 /**
 182  * g_unicode_canonical_ordering:
 183  * @string: a UCS-4 encoded string.
 184  * @len: the maximum length of @string to use.
 185  *
 186  * Computes the canonical ordering of a string in-place.
 187  * This rearranges decomposed characters in the string
 188  * according to their combining classes.  See the Unicode
 189  * manual for more information.
 190  **/
 191 static void
 192 g_unicode_canonical_ordering (long *string, size_t len)
 193 {
 194   size_t i;
 195   int swap = 1;
 196
 197   while (swap)
 198     {
 199       int last;
 200       swap = 0;
 201       last = COMBINING_CLASS (string[0]);
 202       for (i = 0; i < len - 1; ++i)
 203         {
 204           int next = COMBINING_CLASS (string[i + 1]);
 205           if (next != 0 && last > next)
 206             {
 207               size_t j;
 208               /* Percolate item leftward through string.  */
 209               for (j = i; j > 0; --j)
 210                 {
 211                   long t;
 212                   if (COMBINING_CLASS (string[j]) <= next)
 213                     break;
 214                   t = string[j + 1];
 215                   string[j + 1] = string[j];
 216                   string[j] = t;
 217                   swap = 1;
 218                 }
 219               /* We're re-entering the loop looking at the old
 220                  character again.  */
 221               next = last;
 222             }
 223           last = next;
 224         }
 225     }
 226 }
 227
 228 static const unsigned char *
 229 find_decomposition (long ch, int compat)
 230 {
 231   int start = 0;
 232   int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
 233
 234   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 235     {
 236       while (1)
 237         {
 238           int half = (start + end) / 2;
 239           if (ch == decomp_table[half].ch)
 240             {
 241               int offset;
 242
 243               if (compat)
 244                 {
 245                   offset = decomp_table[half].compat_offset;
 246                   if (offset == 0xff)
 247                     offset = decomp_table[half].canon_offset;
 248                 }
 249               else
 250                 {
 251                   offset = decomp_table[half].canon_offset;
 252                   if (offset == 0xff)
 253                     return NULL;
 254                 }
 255
 256               return
 257                 &(decomp_expansion_string
 258                   [decomp_table[half].expansion_offset + offset]);
 259             }
 260           else if (half == start)
 261             break;
 262           else if (ch > decomp_table[half].ch)
 263             start = half;
 264           else
 265             end = half;
 266         }
 267     }
 268
 269   return NULL;
 270 }
 271
 272 #define CI(Page, Char) \
 273   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 274    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 275    : (compose_data[compose_table[Page]][Char]))
 276
 277 #define COMPOSE_INDEX(Char) \
 278      (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 279
 280 static int
 281 combine (long a, long b, long *result)
 282 {
 283   int index_a, index_b;
 284
 285   index_a = COMPOSE_INDEX (a);
 286   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 287     {
 288       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 289         {
 290           *result =
 291             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 292           return 1;
 293         }
 294       else
 295         return 0;
 296     }
 297
 298   index_b = COMPOSE_INDEX (b);
 299   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 300     {
 301       if (a ==
 302           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 303         {
 304           *result =
 305             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 306           return 1;
 307         }
 308       else
 309         return 0;
 310     }
 311
 312   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 313       && index_b >= COMPOSE_SECOND_START
 314       && index_a < COMPOSE_SECOND_SINGLE_START)
 315     {
 316       long res =
 317         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 318                                                      COMPOSE_SECOND_START];
 319
 320       if (res)
 321         {
 322           *result = res;
 323           return 1;
 324         }
 325     }
 326
 327   return 0;
 328 }
 329
 330 static long *
 331 _g_utf8_normalize_wc (const char *str, int max_len, GNormalizeMode mode)
 332 {
 333   size_t n_wc;
 334   long *wc_buffer;
 335   const char *p;
 336   size_t last_start;
 337   int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
 338   int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
 339
 340   n_wc = 0;
 341   p = str;
 342   while ((max_len < 0 || p < str + max_len) && *p)
 343     {
 344       long wc = stringprep_utf8_to_unichar (p);
 345
 346       const unsigned char *decomp = find_decomposition (wc, do_compat);
 347
 348       if (decomp)
 349         {
 350           int len;
 351           /* We store as a double-nul terminated string.  */
 352           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 353             ;
 354           n_wc += len / 2;
 355         }
 356       else
 357         n_wc++;
 358
 359       p = g_utf8_next_char (p);
 360     }
 361
 362   wc_buffer = malloc (sizeof (long) * (n_wc + 1));
 363
 364   last_start = 0;
 365   n_wc = 0;
 366   p = str;
 367   while ((max_len < 0 || p < str + max_len) && *p)
 368     {
 369       long wc = stringprep_utf8_to_unichar (p);
 370       const unsigned char *decomp;
 371       int cc;
 372       size_t old_n_wc = n_wc;
 373
 374       decomp = find_decomposition (wc, do_compat);
 375
 376       if (decomp)
 377         {
 378           int len;
 379           /* We store as a double-nul terminated string.  */
 380           for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
 381             wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
 382         }
 383       else
 384         wc_buffer[n_wc++] = wc;
 385
 386       if (n_wc > 0)
 387         {
 388           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 389
 390           if (cc == 0)
 391             {
 392               g_unicode_canonical_ordering (wc_buffer + last_start,
 393                                             n_wc - last_start);
 394               last_start = old_n_wc;
 395             }
 396         }
 397
 398       p = g_utf8_next_char (p);
 399     }
 400
 401   if (n_wc > 0)
 402     {
 403       g_unicode_canonical_ordering (wc_buffer + last_start,
 404                                     n_wc - last_start);
 405       last_start = n_wc;
 406     }
 407
 408   wc_buffer[n_wc] = 0;
 409
 410   /* All decomposed and reordered */
 411
 412
 413   if (do_compose && n_wc > 0)
 414     {
 415       size_t i, j;
 416       int last_cc = 0;
 417       last_start = 0;
 418
 419       for (i = 0; i < n_wc; i++)
 420         {
 421           int cc = COMBINING_CLASS (wc_buffer[i]);
 422
 423           if (i > 0 &&
 424               (last_cc == 0 || last_cc != cc) &&
 425               combine (wc_buffer[last_start], wc_buffer[i],
 426                        &wc_buffer[last_start]))
 427             {
 428               for (j = i + 1; j < n_wc; j++)
 429                 wc_buffer[j - 1] = wc_buffer[j];
 430               n_wc--;
 431               i--;
 432
 433               if (i == last_start)
 434                 last_cc = 0;
 435               else
 436                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
 437
 438               continue;
 439             }
 440
 441           if (cc == 0)
 442             last_start = i;
 443
 444           last_cc = cc;
 445         }
 446     }
 447
 448   wc_buffer[n_wc] = 0;
 449
 450   return wc_buffer;
 451 }
 452
 453 /**
 454  * g_unichar_to_utf8:
 455  * @c: a ISO10646 character code
 456  * @outbuf: output buffer, must have at least 6 bytes of space.
 457  *       If %NULL, the length will be computed and returned
 458  *       and nothing will be written to @outbuf.
 459  *
 460  * Converts a single character to UTF-8.
 461  *
 462  * Return value: number of bytes written
 463  **/
 464 int
 465 stringprep_unichar_to_utf8 (long c, char *outbuf)
 466 {
 467   int len = 0;
 468   int first;
 469   int i;
 470
 471   if (c < 0x80)
 472     {
 473       first = 0;
 474       len = 1;
 475     }
 476   else if (c < 0x800)
 477     {
 478       first = 0xc0;
 479       len = 2;
 480     }
 481   else if (c < 0x10000)
 482     {
 483       first = 0xe0;
 484       len = 3;
 485     }
 486   else if (c < 0x200000)
 487     {
 488       first = 0xf0;
 489       len = 4;
 490     }
 491   else if (c < 0x4000000)
 492     {
 493       first = 0xf8;
 494       len = 5;
 495     }
 496   else
 497     {
 498       first = 0xfc;
 499       len = 6;
 500     }
 501
 502   if (outbuf)
 503     {
 504       for (i = len - 1; i > 0; --i)
 505         {
 506           outbuf[i] = (c & 0x3f) | 0x80;
 507           c >>= 6;
 508         }
 509       outbuf[0] = c | first;
 510     }
 511
 512   return len;
 513 }
 514
 515 /**
 516  * stringgprep_utf8_to_ucs4:
 517  * @str: a UTF-8 encoded string
 518  * @len: the maximum length of @str to use. If @len < 0, then
 519  *       the string is nul-terminated.
 520  * @items_written: location to store the number of characters in the
 521  *                 result, or %NULL.
 522  *
 523  * Convert a string from UTF-8 to a 32-bit fixed width
 524  * representation as UCS-4, assuming valid UTF-8 input.
 525  * This function does no error checking on the input.
 526  *
 527  * Return value: a pointer to a newly allocated UCS-4 string.
 528  *               This value must be freed with g_free().
 529  **/
 530 long *
 531 stringprep_utf8_to_ucs4 (const char *str, int len, int *items_written)
 532 {
 533   int j, charlen;
 534   long *result;
 535   int n_chars, i;
 536   const char *p;
 537
 538   p = str;
 539   n_chars = 0;
 540   if (len < 0)
 541     {
 542       while (*p)
 543         {
 544           p = g_utf8_next_char (p);
 545           ++n_chars;
 546         }
 547     }
 548   else
 549     {
 550       while (p < str + len && *p)
 551         {
 552           p = g_utf8_next_char (p);
 553           ++n_chars;
 554         }
 555     }
 556
 557   result = malloc (sizeof (long) * (n_chars + 1));
 558
 559   p = str;
 560   for (i = 0; i < n_chars; i++)
 561     {
 562       long wc = ((unsigned char *) p)[0];
 563
 564       if (wc < 0x80)
 565         {
 566           result[i] = wc;
 567           p++;
 568         }
 569       else
 570         {
 571           if (wc < 0xe0)
 572             {
 573               charlen = 2;
 574               wc &= 0x1f;
 575             }
 576           else if (wc < 0xf0)
 577             {
 578               charlen = 3;
 579               wc &= 0x0f;
 580             }
 581           else if (wc < 0xf8)
 582             {
 583               charlen = 4;
 584               wc &= 0x07;
 585             }
 586           else if (wc < 0xfc)
 587             {
 588               charlen = 5;
 589               wc &= 0x03;
 590             }
 591           else
 592             {
 593               charlen = 6;
 594               wc &= 0x01;
 595             }
 596
 597           for (j = 1; j < charlen; j++)
 598             {
 599               wc <<= 6;
 600               wc |= ((unsigned char *) p)[j] & 0x3f;
 601             }
 602
 603           result[i] = wc;
 604           p += charlen;
 605         }
 606     }
 607   result[i] = 0;
 608
 609   if (items_written)
 610     *items_written = i;
 611
 612   return result;
 613 }
 614
 615 /* This one is kept around for binary backwards compatibility with
 616    library version CURRENT=1. */
 617 long *
 618 stringprep_utf8_to_ucs4_fast (const char *str, int len, int *items_written)
 619 {
 620   return stringprep_utf8_to_ucs4 (str, len, items_written);
 621
 622 }
 623
 624 /**
 625  * g_ucs4_to_utf8:
 626  * @str: a UCS-4 encoded string
 627  * @len: the maximum length of @str to use. If @len < 0, then
 628  *       the string is terminated with a 0 character.
 629  * @items_read: location to store number of characters read read, or %NULL.
 630  * @items_written: location to store number of bytes written or %NULL.
 631  *                 The value here stored does not include the trailing 0
 632  *                 byte.
 633  * @error: location to store the error occuring, or %NULL to ignore
 634  *         errors. Any of the errors in #GConvertError other than
 635  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 636  *
 637  * Convert a string from a 32-bit fixed width representation as UCS-4.
 638  * to UTF-8. The result will be terminated with a 0 byte.
 639  *
 640  * Return value: a pointer to a newly allocated UTF-8 string.
 641  *               This value must be freed with g_free(). If an
 642  *               error occurs, %NULL will be returned and
 643  *               @error set.
 644  **/
 645 char *
 646 stringprep_ucs4_to_utf8 (const long *str,
 647                          int len, int *items_read, int *items_written)
 648 {
 649   int result_length;
 650   char *result = NULL;
 651   char *p;
 652   int i;
 653
 654   result_length = 0;
 655   for (i = 0; len < 0 || i < len; i++)
 656     {
 657       if (!str[i])
 658         break;
 659
 660       if (str[i] >= 0x80000000)
 661         {
 662           if (items_read)
 663             *items_read = i;
 664
 665           goto err_out;
 666         }
 667
 668       result_length += UTF8_LENGTH (str[i]);
 669     }
 670
 671   result = malloc (result_length + 1);
 672   p = result;
 673
 674   i = 0;
 675   while (p < result + result_length)
 676     p += stringprep_unichar_to_utf8 (str[i++], p);
 677
 678   *p = '\0';
 679
 680   if (items_written)
 681     *items_written = p - result;
 682
 683 err_out:
 684   if (items_read)
 685     *items_read = i;
 686
 687   return result;
 688 }
 689
 690 /**
 691  * g_utf8_normalize:
 692  * @str: a UTF-8 encoded string.
 693  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 694  * @mode: the type of normalization to perform.
 695  *
 696  * Converts a string into canonical form, standardizing
 697  * such issues as whether a character with an accent
 698  * is represented as a base character and combining
 699  * accent or as a single precomposed character. You
 700  * should generally call g_utf8_normalize() before
 701  * comparing two Unicode strings.
 702  *
 703  * The normalization mode %G_NORMALIZE_DEFAULT only
 704  * standardizes differences that do not affect the
 705  * text content, such as the above-mentioned accent
 706  * representation. %G_NORMALIZE_ALL also standardizes
 707  * the "compatibility" characters in Unicode, such
 708  * as SUPERSCRIPT THREE to the standard forms
 709  * (in this case DIGIT THREE). Formatting information
 710  * may be lost but for most text operations such
 711  * characters should be considered the same.
 712  * For example, g_utf8_collate() normalizes
 713  * with %G_NORMALIZE_ALL as its first step.
 714  *
 715  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 716  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 717  * but returned a result with composed forms rather
 718  * than a maximally decomposed form. This is often
 719  * useful if you intend to convert the string to
 720  * a legacy encoding or pass it to a system with
 721  * less capable Unicode handling.
 722  *
 723  * Return value: a newly allocated string, that is the
 724  *   normalized form of @str.
 725  **/
 726 static char *
 727 g_utf8_normalize (const char *str, int len, GNormalizeMode mode)
 728 {
 729   long *result_wc = _g_utf8_normalize_wc (str, len, mode);
 730   char *result;
 731
 732   result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
 733   free (result_wc);
 734
 735   return result;
 736 }
 737
 738 char *
 739 stringprep_utf8_nfkc_normalize (const char *str, int len)
 740 {
 741   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
 742 }
 743
 744 long *
 745 stringprep_ucs4_nfkc_normalize (long *str, int len)
 746 {
 747   char *p;
 748   long *result_wc;
 749
 750   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
 751   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
 752   free (p);
 753
 754   return result_wc;
 755 }