libidn/nfkc.c

   1 /* nfkc.c       Unicode normalization utilities.
   2  * Copyright (C) 2002, 2003  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #if HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <stdint.h>
  27
  28 #include "stringprep.h"
  29
  30 /* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
  32  *
  33  *  Copyright (C) 1999, 2000 Tom Tromey
  34  *  Copyright 2000 Red Hat, Inc.
  35  */
  36
  37 /* Hacks to make syncing with GLIB code easier. */
  38 #define gboolean int
  39 #define gchar char
  40 #define guchar unsigned char
  41 #define glong long
  42 #define gint int
  43 #define guint unsigned int
  44 #define gushort unsigned short
  45 #define gint16 int16_t
  46 #define guint16 uint16_t
  47 #define gunichar uint32_t
  48 #define gsize size_t
  49 #define gssize ssize_t
  50 #define g_malloc malloc
  51 #define g_free free
  52 #define GError void
  53 #define g_set_error(a,b,c,d) ((void) 0)
  54 #define g_new(struct_type, n_structs)                                   \
  55   ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
  56 #  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
  57 #    define G_STMT_START        (void)(
  58 #    define G_STMT_END          )
  59 #  else
  60 #    if (defined (sun) || defined (__sun__))
  61 #      define G_STMT_START      if (1)
  62 #      define G_STMT_END        else (void)0
  63 #    else
  64 #      define G_STMT_START      do
  65 #      define G_STMT_END        while (0)
  66 #    endif
  67 #  endif
  68 #define g_return_val_if_fail(expr,val)          G_STMT_START{ (void)0; }G_STMT_END
  69 #define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))
  70 #define TRUE 1
  71 #define FALSE 0
  72
  73 /* Code from GLIB gunicode.h starts here. */
  74
  75 typedef enum
  76 {
  77   G_NORMALIZE_DEFAULT,
  78   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  79   G_NORMALIZE_DEFAULT_COMPOSE,
  80   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  81   G_NORMALIZE_ALL,
  82   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  83   G_NORMALIZE_ALL_COMPOSE,
  84   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  85 }
  86 GNormalizeMode;
  87
  88 /* Code from GLIB gutf8.c starts here. */
  89
  90 #define UTF8_COMPUTE(Char, Mask, Len)           \
  91   if (Char < 128)                               \
  92     {                                           \
  93       Len = 1;                                  \
  94       Mask = 0x7f;                              \
  95     }                                           \
  96   else if ((Char & 0xe0) == 0xc0)               \
  97     {                                           \
  98       Len = 2;                                  \
  99       Mask = 0x1f;                              \
 100     }                                           \
 101   else if ((Char & 0xf0) == 0xe0)               \
 102     {                                           \
 103       Len = 3;                                  \
 104       Mask = 0x0f;                              \
 105     }                                           \
 106   else if ((Char & 0xf8) == 0xf0)               \
 107     {                                           \
 108       Len = 4;                                  \
 109       Mask = 0x07;                              \
 110     }                                           \
 111   else if ((Char & 0xfc) == 0xf8)               \
 112     {                                           \
 113       Len = 5;                                  \
 114       Mask = 0x03;                              \
 115     }                                           \
 116   else if ((Char & 0xfe) == 0xfc)               \
 117     {                                           \
 118       Len = 6;                                  \
 119       Mask = 0x01;                              \
 120     }                                           \
 121   else                                          \
 122     Len = -1;
 123
 124 #define UTF8_LENGTH(Char)                       \
 125   ((Char) < 0x80 ? 1 :                          \
 126    ((Char) < 0x800 ? 2 :                        \
 127     ((Char) < 0x10000 ? 3 :                     \
 128      ((Char) < 0x200000 ? 4 :                   \
 129       ((Char) < 0x4000000 ? 5 : 6)))))
 130
 131
 132 #define UTF8_GET(Result, Chars, Count, Mask, Len)       \
 133   (Result) = (Chars)[0] & (Mask);                       \
 134   for ((Count) = 1; (Count) < (Len); ++(Count))         \
 135     {                                                   \
 136       if (((Chars)[(Count)] & 0xc0) != 0x80)            \
 137         {                                               \
 138           (Result) = -1;                                \
 139           break;                                        \
 140         }                                               \
 141       (Result) <<= 6;                                   \
 142       (Result) |= ((Chars)[(Count)] & 0x3f);            \
 143     }
 144
 145 #define UNICODE_VALID(Char)                     \
 146   ((Char) < 0x110000 &&                         \
 147    (((Char) & 0xFFFFF800) != 0xD800) &&         \
 148    ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&      \
 149    ((Char) & 0xFFFE) != 0xFFFE)
 150
 151
 152 static const gchar utf8_skip_data[256] = {
 153   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 154   1, 1, 1, 1, 1, 1, 1,
 155   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 156   1, 1, 1, 1, 1, 1, 1,
 157   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 158   1, 1, 1, 1, 1, 1, 1,
 159   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 160   1, 1, 1, 1, 1, 1, 1,
 161   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 162   1, 1, 1, 1, 1, 1, 1,
 163   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 164   1, 1, 1, 1, 1, 1, 1,
 165   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 166   2, 2, 2, 2, 2, 2, 2,
 167   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 168   5, 5, 5, 6, 6, 1, 1
 169 };
 170
 171 const gchar *const g_utf8_skip = utf8_skip_data;
 172
 173 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
 174
 175 /*
 176  * g_utf8_strlen:
 177  * @p: pointer to the start of a UTF-8 encoded string.
 178  * @max: the maximum number of bytes to examine. If @max
 179  *       is less than 0, then the string is assumed to be
 180  *       nul-terminated. If @max is 0, @p will not be examined and
 181  *       may be %NULL.
 182  *
 183  * Returns the length of the string in characters.
 184  *
 185  * Return value: the length of the string in characters
 186  **/
 187 static glong
 188 g_utf8_strlen (const gchar * p, gssize max)
 189 {
 190   glong len = 0;
 191   const gchar *start = p;
 192   g_return_val_if_fail (p != NULL || max == 0, 0);
 193
 194   if (max < 0)
 195     {
 196       while (*p)
 197         {
 198           p = g_utf8_next_char (p);
 199           ++len;
 200         }
 201     }
 202   else
 203     {
 204       if (max == 0 || !*p)
 205         return 0;
 206
 207       p = g_utf8_next_char (p);
 208
 209       while (p - start < max && *p)
 210         {
 211           ++len;
 212           p = g_utf8_next_char (p);
 213         }
 214
 215       /* only do the last len increment if we got a complete
 216        * char (don't count partial chars)
 217        */
 218       if (p - start == max)
 219         ++len;
 220     }
 221
 222   return len;
 223 }
 224
 225 /*
 226  * g_utf8_get_char:
 227  * @p: a pointer to Unicode character encoded as UTF-8
 228  *
 229  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 230  * If @p does not point to a valid UTF-8 encoded character, results are
 231  * undefined. If you are not sure that the bytes are complete
 232  * valid Unicode characters, you should use g_utf8_get_char_validated()
 233  * instead.
 234  *
 235  * Return value: the resulting character
 236  **/
 237 static gunichar
 238 g_utf8_get_char (const gchar * p)
 239 {
 240   int i, mask = 0, len;
 241   gunichar result;
 242   unsigned char c = (unsigned char) *p;
 243
 244   UTF8_COMPUTE (c, mask, len);
 245   if (len == -1)
 246     return (gunichar) - 1;
 247   UTF8_GET (result, p, i, mask, len);
 248
 249   return result;
 250 }
 251
 252 /*
 253  * g_unichar_to_utf8:
 254  * @c: a ISO10646 character code
 255  * @outbuf: output buffer, must have at least 6 bytes of space.
 256  *       If %NULL, the length will be computed and returned
 257  *       and nothing will be written to @outbuf.
 258  *
 259  * Converts a single character to UTF-8.
 260  *
 261  * Return value: number of bytes written
 262  **/
 263 static int
 264 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
 265 {
 266   guint len = 0;
 267   int first;
 268   int i;
 269
 270   if (c < 0x80)
 271     {
 272       first = 0;
 273       len = 1;
 274     }
 275   else if (c < 0x800)
 276     {
 277       first = 0xc0;
 278       len = 2;
 279     }
 280   else if (c < 0x10000)
 281     {
 282       first = 0xe0;
 283       len = 3;
 284     }
 285   else if (c < 0x200000)
 286     {
 287       first = 0xf0;
 288       len = 4;
 289     }
 290   else if (c < 0x4000000)
 291     {
 292       first = 0xf8;
 293       len = 5;
 294     }
 295   else
 296     {
 297       first = 0xfc;
 298       len = 6;
 299     }
 300
 301   if (outbuf)
 302     {
 303       for (i = len - 1; i > 0; --i)
 304         {
 305           outbuf[i] = (c & 0x3f) | 0x80;
 306           c >>= 6;
 307         }
 308       outbuf[0] = c | first;
 309     }
 310
 311   return len;
 312 }
 313
 314 /*
 315  * g_utf8_to_ucs4_fast:
 316  * @str: a UTF-8 encoded string
 317  * @len: the maximum length of @str to use. If @len < 0, then
 318  *       the string is nul-terminated.
 319  * @items_written: location to store the number of characters in the
 320  *                 result, or %NULL.
 321  *
 322  * Convert a string from UTF-8 to a 32-bit fixed width
 323  * representation as UCS-4, assuming valid UTF-8 input.
 324  * This function is roughly twice as fast as g_utf8_to_ucs4()
 325  * but does no error checking on the input.
 326  *
 327  * Return value: a pointer to a newly allocated UCS-4 string.
 328  *               This value must be freed with g_free().
 329  **/
 330 static gunichar *
 331 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
 332 {
 333   gint j, charlen;
 334   gunichar *result;
 335   gint n_chars, i;
 336   const gchar *p;
 337
 338   g_return_val_if_fail (str != NULL, NULL);
 339
 340   p = str;
 341   n_chars = 0;
 342   if (len < 0)
 343     {
 344       while (*p)
 345         {
 346           p = g_utf8_next_char (p);
 347           ++n_chars;
 348         }
 349     }
 350   else
 351     {
 352       while (p < str + len && *p)
 353         {
 354           p = g_utf8_next_char (p);
 355           ++n_chars;
 356         }
 357     }
 358
 359   result = g_new (gunichar, n_chars + 1);
 360   if (!result)
 361     return NULL;
 362
 363   p = str;
 364   for (i = 0; i < n_chars; i++)
 365     {
 366       gunichar wc = ((unsigned char *) p)[0];
 367
 368       if (wc < 0x80)
 369         {
 370           result[i] = wc;
 371           p++;
 372         }
 373       else
 374         {
 375           if (wc < 0xe0)
 376             {
 377               charlen = 2;
 378               wc &= 0x1f;
 379             }
 380           else if (wc < 0xf0)
 381             {
 382               charlen = 3;
 383               wc &= 0x0f;
 384             }
 385           else if (wc < 0xf8)
 386             {
 387               charlen = 4;
 388               wc &= 0x07;
 389             }
 390           else if (wc < 0xfc)
 391             {
 392               charlen = 5;
 393               wc &= 0x03;
 394             }
 395           else
 396             {
 397               charlen = 6;
 398               wc &= 0x01;
 399             }
 400
 401           for (j = 1; j < charlen; j++)
 402             {
 403               wc <<= 6;
 404               wc |= ((unsigned char *) p)[j] & 0x3f;
 405             }
 406
 407           result[i] = wc;
 408           p += charlen;
 409         }
 410     }
 411   result[i] = 0;
 412
 413   if (items_written)
 414     *items_written = i;
 415
 416   return result;
 417 }
 418
 419 /*
 420  * g_ucs4_to_utf8:
 421  * @str: a UCS-4 encoded string
 422  * @len: the maximum length of @str to use. If @len < 0, then
 423  *       the string is terminated with a 0 character.
 424  * @items_read: location to store number of characters read read, or %NULL.
 425  * @items_written: location to store number of bytes written or %NULL.
 426  *                 The value here stored does not include the trailing 0
 427  *                 byte.
 428  * @error: location to store the error occuring, or %NULL to ignore
 429  *         errors. Any of the errors in #GConvertError other than
 430  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 431  *
 432  * Convert a string from a 32-bit fixed width representation as UCS-4.
 433  * to UTF-8. The result will be terminated with a 0 byte.
 434  *
 435  * Return value: a pointer to a newly allocated UTF-8 string.
 436  *               This value must be freed with g_free(). If an
 437  *               error occurs, %NULL will be returned and
 438  *               @error set.
 439  **/
 440 static gchar *
 441 g_ucs4_to_utf8 (const gunichar * str,
 442                 glong len,
 443                 glong * items_read, glong * items_written, GError ** error)
 444 {
 445   gint result_length;
 446   gchar *result = NULL;
 447   gchar *p;
 448   gint i;
 449
 450   result_length = 0;
 451   for (i = 0; len < 0 || i < len; i++)
 452     {
 453       if (!str[i])
 454         break;
 455
 456       if (str[i] >= 0x80000000)
 457         {
 458           if (items_read)
 459             *items_read = i;
 460
 461           g_set_error (error, G_CONVERT_ERROR,
 462                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 463                        _("Character out of range for UTF-8"));
 464           goto err_out;
 465         }
 466
 467       result_length += UTF8_LENGTH (str[i]);
 468     }
 469
 470   result = g_malloc (result_length + 1);
 471   if (!result)
 472     return NULL;
 473   p = result;
 474
 475   i = 0;
 476   while (p < result + result_length)
 477     p += g_unichar_to_utf8 (str[i++], p);
 478
 479   *p = '\0';
 480
 481   if (items_written)
 482     *items_written = p - result;
 483
 484 err_out:
 485   if (items_read)
 486     *items_read = i;
 487
 488   return result;
 489 }
 490
 491 /* Code from GLIB gunidecomp.c starts here. */
 492
 493 #include "gunidecomp.h"
 494 #include "gunicomp.h"
 495
 496 #define CC_PART1(Page, Char) \
 497   ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 498    ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 499    : (cclass_data[combining_class_table_part1[Page]][Char]))
 500
 501 #define CC_PART2(Page, Char) \
 502   ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 503    ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 504    : (cclass_data[combining_class_table_part2[Page]][Char]))
 505
 506 #define COMBINING_CLASS(Char) \
 507   (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
 508    ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
 509    : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
 510       ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
 511       : 0))
 512
 513 /* constants for hangul syllable [de]composition */
 514 #define SBase 0xAC00
 515 #define LBase 0x1100
 516 #define VBase 0x1161
 517 #define TBase 0x11A7
 518 #define LCount 19
 519 #define VCount 21
 520 #define TCount 28
 521 #define NCount (VCount * TCount)
 522 #define SCount (LCount * NCount)
 523
 524 /*
 525  * g_unicode_canonical_ordering:
 526  * @string: a UCS-4 encoded string.
 527  * @len: the maximum length of @string to use.
 528  *
 529  * Computes the canonical ordering of a string in-place.
 530  * This rearranges decomposed characters in the string
 531  * according to their combining classes.  See the Unicode
 532  * manual for more information.
 533  **/
 534 static void
 535 g_unicode_canonical_ordering (gunichar * string, gsize len)
 536 {
 537   gsize i;
 538   int swap = 1;
 539
 540   while (swap)
 541     {
 542       int last;
 543       swap = 0;
 544       last = COMBINING_CLASS (string[0]);
 545       for (i = 0; i < len - 1; ++i)
 546         {
 547           int next = COMBINING_CLASS (string[i + 1]);
 548           if (next != 0 && last > next)
 549             {
 550               gsize j;
 551               /* Percolate item leftward through string.  */
 552               for (j = i + 1; j > 0; --j)
 553                 {
 554                   gunichar t;
 555                   if (COMBINING_CLASS (string[j - 1]) <= next)
 556                     break;
 557                   t = string[j];
 558                   string[j] = string[j - 1];
 559                   string[j - 1] = t;
 560                   swap = 1;
 561                 }
 562               /* We're re-entering the loop looking at the old
 563                  character again.  */
 564               next = last;
 565             }
 566           last = next;
 567         }
 568     }
 569 }
 570
 571 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
 572  * r should be null or have sufficient space. Calling with r == NULL will
 573  * only calculate the result_len; however, a buffer with space for three
 574  * characters will always be big enough. */
 575 static void
 576 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
 577 {
 578   gint SIndex = s - SBase;
 579
 580   /* not a hangul syllable */
 581   if (SIndex < 0 || SIndex >= SCount)
 582     {
 583       if (r)
 584         r[0] = s;
 585       *result_len = 1;
 586     }
 587   else
 588     {
 589       gunichar L = LBase + SIndex / NCount;
 590       gunichar V = VBase + (SIndex % NCount) / TCount;
 591       gunichar T = TBase + SIndex % TCount;
 592
 593       if (r)
 594         {
 595           r[0] = L;
 596           r[1] = V;
 597         }
 598
 599       if (T != TBase)
 600         {
 601           if (r)
 602             r[2] = T;
 603           *result_len = 3;
 604         }
 605       else
 606         *result_len = 2;
 607     }
 608 }
 609
 610 /* returns a pointer to a null-terminated UTF-8 string */
 611 static const gchar *
 612 find_decomposition (gunichar ch, gboolean compat)
 613 {
 614   int start = 0;
 615   int end = G_N_ELEMENTS (decomp_table);
 616
 617   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 618     {
 619       while (TRUE)
 620         {
 621           int half = (start + end) / 2;
 622           if (ch == decomp_table[half].ch)
 623             {
 624               int offset;
 625
 626               if (compat)
 627                 {
 628                   offset = decomp_table[half].compat_offset;
 629                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 630                     offset = decomp_table[half].canon_offset;
 631                 }
 632               else
 633                 {
 634                   offset = decomp_table[half].canon_offset;
 635                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 636                     return NULL;
 637                 }
 638
 639               return &(decomp_expansion_string[offset]);
 640             }
 641           else if (half == start)
 642             break;
 643           else if (ch > decomp_table[half].ch)
 644             start = half;
 645           else
 646             end = half;
 647         }
 648     }
 649
 650   return NULL;
 651 }
 652
 653 /* L,V => LV and LV,T => LVT  */
 654 static gboolean
 655 combine_hangul (gunichar a, gunichar b, gunichar * result)
 656 {
 657   gint LIndex = a - LBase;
 658   gint SIndex = a - SBase;
 659
 660   gint VIndex = b - VBase;
 661   gint TIndex = b - TBase;
 662
 663   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
 664     {
 665       *result = SBase + (LIndex * VCount + VIndex) * TCount;
 666       return TRUE;
 667     }
 668   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
 669            && 0 <= TIndex && TIndex <= TCount)
 670     {
 671       *result = a + TIndex;
 672       return TRUE;
 673     }
 674
 675   return FALSE;
 676 }
 677
 678 #define CI(Page, Char) \
 679   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 680    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 681    : (compose_data[compose_table[Page]][Char]))
 682
 683 #define COMPOSE_INDEX(Char) \
 684      ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 685
 686 static gboolean
 687 combine (gunichar a, gunichar b, gunichar * result)
 688 {
 689   gushort index_a, index_b;
 690
 691   if (combine_hangul (a, b, result))
 692     return TRUE;
 693
 694   index_a = COMPOSE_INDEX (a);
 695
 696   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 697     {
 698       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 699         {
 700           *result =
 701             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 702           return TRUE;
 703         }
 704       else
 705         return FALSE;
 706     }
 707
 708   index_b = COMPOSE_INDEX (b);
 709
 710   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 711     {
 712       if (a ==
 713           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 714         {
 715           *result =
 716             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 717           return TRUE;
 718         }
 719       else
 720         return FALSE;
 721     }
 722
 723   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 724       && index_b >= COMPOSE_SECOND_START
 725       && index_b < COMPOSE_SECOND_SINGLE_START)
 726     {
 727       gunichar res =
 728         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 729                                                      COMPOSE_SECOND_START];
 730
 731       if (res)
 732         {
 733           *result = res;
 734           return TRUE;
 735         }
 736     }
 737
 738   return FALSE;
 739 }
 740
 741 static gunichar *
 742 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
 743 {
 744   gsize n_wc;
 745   gunichar *wc_buffer;
 746   const char *p;
 747   gsize last_start;
 748   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
 749   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
 750
 751   n_wc = 0;
 752   p = str;
 753   while ((max_len < 0 || p < str + max_len) && *p)
 754     {
 755       const gchar *decomp;
 756       gunichar wc = g_utf8_get_char (p);
 757
 758       if (wc >= 0xac00 && wc <= 0xd7af)
 759         {
 760           gsize result_len;
 761           decompose_hangul (wc, NULL, &result_len);
 762           n_wc += result_len;
 763         }
 764       else
 765         {
 766           decomp = find_decomposition (wc, do_compat);
 767
 768           if (decomp)
 769             n_wc += g_utf8_strlen (decomp, -1);
 770           else
 771             n_wc++;
 772         }
 773
 774       p = g_utf8_next_char (p);
 775     }
 776
 777   wc_buffer = g_new (gunichar, n_wc + 1);
 778   if (!wc_buffer)
 779     return NULL;
 780
 781   last_start = 0;
 782   n_wc = 0;
 783   p = str;
 784   while ((max_len < 0 || p < str + max_len) && *p)
 785     {
 786       gunichar wc = g_utf8_get_char (p);
 787       const gchar *decomp;
 788       int cc;
 789       gsize old_n_wc = n_wc;
 790
 791       if (wc >= 0xac00 && wc <= 0xd7af)
 792         {
 793           gsize result_len;
 794           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
 795           n_wc += result_len;
 796         }
 797       else
 798         {
 799           decomp = find_decomposition (wc, do_compat);
 800
 801           if (decomp)
 802             {
 803               const char *pd;
 804               for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
 805                 wc_buffer[n_wc++] = g_utf8_get_char (pd);
 806             }
 807           else
 808             wc_buffer[n_wc++] = wc;
 809         }
 810
 811       if (n_wc > 0)
 812         {
 813           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 814
 815           if (cc == 0)
 816             {
 817               g_unicode_canonical_ordering (wc_buffer + last_start,
 818                                             n_wc - last_start);
 819               last_start = old_n_wc;
 820             }
 821         }
 822
 823       p = g_utf8_next_char (p);
 824     }
 825
 826   if (n_wc > 0)
 827     {
 828       g_unicode_canonical_ordering (wc_buffer + last_start,
 829                                     n_wc - last_start);
 830       last_start = n_wc;
 831     }
 832
 833   wc_buffer[n_wc] = 0;
 834
 835   /* All decomposed and reordered */
 836
 837   if (do_compose && n_wc > 0)
 838     {
 839       gsize i, j;
 840       int last_cc = 0;
 841       last_start = 0;
 842
 843       for (i = 0; i < n_wc; i++)
 844         {
 845           int cc = COMBINING_CLASS (wc_buffer[i]);
 846
 847           if (i > 0 &&
 848               (last_cc == 0 || last_cc != cc) &&
 849               combine (wc_buffer[last_start], wc_buffer[i],
 850                        &wc_buffer[last_start]))
 851             {
 852               for (j = i + 1; j < n_wc; j++)
 853                 wc_buffer[j - 1] = wc_buffer[j];
 854               n_wc--;
 855               i--;
 856
 857               if (i == last_start)
 858                 last_cc = 0;
 859               else
 860                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
 861
 862               continue;
 863             }
 864
 865           if (cc == 0)
 866             last_start = i;
 867
 868           last_cc = cc;
 869         }
 870     }
 871
 872   wc_buffer[n_wc] = 0;
 873
 874   return wc_buffer;
 875 }
 876
 877 /*
 878  * g_utf8_normalize:
 879  * @str: a UTF-8 encoded string.
 880  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 881  * @mode: the type of normalization to perform.
 882  *
 883  * Converts a string into canonical form, standardizing
 884  * such issues as whether a character with an accent
 885  * is represented as a base character and combining
 886  * accent or as a single precomposed character. You
 887  * should generally call g_utf8_normalize() before
 888  * comparing two Unicode strings.
 889  *
 890  * The normalization mode %G_NORMALIZE_DEFAULT only
 891  * standardizes differences that do not affect the
 892  * text content, such as the above-mentioned accent
 893  * representation. %G_NORMALIZE_ALL also standardizes
 894  * the "compatibility" characters in Unicode, such
 895  * as SUPERSCRIPT THREE to the standard forms
 896  * (in this case DIGIT THREE). Formatting information
 897  * may be lost but for most text operations such
 898  * characters should be considered the same.
 899  * For example, g_utf8_collate() normalizes
 900  * with %G_NORMALIZE_ALL as its first step.
 901  *
 902  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 903  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 904  * but returned a result with composed forms rather
 905  * than a maximally decomposed form. This is often
 906  * useful if you intend to convert the string to
 907  * a legacy encoding or pass it to a system with
 908  * less capable Unicode handling.
 909  *
 910  * Return value: a newly allocated string, that is the
 911  *   normalized form of @str.
 912  **/
 913 static gchar *
 914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
 915 {
 916   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 917   gchar *result;
 918
 919   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
 920   g_free (result_wc);
 921
 922   return result;
 923 }
 924
 925 /* Public Libidn API starts here. */
 926
 927 /**
 928  * stringprep_utf8_to_unichar:
 929  * @p: a pointer to Unicode character encoded as UTF-8
 930  *
 931  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 932  * If @p does not point to a valid UTF-8 encoded character, results are
 933  * undefined.
 934  *
 935  * Return value: the resulting character.
 936  **/
 937 uint32_t
 938 stringprep_utf8_to_unichar (const char *p)
 939 {
 940   return g_utf8_get_char (p);
 941 }
 942
 943 /**
 944  * stringprep_unichar_to_utf8:
 945  * @c: a ISO10646 character code
 946  * @outbuf: output buffer, must have at least 6 bytes of space.
 947  *       If %NULL, the length will be computed and returned
 948  *       and nothing will be written to @outbuf.
 949  *
 950  * Converts a single character to UTF-8.
 951  *
 952  * Return value: number of bytes written.
 953  **/
 954 int
 955 stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
 956 {
 957   return g_unichar_to_utf8 (c, outbuf);
 958 }
 959
 960 /**
 961  * stringprep_utf8_to_ucs4:
 962  * @str: a UTF-8 encoded string
 963  * @len: the maximum length of @str to use. If @len < 0, then
 964  *       the string is nul-terminated.
 965  * @items_written: location to store the number of characters in the
 966  *                 result, or %NULL.
 967  *
 968  * Convert a string from UTF-8 to a 32-bit fixed width
 969  * representation as UCS-4, assuming valid UTF-8 input.
 970  * This function does no error checking on the input.
 971  *
 972  * Return value: a pointer to a newly allocated UCS-4 string.
 973  *               This value must be freed with free().
 974  **/
 975 uint32_t *
 976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
 977 {
 978   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
 979 }
 980
 981 /**
 982  * stringprep_ucs4_to_utf8:
 983  * @str: a UCS-4 encoded string
 984  * @len: the maximum length of @str to use. If @len < 0, then
 985  *       the string is terminated with a 0 character.
 986  * @items_read: location to store number of characters read read, or %NULL.
 987  * @items_written: location to store number of bytes written or %NULL.
 988  *                 The value here stored does not include the trailing 0
 989  *                 byte.
 990  *
 991  * Convert a string from a 32-bit fixed width representation as UCS-4.
 992  * to UTF-8. The result will be terminated with a 0 byte.
 993  *
 994  * Return value: a pointer to a newly allocated UTF-8 string.
 995  *               This value must be freed with free(). If an
 996  *               error occurs, %NULL will be returned and
 997  *               @error set.
 998  **/
 999 char *
1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001                          size_t * items_read, size_t * items_written)
1002 {
1003   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004                          (glong *) items_written, NULL);
1005 }
1006
1007 /**
1008  * stringprep_utf8_nfkc_normalize:
1009  * @str: a UTF-8 encoded string.
1010  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1011  *
1012  * Converts a string into canonical form, standardizing
1013  * such issues as whether a character with an accent
1014  * is represented as a base character and combining
1015  * accent or as a single precomposed character.
1016  *
1017  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1018  * differences that do not affect the text content, such as the
1019  * above-mentioned accent representation. It standardizes the
1020  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021  * the standard forms (in this case DIGIT THREE). Formatting
1022  * information may be lost but for most text operations such
1023  * characters should be considered the same. It returns a result with
1024  * composed forms rather than a maximally decomposed form.
1025  *
1026  * Return value: a newly allocated string, that is the
1027  *   NFKC normalized form of @str.
1028  **/
1029 char *
1030 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1031 {
1032   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1033 }
1034
1035 /**
1036  * stringprep_ucs4_nfkc_normalize:
1037  * @str: a Unicode string.
1038  * @len: length of @str array, or -1 if @str is nul-terminated.
1039  *
1040  * Converts UCS4 string into UTF-8 and runs
1041  * stringprep_utf8_nfkc_normalize().
1042  *
1043  * Return value: a newly allocated Unicode string, that is the NFKC
1044  *   normalized form of @str.
1045  **/
1046 uint32_t *
1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048 {
1049   char *p;
1050   uint32_t *result_wc;
1051
1052   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054   free (p);
1055
1056   return result_wc;
1057 }