lib/nfkc.c

   1 /* nfkc.c       Unicode normalization utilities.
   2  * Copyright (C) 2002, 2003  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #if HAVE_CONFIG_H
  23 # include "config.h"
  24 #endif
  25
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #include "stringprep.h"
  30
  31 /* This file contains functions from GLIB, including gutf8.c and
  32  * gunidecomp.c, all licensed under LGPL and copyright held by:
  33  *
  34  *  Copyright (C) 1999, 2000 Tom Tromey
  35  *  Copyright 2000 Red Hat, Inc.
  36  */
  37
/* Hacks to make syncing with GLIB code easier.
 *
 * The code copied from GLIB below is kept as close to the upstream text as
 * possible; the macros in this section map the GLIB names it uses onto
 * plain C types and C library functions. */

/* GLIB basic types, mapped onto the corresponding C types. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* GLIB allocator, mapped onto malloc/free.  Callers must therefore check
 * the result for NULL themselves. */
#define g_malloc malloc
#define g_free free

/* GLIB error reporting is compiled out: g_set_error() expands to the
 * constant 0 and discards all four arguments, so no error object is ever
 * created and the message argument is never evaluated. */
#define GError void
#define g_set_error(a,b,c,d) 0

/* Allocate room for n_structs objects of struct_type with g_malloc().  The
 * size multiplication is not checked for overflow. */
#define g_new(struct_type, n_structs)                                   \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* G_STMT_START / G_STMT_END bracket a braced block so that it can be used
 * as a single statement; three spellings are selected by compiler. */
#  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#    define G_STMT_START        (void)(
#    define G_STMT_END          )
#  else
#    if (defined (sun) || defined (__sun__))
#      define G_STMT_START      if (1)
#      define G_STMT_END        else (void)0
#    else
#      define G_STMT_START      do
#      define G_STMT_END        while (0)
#    endif
#  endif

/* Precondition checks are compiled out: the expansion is an empty
 * statement, so expr is never evaluated and val is never returned. */
#define g_return_val_if_fail(expr,val)          G_STMT_START{ (void)0; }G_STMT_END

/* Number of elements of a true array (not of a pointer). */
#define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))

/* Values used with gboolean. */
#define TRUE 1
#define FALSE 0
  73
  74 /* Code from GLIB gunicode.h starts here. */
  75
  76 typedef enum
  77 {
  78   G_NORMALIZE_DEFAULT,
  79   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  80   G_NORMALIZE_DEFAULT_COMPOSE,
  81   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  82   G_NORMALIZE_ALL,
  83   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  84   G_NORMALIZE_ALL_COMPOSE,
  85   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  86 }
  87 GNormalizeMode;
  88
  89 /* Code from GLIB gutf8.c starts here. */
  90
  91 #define UTF8_COMPUTE(Char, Mask, Len)           \
  92   if (Char < 128)                               \
  93     {                                           \
  94       Len = 1;                                  \
  95       Mask = 0x7f;                              \
  96     }                                           \
  97   else if ((Char & 0xe0) == 0xc0)               \
  98     {                                           \
  99       Len = 2;                                  \
 100       Mask = 0x1f;                              \
 101     }                                           \
 102   else if ((Char & 0xf0) == 0xe0)               \
 103     {                                           \
 104       Len = 3;                                  \
 105       Mask = 0x0f;                              \
 106     }                                           \
 107   else if ((Char & 0xf8) == 0xf0)               \
 108     {                                           \
 109       Len = 4;                                  \
 110       Mask = 0x07;                              \
 111     }                                           \
 112   else if ((Char & 0xfc) == 0xf8)               \
 113     {                                           \
 114       Len = 5;                                  \
 115       Mask = 0x03;                              \
 116     }                                           \
 117   else if ((Char & 0xfe) == 0xfc)               \
 118     {                                           \
 119       Len = 6;                                  \
 120       Mask = 0x01;                              \
 121     }                                           \
 122   else                                          \
 123     Len = -1;
 124
 125 #define UTF8_LENGTH(Char)                       \
 126   ((Char) < 0x80 ? 1 :                          \
 127    ((Char) < 0x800 ? 2 :                        \
 128     ((Char) < 0x10000 ? 3 :                     \
 129      ((Char) < 0x200000 ? 4 :                   \
 130       ((Char) < 0x4000000 ? 5 : 6)))))
 131
 132
 133 #define UTF8_GET(Result, Chars, Count, Mask, Len)       \
 134   (Result) = (Chars)[0] & (Mask);                       \
 135   for ((Count) = 1; (Count) < (Len); ++(Count))         \
 136     {                                                   \
 137       if (((Chars)[(Count)] & 0xc0) != 0x80)            \
 138         {                                               \
 139           (Result) = -1;                                \
 140           break;                                        \
 141         }                                               \
 142       (Result) <<= 6;                                   \
 143       (Result) |= ((Chars)[(Count)] & 0x3f);            \
 144     }
 145
 146 #define UNICODE_VALID(Char)                     \
 147   ((Char) < 0x110000 &&                         \
 148    (((Char) & 0xFFFFF800) != 0xD800) &&         \
 149    ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&      \
 150    ((Char) & 0xFFFE) != 0xFFFE)
 151
 152
 153 static const gchar utf8_skip_data[256] = {
 154   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 155   1, 1, 1, 1, 1, 1, 1,
 156   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 157   1, 1, 1, 1, 1, 1, 1,
 158   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 159   1, 1, 1, 1, 1, 1, 1,
 160   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 161   1, 1, 1, 1, 1, 1, 1,
 162   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 163   1, 1, 1, 1, 1, 1, 1,
 164   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 165   1, 1, 1, 1, 1, 1, 1,
 166   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 167   2, 2, 2, 2, 2, 2, 2,
 168   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 169   5, 5, 5, 6, 6, 1, 1
 170 };
 171
 172 const gchar *const g_utf8_skip = utf8_skip_data;
 173
 174 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
 175
 176 /*
 177  * g_utf8_strlen:
 178  * @p: pointer to the start of a UTF-8 encoded string.
 179  * @max: the maximum number of bytes to examine. If @max
 180  *       is less than 0, then the string is assumed to be
 181  *       nul-terminated. If @max is 0, @p will not be examined and
 182  *       may be %NULL.
 183  *
 184  * Returns the length of the string in characters.
 185  *
 186  * Return value: the length of the string in characters
 187  **/
 188 static glong
 189 g_utf8_strlen (const gchar * p, gssize max)
 190 {
 191   glong len = 0;
 192   const gchar *start = p;
 193   g_return_val_if_fail (p != NULL || max == 0, 0);
 194
 195   if (max < 0)
 196     {
 197       while (*p)
 198         {
 199           p = g_utf8_next_char (p);
 200           ++len;
 201         }
 202     }
 203   else
 204     {
 205       if (max == 0 || !*p)
 206         return 0;
 207
 208       p = g_utf8_next_char (p);
 209
 210       while (p - start < max && *p)
 211         {
 212           ++len;
 213           p = g_utf8_next_char (p);
 214         }
 215
 216       /* only do the last len increment if we got a complete
 217        * char (don't count partial chars)
 218        */
 219       if (p - start == max)
 220         ++len;
 221     }
 222
 223   return len;
 224 }
 225
 226 /*
 227  * g_utf8_get_char:
 228  * @p: a pointer to Unicode character encoded as UTF-8
 229  *
 230  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 231  * If @p does not point to a valid UTF-8 encoded character, results are
 232  * undefined. If you are not sure that the bytes are complete
 233  * valid Unicode characters, you should use g_utf8_get_char_validated()
 234  * instead.
 235  *
 236  * Return value: the resulting character
 237  **/
 238 static gunichar
 239 g_utf8_get_char (const gchar * p)
 240 {
 241   int i, mask = 0, len;
 242   gunichar result;
 243   unsigned char c = (unsigned char) *p;
 244
 245   UTF8_COMPUTE (c, mask, len);
 246   if (len == -1)
 247     return (gunichar) - 1;
 248   UTF8_GET (result, p, i, mask, len);
 249
 250   return result;
 251 }
 252
 253 /*
 254  * g_unichar_to_utf8:
 255  * @c: a ISO10646 character code
 256  * @outbuf: output buffer, must have at least 6 bytes of space.
 257  *       If %NULL, the length will be computed and returned
 258  *       and nothing will be written to @outbuf.
 259  *
 260  * Converts a single character to UTF-8.
 261  *
 262  * Return value: number of bytes written
 263  **/
 264 static int
 265 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
 266 {
 267   guint len = 0;
 268   int first;
 269   int i;
 270
 271   if (c < 0x80)
 272     {
 273       first = 0;
 274       len = 1;
 275     }
 276   else if (c < 0x800)
 277     {
 278       first = 0xc0;
 279       len = 2;
 280     }
 281   else if (c < 0x10000)
 282     {
 283       first = 0xe0;
 284       len = 3;
 285     }
 286   else if (c < 0x200000)
 287     {
 288       first = 0xf0;
 289       len = 4;
 290     }
 291   else if (c < 0x4000000)
 292     {
 293       first = 0xf8;
 294       len = 5;
 295     }
 296   else
 297     {
 298       first = 0xfc;
 299       len = 6;
 300     }
 301
 302   if (outbuf)
 303     {
 304       for (i = len - 1; i > 0; --i)
 305         {
 306           outbuf[i] = (c & 0x3f) | 0x80;
 307           c >>= 6;
 308         }
 309       outbuf[0] = c | first;
 310     }
 311
 312   return len;
 313 }
 314
 315 /*
 316  * g_utf8_to_ucs4_fast:
 317  * @str: a UTF-8 encoded string
 318  * @len: the maximum length of @str to use. If @len < 0, then
 319  *       the string is nul-terminated.
 320  * @items_written: location to store the number of characters in the
 321  *                 result, or %NULL.
 322  *
 323  * Convert a string from UTF-8 to a 32-bit fixed width
 324  * representation as UCS-4, assuming valid UTF-8 input.
 325  * This function is roughly twice as fast as g_utf8_to_ucs4()
 326  * but does no error checking on the input.
 327  *
 328  * Return value: a pointer to a newly allocated UCS-4 string.
 329  *               This value must be freed with g_free().
 330  **/
 331 static gunichar *
 332 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
 333 {
 334   gint j, charlen;
 335   gunichar *result;
 336   gint n_chars, i;
 337   const gchar *p;
 338
 339   g_return_val_if_fail (str != NULL, NULL);
 340
 341   p = str;
 342   n_chars = 0;
 343   if (len < 0)
 344     {
 345       while (*p)
 346         {
 347           p = g_utf8_next_char (p);
 348           ++n_chars;
 349         }
 350     }
 351   else
 352     {
 353       while (p < str + len && *p)
 354         {
 355           p = g_utf8_next_char (p);
 356           ++n_chars;
 357         }
 358     }
 359
 360   result = g_new (gunichar, n_chars + 1);
 361   if (!result)
 362     return NULL;
 363
 364   p = str;
 365   for (i = 0; i < n_chars; i++)
 366     {
 367       gunichar wc = ((unsigned char *) p)[0];
 368
 369       if (wc < 0x80)
 370         {
 371           result[i] = wc;
 372           p++;
 373         }
 374       else
 375         {
 376           if (wc < 0xe0)
 377             {
 378               charlen = 2;
 379               wc &= 0x1f;
 380             }
 381           else if (wc < 0xf0)
 382             {
 383               charlen = 3;
 384               wc &= 0x0f;
 385             }
 386           else if (wc < 0xf8)
 387             {
 388               charlen = 4;
 389               wc &= 0x07;
 390             }
 391           else if (wc < 0xfc)
 392             {
 393               charlen = 5;
 394               wc &= 0x03;
 395             }
 396           else
 397             {
 398               charlen = 6;
 399               wc &= 0x01;
 400             }
 401
 402           for (j = 1; j < charlen; j++)
 403             {
 404               wc <<= 6;
 405               wc |= ((unsigned char *) p)[j] & 0x3f;
 406             }
 407
 408           result[i] = wc;
 409           p += charlen;
 410         }
 411     }
 412   result[i] = 0;
 413
 414   if (items_written)
 415     *items_written = i;
 416
 417   return result;
 418 }
 419
 420 /*
 421  * g_ucs4_to_utf8:
 422  * @str: a UCS-4 encoded string
 423  * @len: the maximum length of @str to use. If @len < 0, then
 424  *       the string is terminated with a 0 character.
 425  * @items_read: location to store number of characters read read, or %NULL.
 426  * @items_written: location to store number of bytes written or %NULL.
 427  *                 The value here stored does not include the trailing 0
 428  *                 byte.
 429  * @error: location to store the error occuring, or %NULL to ignore
 430  *         errors. Any of the errors in #GConvertError other than
 431  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 432  *
 433  * Convert a string from a 32-bit fixed width representation as UCS-4.
 434  * to UTF-8. The result will be terminated with a 0 byte.
 435  *
 436  * Return value: a pointer to a newly allocated UTF-8 string.
 437  *               This value must be freed with g_free(). If an
 438  *               error occurs, %NULL will be returned and
 439  *               @error set.
 440  **/
 441 static gchar *
 442 g_ucs4_to_utf8 (const gunichar * str,
 443                 glong len,
 444                 glong * items_read, glong * items_written, GError ** error)
 445 {
 446   gint result_length;
 447   gchar *result = NULL;
 448   gchar *p;
 449   gint i;
 450
 451   result_length = 0;
 452   for (i = 0; len < 0 || i < len; i++)
 453     {
 454       if (!str[i])
 455         break;
 456
 457       if (str[i] >= 0x80000000)
 458         {
 459           if (items_read)
 460             *items_read = i;
 461
 462           g_set_error (error, G_CONVERT_ERROR,
 463                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 464                        _("Character out of range for UTF-8"));
 465           goto err_out;
 466         }
 467
 468       result_length += UTF8_LENGTH (str[i]);
 469     }
 470
 471   result = g_malloc (result_length + 1);
 472   if (!result)
 473     return NULL;
 474   p = result;
 475
 476   i = 0;
 477   while (p < result + result_length)
 478     p += g_unichar_to_utf8 (str[i++], p);
 479
 480   *p = '\0';
 481
 482   if (items_written)
 483     *items_written = p - result;
 484
 485 err_out:
 486   if (items_read)
 487     *items_read = i;
 488
 489   return result;
 490 }
 491
 492 /* Code from GLIB gunidecomp.c starts here. */
 493
 494 #include "gunidecomp.h"
 495 #include "gunicomp.h"
 496
/* Combining class lookup for the first table.  combining_class_table_part1
 * holds one entry per 256-character page.  An entry that is
 * >= G_UNICODE_MAX_TABLE_INDEX encodes a single class shared by the whole
 * page (the entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects
 * a row of cclass_data, which is then indexed by Char (the low 8 bits of
 * the character). */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup scheme for the second table, whose pages are counted from
 * character 0xe0000 (see COMBINING_CLASS). */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of Char: characters up to G_UNICODE_LAST_CHAR_PART1 use
 * the first table, characters in 0xe0000..G_UNICODE_LAST_CHAR use the
 * second one, and everything else has class 0.  Char is evaluated several
 * times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
 513
/* constants for hangul syllable [de]composition
 *
 * A syllable S is built from a leading consonant L, a vowel V and an
 * optional trailing consonant T; decompose_hangul() and combine_hangul()
 * convert between the two representations arithmetically:
 *
 *   S = SBase + ((L - LBase) * VCount + (V - VBase)) * TCount + (T - TBase)
 */
#define SBase 0xAC00            /* first syllable */
#define LBase 0x1100            /* first leading consonant */
#define VBase 0x1161            /* first vowel */
#define TBase 0x11A7            /* one below the first trailing consonant:
                                   index 0 means "no trailing consonant" */
#define LCount 19               /* number of leading consonants */
#define VCount 21               /* number of vowels */
#define TCount 28               /* trailing consonant indices, incl. "none" */
#define NCount (VCount * TCount)        /* syllables per leading consonant */
#define SCount (LCount * NCount)        /* total number of syllables */
 524
 525 /*
 526  * g_unicode_canonical_ordering:
 527  * @string: a UCS-4 encoded string.
 528  * @len: the maximum length of @string to use.
 529  *
 530  * Computes the canonical ordering of a string in-place.
 531  * This rearranges decomposed characters in the string
 532  * according to their combining classes.  See the Unicode
 533  * manual for more information.
 534  **/
 535 static void
 536 g_unicode_canonical_ordering (gunichar * string, gsize len)
 537 {
 538   gsize i;
 539   int swap = 1;
 540
 541   while (swap)
 542     {
 543       int last;
 544       swap = 0;
 545       last = COMBINING_CLASS (string[0]);
 546       for (i = 0; i < len - 1; ++i)
 547         {
 548           int next = COMBINING_CLASS (string[i + 1]);
 549           if (next != 0 && last > next)
 550             {
 551               gsize j;
 552               /* Percolate item leftward through string.  */
 553               for (j = i + 1; j > 0; --j)
 554                 {
 555                   gunichar t;
 556                   if (COMBINING_CLASS (string[j - 1]) <= next)
 557                     break;
 558                   t = string[j];
 559                   string[j] = string[j - 1];
 560                   string[j - 1] = t;
 561                   swap = 1;
 562                 }
 563               /* We're re-entering the loop looking at the old
 564                  character again.  */
 565               next = last;
 566             }
 567           last = next;
 568         }
 569     }
 570 }
 571
 572 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
 573  * r should be null or have sufficient space. Calling with r == NULL will
 574  * only calculate the result_len; however, a buffer with space for three
 575  * characters will always be big enough. */
 576 static void
 577 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
 578 {
 579   gint SIndex = s - SBase;
 580
 581   /* not a hangul syllable */
 582   if (SIndex < 0 || SIndex >= SCount)
 583     {
 584       if (r)
 585         r[0] = s;
 586       *result_len = 1;
 587     }
 588   else
 589     {
 590       gunichar L = LBase + SIndex / NCount;
 591       gunichar V = VBase + (SIndex % NCount) / TCount;
 592       gunichar T = TBase + SIndex % TCount;
 593
 594       if (r)
 595         {
 596           r[0] = L;
 597           r[1] = V;
 598         }
 599
 600       if (T != TBase)
 601         {
 602           if (r)
 603             r[2] = T;
 604           *result_len = 3;
 605         }
 606       else
 607         *result_len = 2;
 608     }
 609 }
 610
 611 /* returns a pointer to a null-terminated UTF-8 string */
 612 static const gchar *
 613 find_decomposition (gunichar ch, gboolean compat)
 614 {
 615   int start = 0;
 616   int end = G_N_ELEMENTS (decomp_table);
 617
 618   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 619     {
 620       while (TRUE)
 621         {
 622           int half = (start + end) / 2;
 623           if (ch == decomp_table[half].ch)
 624             {
 625               int offset;
 626
 627               if (compat)
 628                 {
 629                   offset = decomp_table[half].compat_offset;
 630                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 631                     offset = decomp_table[half].canon_offset;
 632                 }
 633               else
 634                 {
 635                   offset = decomp_table[half].canon_offset;
 636                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 637                     return NULL;
 638                 }
 639
 640               return &(decomp_expansion_string[offset]);
 641             }
 642           else if (half == start)
 643             break;
 644           else if (ch > decomp_table[half].ch)
 645             start = half;
 646           else
 647             end = half;
 648         }
 649     }
 650
 651   return NULL;
 652 }
 653
 654 /* L,V => LV and LV,T => LVT  */
 655 static gboolean
 656 combine_hangul (gunichar a, gunichar b, gunichar * result)
 657 {
 658   gint LIndex = a - LBase;
 659   gint SIndex = a - SBase;
 660
 661   gint VIndex = b - VBase;
 662   gint TIndex = b - TBase;
 663
 664   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
 665     {
 666       *result = SBase + (LIndex * VCount + VIndex) * TCount;
 667       return TRUE;
 668     }
 669   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
 670            && 0 <= TIndex && TIndex <= TCount)
 671     {
 672       *result = a + TIndex;
 673       return TRUE;
 674     }
 675
 676   return FALSE;
 677 }
 678
/* Composition index lookup.  compose_table holds one entry per
 * 256-character page.  An entry that is >= G_UNICODE_MAX_TABLE_INDEX
 * encodes a single index shared by the whole page (the entry minus
 * G_UNICODE_MAX_TABLE_INDEX); any other entry selects a row of
 * compose_data, which is then indexed by Char (the low 8 bits of the
 * character). */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 if its page lies beyond
 * COMPOSE_TABLE_LAST.  Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 686
 687 static gboolean
 688 combine (gunichar a, gunichar b, gunichar * result)
 689 {
 690   gushort index_a, index_b;
 691
 692   if (combine_hangul (a, b, result))
 693     return TRUE;
 694
 695   index_a = COMPOSE_INDEX (a);
 696
 697   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 698     {
 699       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 700         {
 701           *result =
 702             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 703           return TRUE;
 704         }
 705       else
 706         return FALSE;
 707     }
 708
 709   index_b = COMPOSE_INDEX (b);
 710
 711   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 712     {
 713       if (a ==
 714           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 715         {
 716           *result =
 717             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 718           return TRUE;
 719         }
 720       else
 721         return FALSE;
 722     }
 723
 724   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 725       && index_b >= COMPOSE_SECOND_START
 726       && index_b < COMPOSE_SECOND_SINGLE_START)
 727     {
 728       gunichar res =
 729         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 730                                                      COMPOSE_SECOND_START];
 731
 732       if (res)
 733         {
 734           *result = res;
 735           return TRUE;
 736         }
 737     }
 738
 739   return FALSE;
 740 }
 741
/* Normalize the UTF-8 string str and return the result as a newly
 * allocated, 0-terminated UCS-4 string (to be released with g_free()).
 *
 * At most max_len bytes of str are used; if max_len is negative, str is
 * read up to its terminating nul byte.  mode selects whether
 * compatibility decompositions are applied (NFKC, NFKD) and whether the
 * decomposed string is composed again (NFC, NFKC).
 *
 * The work is done in three steps: decompose every character, put the
 * combining characters into canonical order, and optionally compose.
 *
 * Returns NULL if the result buffer cannot be allocated. */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);

  /* First pass: count the characters of the decomposed string so that
   * the buffer can be allocated in one go.  */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      gunichar wc = g_utf8_get_char (p);

      /* Hangul syllables are decomposed arithmetically; for a value in
       * this range that is not a syllable, decompose_hangul() reports a
       * length of 1.  */
      if (wc >= 0xac00 && wc <= 0xd7af)
        {
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }

      p = g_utf8_next_char (p);
    }

  /* One extra element for the terminating 0.  */
  wc_buffer = g_new (gunichar, n_wc + 1);
  if (!wc_buffer)
    return NULL;

  /* Second pass: store the decomposed characters.  last_start is the
   * index at which the stretch that still awaits reordering begins.  */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;

      if (wc >= 0xac00 && wc <= 0xd7af)
        {
          gsize result_len;
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }

      if (n_wc > 0)
        {
          /* Combining class of the first character just appended.  */
          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

          /* A character of class 0 ends the pending stretch: reorder
           * what has been collected since last_start and start a new
           * stretch at the character just appended.  */
          if (cc == 0)
            {
              g_unicode_canonical_ordering (wc_buffer + last_start,
                                            n_wc - last_start);
              last_start = old_n_wc;
            }
        }

      p = g_utf8_next_char (p);
    }

  /* Reorder the final stretch.  */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start,
                                    n_wc - last_start);
      last_start = n_wc;
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      /* From here on last_start is the index of the character that
       * following characters are combined into.  */
      last_start = 0;

      for (i = 0; i < n_wc; i++)
        {
          int cc = COMBINING_CLASS (wc_buffer[i]);

          /* Try to combine only if the previous character has class 0
           * or a class different from this one; on success the
           * composed character replaces wc_buffer[last_start].  */
          if (i > 0 &&
              (last_cc == 0 || last_cc != cc) &&
              combine (wc_buffer[last_start], wc_buffer[i],
                       &wc_buffer[last_start]))
            {
              /* Remove wc_buffer[i] by shifting the tail left.  */
              for (j = i + 1; j < n_wc; j++)
                wc_buffer[j - 1] = wc_buffer[j];
              n_wc--;
              /* Step back so that the loop increment revisits the
               * same index, which now holds the next character.  */
              i--;

              if (i == last_start)
                last_cc = 0;
              else
                last_cc = COMBINING_CLASS (wc_buffer[i - 1]);

              continue;
            }

          if (cc == 0)
            last_start = i;

          last_cc = cc;
        }
    }

  /* Terminate again: composition may have shortened the string.  */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
 877
 878 /*
 879  * g_utf8_normalize:
 880  * @str: a UTF-8 encoded string.
 881  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 882  * @mode: the type of normalization to perform.
 883  *
 884  * Converts a string into canonical form, standardizing
 885  * such issues as whether a character with an accent
 886  * is represented as a base character and combining
 887  * accent or as a single precomposed character. You
 888  * should generally call g_utf8_normalize() before
 889  * comparing two Unicode strings.
 890  *
 891  * The normalization mode %G_NORMALIZE_DEFAULT only
 892  * standardizes differences that do not affect the
 893  * text content, such as the above-mentioned accent
 894  * representation. %G_NORMALIZE_ALL also standardizes
 895  * the "compatibility" characters in Unicode, such
 896  * as SUPERSCRIPT THREE to the standard forms
 897  * (in this case DIGIT THREE). Formatting information
 898  * may be lost but for most text operations such
 899  * characters should be considered the same.
 900  * For example, g_utf8_collate() normalizes
 901  * with %G_NORMALIZE_ALL as its first step.
 902  *
 903  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 904  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 905  * but returned a result with composed forms rather
 906  * than a maximally decomposed form. This is often
 907  * useful if you intend to convert the string to
 908  * a legacy encoding or pass it to a system with
 909  * less capable Unicode handling.
 910  *
 911  * Return value: a newly allocated string, that is the
 912  *   normalized form of @str.
 913  **/
 914 static gchar *
 915 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
 916 {
 917   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 918   gchar *result;
 919
 920   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
 921   g_free (result_wc);
 922
 923   return result;
 924 }
 925
 926 /* Public Libidn API starts here. */
 927
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * The length of the sequence is taken from its first byte, so @p must
 * point to a complete character.  The bytes following the first are only
 * checked for the form of a UTF-8 continuation byte; no further
 * validation is performed.
 *
 * Return value: the resulting character, or (uint32_t) -1 if the first
 * byte cannot start a UTF-8 sequence or one of the following bytes is
 * not a continuation byte.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
 945
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  The output is between one and
 * six bytes long and is not nul-terminated.
 *
 * Return value: number of bytes written, or the number of bytes that
 * would be written if @outbuf is %NULL
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}
 962
 963 /**
 964  * stringprep_utf8_to_ucs4:
 965  * @str: a UTF-8 encoded string
 966  * @len: the maximum length of @str to use. If @len < 0, then
 967  *       the string is nul-terminated.
 968  * @items_written: location to store the number of characters in the
 969  *                 result, or %NULL.
 970  *
 971  * Convert a string from UTF-8 to a 32-bit fixed width
 972  * representation as UCS-4, assuming valid UTF-8 input.
 973  * This function does no error checking on the input.
 974  *
 975  * Return value: a pointer to a newly allocated UCS-4 string.
 976  *               This value must be freed with free().
 977  **/
 978 uint32_t *
 979 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
 980 {
 981   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
 982 }
 983
 984 /**
 985  * stringprep_ucs4_to_utf8:
 986  * @str: a UCS-4 encoded string
 987  * @len: the maximum length of @str to use. If @len < 0, then
 988  *       the string is terminated with a 0 character.
 989  * @items_read: location to store number of characters read read, or %NULL.
 990  * @items_written: location to store number of bytes written or %NULL.
 991  *                 The value here stored does not include the trailing 0
 992  *                 byte.
 993  *
 994  * Convert a string from a 32-bit fixed width representation as UCS-4.
 995  * to UTF-8. The result will be terminated with a 0 byte.
 996  *
 997  * Return value: a pointer to a newly allocated UTF-8 string.
 998  *               This value must be freed with free(). If an
 999  *               error occurs, %NULL will be returned and
1000  *               @error set.
1001  **/
1002 char *
1003 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1004                          size_t * items_read, size_t * items_written)
1005 {
1006   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1007                          (glong *) items_written, NULL);
1008 }
1009
/**
 * stringprep_utf8_nfkc_normalize:
 * @str: a UTF-8 encoded string.
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Converts a string into canonical form, standardizing
 * such issues as whether a character with an accent
 * is represented as a base character and combining
 * accent or as a single precomposed character.
 *
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
 * differences that do not affect the text content, such as the
 * above-mentioned accent representation. It standardizes the
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
 * the standard forms (in this case DIGIT THREE). Formatting
 * information may be lost but for most text operations such
 * characters should be considered the same. It returns a result with
 * composed forms rather than a maximally decomposed form.
 *
 * Return value: a newly allocated, nul-terminated string, that is the
 *   NFKC normalized form of @str.  It must be freed with free().
 **/
char *
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
{
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
}
1037
1038 /**
1039  * stringprep_ucs4_nfkc_normalize:
1040  * @str: a Unicode string.
1041  * @len: length of @str array, or -1 if @str is nul-terminated.
1042  *
1043  * Converts UCS4 string into UTF-8 and runs
1044  * stringprep_utf8_nfkc_normalize().
1045  *
1046  * Return value: a newly allocated Unicode string, that is the NFKC
1047  *   normalized form of @str.
1048  **/
1049 uint32_t *
1050 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1051 {
1052   char *p;
1053   uint32_t *result_wc;
1054
1055   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1056   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1057   free (p);
1058
1059   return result_wc;
1060 }