2.3.3-93
[glibc.git] / libidn / nfkc.c
bloba89f49ed7948da4df13ca0ae25c6b8ff76975505
/* nfkc.c	Unicode normalization utilities.
 * Copyright (C) 2002, 2003  Simon Josefsson
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
22 #if HAVE_CONFIG_H
23 # include "config.h"
24 #endif
26 #include <stdlib.h>
27 #include <string.h>
29 #include "stringprep.h"
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */
/* Hacks to make syncing with GLIB code easier.  The code imported
   below keeps its GLIB spelling; these macros map that spelling onto
   plain C and the C library.  */

/* GLIB scalar type names.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define glong long
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* Boolean constants.  */
#define TRUE 1
#define FALSE 0

/* Allocation goes straight to malloc/free; g_new returns NULL on
   failure, exactly like malloc.  */
#define g_malloc malloc
#define g_free free
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* GError reporting is compiled out: g_set_error discards all of its
   arguments without evaluating them.  */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)

/* Wrappers that turn a brace block into a single statement.  */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
# define G_STMT_START (void)(
# define G_STMT_END )
#elif defined (sun) || defined (__sun__)
# define G_STMT_START if (1)
# define G_STMT_END else (void)0
#else
# define G_STMT_START do
# define G_STMT_END while (0)
#endif

/* Precondition checks are disabled: neither EXPR nor VAL is
   evaluated and nothing is returned.  */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END

/* Number of elements in a true array (not a pointer).  */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
/* Code from GLIB gunicode.h starts here. */

/* Normalization modes.  Every form has two names: the GLIB name and
   the Unicode "NF*" alias, which share one value.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
/* Code from GLIB gutf8.c starts here. */

/* Classify CHAR, the first byte of a UTF-8 sequence.  On return LEN
   holds the sequence length in bytes (1 to 6) and MASK the bits of
   the first byte that carry payload.  For a byte that cannot start a
   sequence LEN is set to -1 and MASK is left untouched.

   CHAR is evaluated several times, so it must be free of side
   effects.  The body is wrapped in do/while (0) so that the macro
   behaves as one statement, e.g. as the body of an if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
/* Number of bytes (1 to 6) needed to encode the code point CHAR in
   UTF-8.  CHAR is evaluated more than once.  */
#define UTF8_LENGTH(Char) \
  ((Char) >= 0x4000000 ? 6 : \
   ((Char) >= 0x200000 ? 5 : \
    ((Char) >= 0x10000 ? 4 : \
     ((Char) >= 0x800 ? 3 : \
      ((Char) >= 0x80 ? 2 : 1)))))
/* Decode the LEN-byte UTF-8 sequence starting at CHARS into RESULT.
   MASK selects the payload bits of the first byte; COUNT is a scratch
   loop variable.  If one of the trailing bytes is not a continuation
   byte (10xxxxxx), RESULT is set to -1 and decoding stops there.

   The body is wrapped in do/while (0) so that the assignment and the
   loop form a single statement wherever the macro is used.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
/* Non-zero unless CHAR is at or above 0x110000, has 0xD800 as its
   value with the low eleven bits cleared, lies in 0xFDD0..0xFDEF, or
   has 0xFFFE/0xFFFF as its low sixteen bits.  CHAR is evaluated more
   than once.  */
#define UNICODE_VALID(Char) \
  (!((Char) >= 0x110000 || \
     (((Char) & 0xFFFFF800) == 0xD800) || \
     ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) || \
     ((Char) & 0xFFFE) == 0xFFFE))
153 static const gchar utf8_skip_data[256] = {
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1,
162 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 1, 1, 1, 1, 1, 1, 1,
164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 1, 1, 1, 1, 1, 1, 1,
166 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
167 2, 2, 2, 2, 2, 2, 2,
168 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
169 5, 5, 5, 6, 6, 1, 1
172 const gchar *const g_utf8_skip = utf8_skip_data;
174 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
177 * g_utf8_strlen:
178 * @p: pointer to the start of a UTF-8 encoded string.
179 * @max: the maximum number of bytes to examine. If @max
180 * is less than 0, then the string is assumed to be
181 * nul-terminated. If @max is 0, @p will not be examined and
182 * may be %NULL.
184 * Returns the length of the string in characters.
186 * Return value: the length of the string in characters
188 static glong
189 g_utf8_strlen (const gchar * p, gssize max)
191 glong len = 0;
192 const gchar *start = p;
193 g_return_val_if_fail (p != NULL || max == 0, 0);
195 if (max < 0)
197 while (*p)
199 p = g_utf8_next_char (p);
200 ++len;
203 else
205 if (max == 0 || !*p)
206 return 0;
208 p = g_utf8_next_char (p);
210 while (p - start < max && *p)
212 ++len;
213 p = g_utf8_next_char (p);
216 /* only do the last len increment if we got a complete
217 * char (don't count partial chars)
219 if (p - start == max)
220 ++len;
223 return len;
227 * g_utf8_get_char:
228 * @p: a pointer to Unicode character encoded as UTF-8
230 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
231 * If @p does not point to a valid UTF-8 encoded character, results are
232 * undefined. If you are not sure that the bytes are complete
233 * valid Unicode characters, you should use g_utf8_get_char_validated()
234 * instead.
236 * Return value: the resulting character
238 static gunichar
239 g_utf8_get_char (const gchar * p)
241 int i, mask = 0, len;
242 gunichar result;
243 unsigned char c = (unsigned char) *p;
245 UTF8_COMPUTE (c, mask, len);
246 if (len == -1)
247 return (gunichar) - 1;
248 UTF8_GET (result, p, i, mask, len);
250 return result;
254 * g_unichar_to_utf8:
255 * @c: a ISO10646 character code
256 * @outbuf: output buffer, must have at least 6 bytes of space.
257 * If %NULL, the length will be computed and returned
258 * and nothing will be written to @outbuf.
260 * Converts a single character to UTF-8.
262 * Return value: number of bytes written
264 static int
265 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
267 guint len = 0;
268 int first;
269 int i;
271 if (c < 0x80)
273 first = 0;
274 len = 1;
276 else if (c < 0x800)
278 first = 0xc0;
279 len = 2;
281 else if (c < 0x10000)
283 first = 0xe0;
284 len = 3;
286 else if (c < 0x200000)
288 first = 0xf0;
289 len = 4;
291 else if (c < 0x4000000)
293 first = 0xf8;
294 len = 5;
296 else
298 first = 0xfc;
299 len = 6;
302 if (outbuf)
304 for (i = len - 1; i > 0; --i)
306 outbuf[i] = (c & 0x3f) | 0x80;
307 c >>= 6;
309 outbuf[0] = c | first;
312 return len;
316 * g_utf8_to_ucs4_fast:
317 * @str: a UTF-8 encoded string
318 * @len: the maximum length of @str to use. If @len < 0, then
319 * the string is nul-terminated.
320 * @items_written: location to store the number of characters in the
321 * result, or %NULL.
323 * Convert a string from UTF-8 to a 32-bit fixed width
324 * representation as UCS-4, assuming valid UTF-8 input.
325 * This function is roughly twice as fast as g_utf8_to_ucs4()
326 * but does no error checking on the input.
328 * Return value: a pointer to a newly allocated UCS-4 string.
329 * This value must be freed with g_free().
331 static gunichar *
332 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
334 gint j, charlen;
335 gunichar *result;
336 gint n_chars, i;
337 const gchar *p;
339 g_return_val_if_fail (str != NULL, NULL);
341 p = str;
342 n_chars = 0;
343 if (len < 0)
345 while (*p)
347 p = g_utf8_next_char (p);
348 ++n_chars;
351 else
353 while (p < str + len && *p)
355 p = g_utf8_next_char (p);
356 ++n_chars;
360 result = g_new (gunichar, n_chars + 1);
361 if (!result)
362 return NULL;
364 p = str;
365 for (i = 0; i < n_chars; i++)
367 gunichar wc = ((unsigned char *) p)[0];
369 if (wc < 0x80)
371 result[i] = wc;
372 p++;
374 else
376 if (wc < 0xe0)
378 charlen = 2;
379 wc &= 0x1f;
381 else if (wc < 0xf0)
383 charlen = 3;
384 wc &= 0x0f;
386 else if (wc < 0xf8)
388 charlen = 4;
389 wc &= 0x07;
391 else if (wc < 0xfc)
393 charlen = 5;
394 wc &= 0x03;
396 else
398 charlen = 6;
399 wc &= 0x01;
402 for (j = 1; j < charlen; j++)
404 wc <<= 6;
405 wc |= ((unsigned char *) p)[j] & 0x3f;
408 result[i] = wc;
409 p += charlen;
412 result[i] = 0;
414 if (items_written)
415 *items_written = i;
417 return result;
421 * g_ucs4_to_utf8:
422 * @str: a UCS-4 encoded string
423 * @len: the maximum length of @str to use. If @len < 0, then
424 * the string is terminated with a 0 character.
425 * @items_read: location to store number of characters read read, or %NULL.
426 * @items_written: location to store number of bytes written or %NULL.
427 * The value here stored does not include the trailing 0
428 * byte.
429 * @error: location to store the error occuring, or %NULL to ignore
430 * errors. Any of the errors in #GConvertError other than
431 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
433 * Convert a string from a 32-bit fixed width representation as UCS-4.
434 * to UTF-8. The result will be terminated with a 0 byte.
436 * Return value: a pointer to a newly allocated UTF-8 string.
437 * This value must be freed with g_free(). If an
438 * error occurs, %NULL will be returned and
439 * @error set.
441 static gchar *
442 g_ucs4_to_utf8 (const gunichar * str,
443 glong len,
444 glong * items_read, glong * items_written, GError ** error)
446 gint result_length;
447 gchar *result = NULL;
448 gchar *p;
449 gint i;
451 result_length = 0;
452 for (i = 0; len < 0 || i < len; i++)
454 if (!str[i])
455 break;
457 if (str[i] >= 0x80000000)
459 if (items_read)
460 *items_read = i;
462 g_set_error (error, G_CONVERT_ERROR,
463 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
464 _("Character out of range for UTF-8"));
465 goto err_out;
468 result_length += UTF8_LENGTH (str[i]);
471 result = g_malloc (result_length + 1);
472 if (!result)
473 return NULL;
474 p = result;
476 i = 0;
477 while (p < result + result_length)
478 p += g_unichar_to_utf8 (str[i++], p);
480 *p = '\0';
482 if (items_written)
483 *items_written = p - result;
485 err_out:
486 if (items_read)
487 *items_read = i;
489 return result;
492 /* Code from GLIB gunidecomp.c starts here. */
494 #include "gunidecomp.h"
495 #include "gunicomp.h"
/* Two-level lookup shared by both halves of the combining class
   table.  TABLE[Page] is either an index into cclass_data, or, when
   it is G_UNICODE_MAX_TABLE_INDEX or more, a single class value
   (offset by that constant) that applies to the whole page.  */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR.  Code points up to
   G_UNICODE_LAST_CHAR_PART1 use the first table, those from 0xe0000
   up to G_UNICODE_LAST_CHAR the second; anything else is class 0.
   CHAR is evaluated more than once.  */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
/* constants for hangul syllable [de]composition */
enum
{
  SBase = 0xAC00,               /* first precomposed syllable */
  LBase = 0x1100,               /* first leading consonant */
  VBase = 0x1161,               /* first vowel */
  TBase = 0x11A7,               /* one below the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,     /* syllables per leading consonant */
  SCount = LCount * NCount      /* total precomposed syllables */
};
526 * g_unicode_canonical_ordering:
527 * @string: a UCS-4 encoded string.
528 * @len: the maximum length of @string to use.
530 * Computes the canonical ordering of a string in-place.
531 * This rearranges decomposed characters in the string
532 * according to their combining classes. See the Unicode
533 * manual for more information.
535 static void
536 g_unicode_canonical_ordering (gunichar * string, gsize len)
538 gsize i;
539 int swap = 1;
541 while (swap)
543 int last;
544 swap = 0;
545 last = COMBINING_CLASS (string[0]);
546 for (i = 0; i < len - 1; ++i)
548 int next = COMBINING_CLASS (string[i + 1]);
549 if (next != 0 && last > next)
551 gsize j;
552 /* Percolate item leftward through string. */
553 for (j = i + 1; j > 0; --j)
555 gunichar t;
556 if (COMBINING_CLASS (string[j - 1]) <= next)
557 break;
558 t = string[j];
559 string[j] = string[j - 1];
560 string[j - 1] = t;
561 swap = 1;
563 /* We're re-entering the loop looking at the old
564 character again. */
565 next = last;
567 last = next;
572 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
573 * r should be null or have sufficient space. Calling with r == NULL will
574 * only calculate the result_len; however, a buffer with space for three
575 * characters will always be big enough. */
576 static void
577 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
579 gint SIndex = s - SBase;
581 /* not a hangul syllable */
582 if (SIndex < 0 || SIndex >= SCount)
584 if (r)
585 r[0] = s;
586 *result_len = 1;
588 else
590 gunichar L = LBase + SIndex / NCount;
591 gunichar V = VBase + (SIndex % NCount) / TCount;
592 gunichar T = TBase + SIndex % TCount;
594 if (r)
596 r[0] = L;
597 r[1] = V;
600 if (T != TBase)
602 if (r)
603 r[2] = T;
604 *result_len = 3;
606 else
607 *result_len = 2;
611 /* returns a pointer to a null-terminated UTF-8 string */
612 static const gchar *
613 find_decomposition (gunichar ch, gboolean compat)
615 int start = 0;
616 int end = G_N_ELEMENTS (decomp_table);
618 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
620 while (TRUE)
622 int half = (start + end) / 2;
623 if (ch == decomp_table[half].ch)
625 int offset;
627 if (compat)
629 offset = decomp_table[half].compat_offset;
630 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631 offset = decomp_table[half].canon_offset;
633 else
635 offset = decomp_table[half].canon_offset;
636 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637 return NULL;
640 return &(decomp_expansion_string[offset]);
642 else if (half == start)
643 break;
644 else if (ch > decomp_table[half].ch)
645 start = half;
646 else
647 end = half;
651 return NULL;
654 /* L,V => LV and LV,T => LVT */
655 static gboolean
656 combine_hangul (gunichar a, gunichar b, gunichar * result)
658 gint LIndex = a - LBase;
659 gint SIndex = a - SBase;
661 gint VIndex = b - VBase;
662 gint TIndex = b - TBase;
664 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
666 *result = SBase + (LIndex * VCount + VIndex) * TCount;
667 return TRUE;
669 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
670 && 0 <= TIndex && TIndex <= TCount)
672 *result = a + TIndex;
673 return TRUE;
676 return FALSE;
/* Two-level lookup: compose_table[Page] is either an index into
   compose_data, or, when it is G_UNICODE_MAX_TABLE_INDEX or more, a
   single value (offset by that constant) for the whole page.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR; 0 for characters whose page lies beyond
   COMPOSE_TABLE_LAST.  CHAR is evaluated more than once.  */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) <= (COMPOSE_TABLE_LAST)) ? CI((Char) >> 8, (Char) & 0xff) : 0)
687 static gboolean
688 combine (gunichar a, gunichar b, gunichar * result)
690 gushort index_a, index_b;
692 if (combine_hangul (a, b, result))
693 return TRUE;
695 index_a = COMPOSE_INDEX (a);
697 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
699 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
701 *result =
702 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
703 return TRUE;
705 else
706 return FALSE;
709 index_b = COMPOSE_INDEX (b);
711 if (index_b >= COMPOSE_SECOND_SINGLE_START)
713 if (a ==
714 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
716 *result =
717 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
718 return TRUE;
720 else
721 return FALSE;
724 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
725 && index_b >= COMPOSE_SECOND_START
726 && index_b < COMPOSE_SECOND_SINGLE_START)
728 gunichar res =
729 compose_array[index_a - COMPOSE_FIRST_START][index_b -
730 COMPOSE_SECOND_START];
732 if (res)
734 *result = res;
735 return TRUE;
739 return FALSE;
742 static gunichar *
743 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
745 gsize n_wc;
746 gunichar *wc_buffer;
747 const char *p;
748 gsize last_start;
749 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
750 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
752 n_wc = 0;
753 p = str;
754 while ((max_len < 0 || p < str + max_len) && *p)
756 const gchar *decomp;
757 gunichar wc = g_utf8_get_char (p);
759 if (wc >= 0xac00 && wc <= 0xd7af)
761 gsize result_len;
762 decompose_hangul (wc, NULL, &result_len);
763 n_wc += result_len;
765 else
767 decomp = find_decomposition (wc, do_compat);
769 if (decomp)
770 n_wc += g_utf8_strlen (decomp, -1);
771 else
772 n_wc++;
775 p = g_utf8_next_char (p);
778 wc_buffer = g_new (gunichar, n_wc + 1);
779 if (!wc_buffer)
780 return NULL;
782 last_start = 0;
783 n_wc = 0;
784 p = str;
785 while ((max_len < 0 || p < str + max_len) && *p)
787 gunichar wc = g_utf8_get_char (p);
788 const gchar *decomp;
789 int cc;
790 gsize old_n_wc = n_wc;
792 if (wc >= 0xac00 && wc <= 0xd7af)
794 gsize result_len;
795 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
796 n_wc += result_len;
798 else
800 decomp = find_decomposition (wc, do_compat);
802 if (decomp)
804 const char *pd;
805 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
806 wc_buffer[n_wc++] = g_utf8_get_char (pd);
808 else
809 wc_buffer[n_wc++] = wc;
812 if (n_wc > 0)
814 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
816 if (cc == 0)
818 g_unicode_canonical_ordering (wc_buffer + last_start,
819 n_wc - last_start);
820 last_start = old_n_wc;
824 p = g_utf8_next_char (p);
827 if (n_wc > 0)
829 g_unicode_canonical_ordering (wc_buffer + last_start,
830 n_wc - last_start);
831 last_start = n_wc;
834 wc_buffer[n_wc] = 0;
836 /* All decomposed and reordered */
838 if (do_compose && n_wc > 0)
840 gsize i, j;
841 int last_cc = 0;
842 last_start = 0;
844 for (i = 0; i < n_wc; i++)
846 int cc = COMBINING_CLASS (wc_buffer[i]);
848 if (i > 0 &&
849 (last_cc == 0 || last_cc != cc) &&
850 combine (wc_buffer[last_start], wc_buffer[i],
851 &wc_buffer[last_start]))
853 for (j = i + 1; j < n_wc; j++)
854 wc_buffer[j - 1] = wc_buffer[j];
855 n_wc--;
856 i--;
858 if (i == last_start)
859 last_cc = 0;
860 else
861 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
863 continue;
866 if (cc == 0)
867 last_start = i;
869 last_cc = cc;
873 wc_buffer[n_wc] = 0;
875 return wc_buffer;
879 * g_utf8_normalize:
880 * @str: a UTF-8 encoded string.
881 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
882 * @mode: the type of normalization to perform.
884 * Converts a string into canonical form, standardizing
885 * such issues as whether a character with an accent
886 * is represented as a base character and combining
887 * accent or as a single precomposed character. You
888 * should generally call g_utf8_normalize() before
889 * comparing two Unicode strings.
891 * The normalization mode %G_NORMALIZE_DEFAULT only
892 * standardizes differences that do not affect the
893 * text content, such as the above-mentioned accent
894 * representation. %G_NORMALIZE_ALL also standardizes
895 * the "compatibility" characters in Unicode, such
896 * as SUPERSCRIPT THREE to the standard forms
897 * (in this case DIGIT THREE). Formatting information
898 * may be lost but for most text operations such
899 * characters should be considered the same.
900 * For example, g_utf8_collate() normalizes
901 * with %G_NORMALIZE_ALL as its first step.
903 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
904 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
905 * but returned a result with composed forms rather
906 * than a maximally decomposed form. This is often
907 * useful if you intend to convert the string to
908 * a legacy encoding or pass it to a system with
909 * less capable Unicode handling.
911 * Return value: a newly allocated string, that is the
912 * normalized form of @str.
914 static gchar *
915 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
917 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
918 gchar *result;
920 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
921 g_free (result_wc);
923 return result;
/* Public Libidn API starts here. */

/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 sequence at @p into a Unicode character by
 * calling g_utf8_get_char().  If @p does not point to a valid UTF-8
 * encoded character, results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  uint32_t code_point = g_utf8_get_char (p);

  return code_point;
}
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8 by calling
 * g_unichar_to_utf8().  No terminating nul byte is written.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}
962 * stringprep_utf8_to_ucs4:
963 * @str: a UTF-8 encoded string
964 * @len: the maximum length of @str to use. If @len < 0, then
965 * the string is nul-terminated.
966 * @items_written: location to store the number of characters in the
967 * result, or %NULL.
969 * Convert a string from UTF-8 to a 32-bit fixed width
970 * representation as UCS-4, assuming valid UTF-8 input.
971 * This function does no error checking on the input.
973 * Return value: a pointer to a newly allocated UCS-4 string.
974 * This value must be freed with free().
976 uint32_t *
977 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
979 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
983 * stringprep_ucs4_to_utf8:
984 * @str: a UCS-4 encoded string
985 * @len: the maximum length of @str to use. If @len < 0, then
986 * the string is terminated with a 0 character.
987 * @items_read: location to store number of characters read read, or %NULL.
988 * @items_written: location to store number of bytes written or %NULL.
989 * The value here stored does not include the trailing 0
990 * byte.
992 * Convert a string from a 32-bit fixed width representation as UCS-4.
993 * to UTF-8. The result will be terminated with a 0 byte.
995 * Return value: a pointer to a newly allocated UTF-8 string.
996 * This value must be freed with free(). If an
997 * error occurs, %NULL will be returned and
998 * @error set.
1000 char *
1001 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1002 size_t * items_read, size_t * items_written)
1004 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1005 (glong *) items_written, NULL);
1009 * stringprep_utf8_nfkc_normalize:
1010 * @str: a UTF-8 encoded string.
1011 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1013 * Converts a string into canonical form, standardizing
1014 * such issues as whether a character with an accent
1015 * is represented as a base character and combining
1016 * accent or as a single precomposed character.
1018 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1019 * differences that do not affect the text content, such as the
1020 * above-mentioned accent representation. It standardizes the
1021 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1022 * the standard forms (in this case DIGIT THREE). Formatting
1023 * information may be lost but for most text operations such
1024 * characters should be considered the same. It returns a result with
1025 * composed forms rather than a maximally decomposed form.
1027 * Return value: a newly allocated string, that is the
1028 * NFKC normalized form of @str.
1030 char *
1031 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1033 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1037 * stringprep_ucs4_nfkc_normalize:
1038 * @str: a Unicode string.
1039 * @len: length of @str array, or -1 if @str is nul-terminated.
1041 * Converts UCS4 string into UTF-8 and runs
1042 * stringprep_utf8_nfkc_normalize().
1044 * Return value: a newly allocated Unicode string, that is the NFKC
1045 * normalized form of @str.
1047 uint32_t *
1048 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1050 char *p;
1051 uint32_t *result_wc;
1053 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1054 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1055 free (p);
1057 return result_wc;