malloc: Add realloc test.
[glibc.git] / libidn / nfkc.c
blobf3e41d038b86894694f2a9bda6f76d1876ef336d
1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
20 #if HAVE_CONFIG_H
21 # include "config.h"
22 #endif
24 #include <stdlib.h>
25 #include <string.h>
26 #include <stdint.h>
28 #include "stringprep.h"
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */
/* Hacks to make syncing with GLIB code easier.  The GLIB type and
   allocator names used by the imported code are mapped onto plain C
   types and the C library allocator. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is compiled out: the arguments are dropped without
   being expanded or evaluated. */
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocate room for N_STRUCTS objects of STRUCT_TYPE with g_malloc;
   the memory is not initialised and the result may be NULL. */
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* Wrappers that let a braced block be used as a single statement. */
# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START (void)(
#  define G_STMT_END )
# else
#  if (defined (sun) || defined (__sun__))
#   define G_STMT_START if (1)
#   define G_STMT_END else (void)0
#  else
#   define G_STMT_START do
#   define G_STMT_END while (0)
#  endif
# endif
/* Precondition checks are compiled out: EXPR is never evaluated and VAL
   is never returned. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
/* Number of elements of a true array (not a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
/* Code from GLIB gunicode.h starts here. */

/* Normalization modes.  Each GLIB name has a Unicode-form alias with
   the same value: DEFAULT == NFD, DEFAULT_COMPOSE == NFC, ALL == NFKD
   and ALL_COMPOSE == NFKC. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
/* Code from GLIB gutf8.c starts here. */

/* Classify the lead byte CHAR of a UTF-8 sequence.  Stores the total
   sequence length (1..6) in LEN and the mask selecting the payload bits
   of the lead byte in MASK.  For a byte that cannot start a sequence
   LEN is set to -1 and MASK is left untouched.  CHAR is evaluated
   several times.  The body is wrapped in do/while (0) so the macro
   behaves as one statement, e.g. in an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
/* Number of bytes (1..6) needed to encode code point CHAR in UTF-8,
   using the original six-byte scheme.  CHAR is evaluated several
   times. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK
   selects the payload bits of the lead byte; COUNT is a scratch index
   variable.  If a byte after the first is not of the form 10xxxxxx,
   RESULT is set to -1 and decoding stops at that byte.  The arguments
   are evaluated several times.  The body is wrapped in do/while (0) so
   that the macro expands to exactly one statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
/* Non-zero if CHAR is below 0x110000, is not in the surrogate range
   0xD800..0xDFFF, is not in 0xFDD0..0xFDEF, and does not have 0xFFFE or
   0xFFFF as its low 16 bits.  CHAR is evaluated several times. */
#define UNICODE_VALID(Char) \
  ((Char) < 0x110000 && \
   (((Char) & 0xFFFFF800) != 0xD800) && \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
   ((Char) & 0xFFFE) != 0xFFFE)
152 static const gchar utf8_skip_data[256] = {
153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
162 1, 1, 1, 1, 1, 1, 1,
163 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
164 1, 1, 1, 1, 1, 1, 1,
165 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
166 2, 2, 2, 2, 2, 2, 2,
167 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
168 5, 5, 5, 6, 6, 1, 1
171 const gchar *const g_utf8_skip = utf8_skip_data;
173 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
176 * g_utf8_strlen:
177 * @p: pointer to the start of a UTF-8 encoded string.
178 * @max: the maximum number of bytes to examine. If @max
179 * is less than 0, then the string is assumed to be
180 * nul-terminated. If @max is 0, @p will not be examined and
181 * may be %NULL.
183 * Returns the length of the string in characters.
185 * Return value: the length of the string in characters
187 static glong
188 g_utf8_strlen (const gchar * p, gssize max)
190 glong len = 0;
191 const gchar *start = p;
192 g_return_val_if_fail (p != NULL || max == 0, 0);
194 if (max < 0)
196 while (*p)
198 p = g_utf8_next_char (p);
199 ++len;
202 else
204 if (max == 0 || !*p)
205 return 0;
207 p = g_utf8_next_char (p);
209 while (p - start < max && *p)
211 ++len;
212 p = g_utf8_next_char (p);
215 /* only do the last len increment if we got a complete
216 * char (don't count partial chars)
218 if (p - start == max)
219 ++len;
222 return len;
226 * g_utf8_get_char:
227 * @p: a pointer to Unicode character encoded as UTF-8
229 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
230 * If @p does not point to a valid UTF-8 encoded character, results are
231 * undefined. If you are not sure that the bytes are complete
232 * valid Unicode characters, you should use g_utf8_get_char_validated()
233 * instead.
235 * Return value: the resulting character
237 static gunichar
238 g_utf8_get_char (const gchar * p)
240 int i, mask = 0, len;
241 gunichar result;
242 unsigned char c = (unsigned char) *p;
244 UTF8_COMPUTE (c, mask, len);
245 if (len == -1)
246 return (gunichar) - 1;
247 UTF8_GET (result, p, i, mask, len);
249 return result;
253 * g_unichar_to_utf8:
254 * @c: a ISO10646 character code
255 * @outbuf: output buffer, must have at least 6 bytes of space.
256 * If %NULL, the length will be computed and returned
257 * and nothing will be written to @outbuf.
259 * Converts a single character to UTF-8.
261 * Return value: number of bytes written
263 static int
264 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
266 guint len = 0;
267 int first;
268 int i;
270 if (c < 0x80)
272 first = 0;
273 len = 1;
275 else if (c < 0x800)
277 first = 0xc0;
278 len = 2;
280 else if (c < 0x10000)
282 first = 0xe0;
283 len = 3;
285 else if (c < 0x200000)
287 first = 0xf0;
288 len = 4;
290 else if (c < 0x4000000)
292 first = 0xf8;
293 len = 5;
295 else
297 first = 0xfc;
298 len = 6;
301 if (outbuf)
303 for (i = len - 1; i > 0; --i)
305 outbuf[i] = (c & 0x3f) | 0x80;
306 c >>= 6;
308 outbuf[0] = c | first;
311 return len;
315 * g_utf8_to_ucs4_fast:
316 * @str: a UTF-8 encoded string
317 * @len: the maximum length of @str to use. If @len < 0, then
318 * the string is nul-terminated.
319 * @items_written: location to store the number of characters in the
320 * result, or %NULL.
322 * Convert a string from UTF-8 to a 32-bit fixed width
323 * representation as UCS-4, assuming valid UTF-8 input.
324 * This function is roughly twice as fast as g_utf8_to_ucs4()
325 * but does no error checking on the input.
327 * Return value: a pointer to a newly allocated UCS-4 string.
328 * This value must be freed with g_free().
330 static gunichar *
331 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
333 gint j, charlen;
334 gunichar *result;
335 gint n_chars, i;
336 const gchar *p;
338 g_return_val_if_fail (str != NULL, NULL);
340 p = str;
341 n_chars = 0;
342 if (len < 0)
344 while (*p)
346 p = g_utf8_next_char (p);
347 ++n_chars;
350 else
352 while (p < str + len && *p)
354 p = g_utf8_next_char (p);
355 ++n_chars;
359 result = g_new (gunichar, n_chars + 1);
360 if (!result)
361 return NULL;
363 p = str;
364 for (i = 0; i < n_chars; i++)
366 gunichar wc = ((unsigned char *) p)[0];
368 if (wc < 0x80)
370 result[i] = wc;
371 p++;
373 else
375 if (wc < 0xe0)
377 charlen = 2;
378 wc &= 0x1f;
380 else if (wc < 0xf0)
382 charlen = 3;
383 wc &= 0x0f;
385 else if (wc < 0xf8)
387 charlen = 4;
388 wc &= 0x07;
390 else if (wc < 0xfc)
392 charlen = 5;
393 wc &= 0x03;
395 else
397 charlen = 6;
398 wc &= 0x01;
401 for (j = 1; j < charlen; j++)
403 wc <<= 6;
404 wc |= ((unsigned char *) p)[j] & 0x3f;
407 result[i] = wc;
408 p += charlen;
411 result[i] = 0;
413 if (items_written)
414 *items_written = i;
416 return result;
420 * g_ucs4_to_utf8:
421 * @str: a UCS-4 encoded string
422 * @len: the maximum length of @str to use. If @len < 0, then
423 * the string is terminated with a 0 character.
424 * @items_read: location to store number of characters read read, or %NULL.
425 * @items_written: location to store number of bytes written or %NULL.
426 * The value here stored does not include the trailing 0
427 * byte.
428 * @error: location to store the error occuring, or %NULL to ignore
429 * errors. Any of the errors in #GConvertError other than
430 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
432 * Convert a string from a 32-bit fixed width representation as UCS-4.
433 * to UTF-8. The result will be terminated with a 0 byte.
435 * Return value: a pointer to a newly allocated UTF-8 string.
436 * This value must be freed with g_free(). If an
437 * error occurs, %NULL will be returned and
438 * @error set.
440 static gchar *
441 g_ucs4_to_utf8 (const gunichar * str,
442 glong len,
443 glong * items_read, glong * items_written, GError ** error)
445 gint result_length;
446 gchar *result = NULL;
447 gchar *p;
448 gint i;
450 result_length = 0;
451 for (i = 0; len < 0 || i < len; i++)
453 if (!str[i])
454 break;
456 if (str[i] >= 0x80000000)
458 if (items_read)
459 *items_read = i;
461 g_set_error (error, G_CONVERT_ERROR,
462 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
463 _("Character out of range for UTF-8"));
464 goto err_out;
467 result_length += UTF8_LENGTH (str[i]);
470 result = g_malloc (result_length + 1);
471 if (!result)
472 return NULL;
473 p = result;
475 i = 0;
476 while (p < result + result_length)
477 p += g_unichar_to_utf8 (str[i++], p);
479 *p = '\0';
481 if (items_written)
482 *items_written = p - result;
484 err_out:
485 if (items_read)
486 *items_read = i;
488 return result;
491 /* Code from GLIB gunidecomp.c starts here. */
493 #include "gunidecomp.h"
494 #include "gunicomp.h"
/* Look up a combining class through the first page table.  A page
   entry >= G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the
   whole page; any other entry indexes a row of cclass_data. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup through the second page table, whose pages are counted
   from code point 0xe0000 (see COMBINING_CLASS). */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of code point CHAR.  Code points covered by neither
   table (above G_UNICODE_LAST_CHAR_PART1 and outside
   0xe0000..G_UNICODE_LAST_CHAR) have class 0.  CHAR is evaluated
   several times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
/* constants for hangul syllable [de]composition */
#define SBase 0xAC00            /* first precomposed syllable */
#define LBase 0x1100            /* first leading consonant */
#define VBase 0x1161            /* first vowel */
#define TBase 0x11A7            /* one below the first trailing consonant */
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
525 * g_unicode_canonical_ordering:
526 * @string: a UCS-4 encoded string.
527 * @len: the maximum length of @string to use.
529 * Computes the canonical ordering of a string in-place.
530 * This rearranges decomposed characters in the string
531 * according to their combining classes. See the Unicode
532 * manual for more information.
534 static void
535 g_unicode_canonical_ordering (gunichar * string, gsize len)
537 gsize i;
538 int swap = 1;
540 while (swap)
542 int last;
543 swap = 0;
544 last = COMBINING_CLASS (string[0]);
545 for (i = 0; i < len - 1; ++i)
547 int next = COMBINING_CLASS (string[i + 1]);
548 if (next != 0 && last > next)
550 gsize j;
551 /* Percolate item leftward through string. */
552 for (j = i + 1; j > 0; --j)
554 gunichar t;
555 if (COMBINING_CLASS (string[j - 1]) <= next)
556 break;
557 t = string[j];
558 string[j] = string[j - 1];
559 string[j - 1] = t;
560 swap = 1;
562 /* We're re-entering the loop looking at the old
563 character again. */
564 next = last;
566 last = next;
571 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
572 * r should be null or have sufficient space. Calling with r == NULL will
573 * only calculate the result_len; however, a buffer with space for three
574 * characters will always be big enough. */
575 static void
576 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
578 gint SIndex = s - SBase;
580 /* not a hangul syllable */
581 if (SIndex < 0 || SIndex >= SCount)
583 if (r)
584 r[0] = s;
585 *result_len = 1;
587 else
589 gunichar L = LBase + SIndex / NCount;
590 gunichar V = VBase + (SIndex % NCount) / TCount;
591 gunichar T = TBase + SIndex % TCount;
593 if (r)
595 r[0] = L;
596 r[1] = V;
599 if (T != TBase)
601 if (r)
602 r[2] = T;
603 *result_len = 3;
605 else
606 *result_len = 2;
610 /* returns a pointer to a null-terminated UTF-8 string */
611 static const gchar *
612 find_decomposition (gunichar ch, gboolean compat)
614 int start = 0;
615 int end = G_N_ELEMENTS (decomp_table);
617 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619 while (TRUE)
621 int half = (start + end) / 2;
622 if (ch == decomp_table[half].ch)
624 int offset;
626 if (compat)
628 offset = decomp_table[half].compat_offset;
629 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
630 offset = decomp_table[half].canon_offset;
632 else
634 offset = decomp_table[half].canon_offset;
635 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
636 return NULL;
639 return &(decomp_expansion_string[offset]);
641 else if (half == start)
642 break;
643 else if (ch > decomp_table[half].ch)
644 start = half;
645 else
646 end = half;
650 return NULL;
653 /* L,V => LV and LV,T => LVT */
654 static gboolean
655 combine_hangul (gunichar a, gunichar b, gunichar * result)
657 gint LIndex = a - LBase;
658 gint SIndex = a - SBase;
660 gint VIndex = b - VBase;
661 gint TIndex = b - TBase;
663 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
665 *result = SBase + (LIndex * VCount + VIndex) * TCount;
666 return TRUE;
668 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
669 && 0 <= TIndex && TIndex <= TCount)
671 *result = a + TIndex;
672 return TRUE;
675 return FALSE;
/* Look up a composition index through compose_table.  A page entry
   >= G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole
   page; any other entry indexes a row of compose_data. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point CHAR; 0 for code points whose page
   lies beyond COMPOSE_TABLE_LAST.  CHAR is evaluated several times. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
686 static gboolean
687 combine (gunichar a, gunichar b, gunichar * result)
689 gushort index_a, index_b;
691 if (combine_hangul (a, b, result))
692 return TRUE;
694 index_a = COMPOSE_INDEX (a);
696 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
698 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
700 *result =
701 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
702 return TRUE;
704 else
705 return FALSE;
708 index_b = COMPOSE_INDEX (b);
710 if (index_b >= COMPOSE_SECOND_SINGLE_START)
712 if (a ==
713 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
715 *result =
716 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
717 return TRUE;
719 else
720 return FALSE;
723 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
724 && index_b >= COMPOSE_SECOND_START
725 && index_b < COMPOSE_SECOND_SINGLE_START)
727 gunichar res =
728 compose_array[index_a - COMPOSE_FIRST_START][index_b -
729 COMPOSE_SECOND_START];
731 if (res)
733 *result = res;
734 return TRUE;
738 return FALSE;
741 static gunichar *
742 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
744 gsize n_wc;
745 gunichar *wc_buffer;
746 const char *p;
747 gsize last_start;
748 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
749 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
751 n_wc = 0;
752 p = str;
753 while ((max_len < 0 || p < str + max_len) && *p)
755 const gchar *decomp;
756 gunichar wc = g_utf8_get_char (p);
758 if (wc >= 0xac00 && wc <= 0xd7af)
760 gsize result_len;
761 decompose_hangul (wc, NULL, &result_len);
762 n_wc += result_len;
764 else
766 decomp = find_decomposition (wc, do_compat);
768 if (decomp)
769 n_wc += g_utf8_strlen (decomp, -1);
770 else
771 n_wc++;
774 p = g_utf8_next_char (p);
777 wc_buffer = g_new (gunichar, n_wc + 1);
778 if (!wc_buffer)
779 return NULL;
781 last_start = 0;
782 n_wc = 0;
783 p = str;
784 while ((max_len < 0 || p < str + max_len) && *p)
786 gunichar wc = g_utf8_get_char (p);
787 const gchar *decomp;
788 int cc;
789 gsize old_n_wc = n_wc;
791 if (wc >= 0xac00 && wc <= 0xd7af)
793 gsize result_len;
794 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
795 n_wc += result_len;
797 else
799 decomp = find_decomposition (wc, do_compat);
801 if (decomp)
803 const char *pd;
804 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
805 wc_buffer[n_wc++] = g_utf8_get_char (pd);
807 else
808 wc_buffer[n_wc++] = wc;
811 if (n_wc > 0)
813 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
815 if (cc == 0)
817 g_unicode_canonical_ordering (wc_buffer + last_start,
818 n_wc - last_start);
819 last_start = old_n_wc;
823 p = g_utf8_next_char (p);
826 if (n_wc > 0)
828 g_unicode_canonical_ordering (wc_buffer + last_start,
829 n_wc - last_start);
830 last_start = n_wc;
833 wc_buffer[n_wc] = 0;
835 /* All decomposed and reordered */
837 if (do_compose && n_wc > 0)
839 gsize i, j;
840 int last_cc = 0;
841 last_start = 0;
843 for (i = 0; i < n_wc; i++)
845 int cc = COMBINING_CLASS (wc_buffer[i]);
847 if (i > 0 &&
848 (last_cc == 0 || last_cc != cc) &&
849 combine (wc_buffer[last_start], wc_buffer[i],
850 &wc_buffer[last_start]))
852 for (j = i + 1; j < n_wc; j++)
853 wc_buffer[j - 1] = wc_buffer[j];
854 n_wc--;
855 i--;
857 if (i == last_start)
858 last_cc = 0;
859 else
860 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
862 continue;
865 if (cc == 0)
866 last_start = i;
868 last_cc = cc;
872 wc_buffer[n_wc] = 0;
874 return wc_buffer;
878 * g_utf8_normalize:
879 * @str: a UTF-8 encoded string.
880 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881 * @mode: the type of normalization to perform.
883 * Converts a string into canonical form, standardizing
884 * such issues as whether a character with an accent
885 * is represented as a base character and combining
886 * accent or as a single precomposed character. You
887 * should generally call g_utf8_normalize() before
888 * comparing two Unicode strings.
890 * The normalization mode %G_NORMALIZE_DEFAULT only
891 * standardizes differences that do not affect the
892 * text content, such as the above-mentioned accent
893 * representation. %G_NORMALIZE_ALL also standardizes
894 * the "compatibility" characters in Unicode, such
895 * as SUPERSCRIPT THREE to the standard forms
896 * (in this case DIGIT THREE). Formatting information
897 * may be lost but for most text operations such
898 * characters should be considered the same.
899 * For example, g_utf8_collate() normalizes
900 * with %G_NORMALIZE_ALL as its first step.
902 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904 * but returned a result with composed forms rather
905 * than a maximally decomposed form. This is often
906 * useful if you intend to convert the string to
907 * a legacy encoding or pass it to a system with
908 * less capable Unicode handling.
910 * Return value: a newly allocated string, that is the
911 * normalized form of @str.
913 static gchar *
914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
916 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
917 gchar *result;
919 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
920 g_free (result_wc);
922 return result;
/* Public Libidn API starts here. */

/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}
961 * stringprep_utf8_to_ucs4:
962 * @str: a UTF-8 encoded string
963 * @len: the maximum length of @str to use. If @len < 0, then
964 * the string is nul-terminated.
965 * @items_written: location to store the number of characters in the
966 * result, or %NULL.
968 * Convert a string from UTF-8 to a 32-bit fixed width
969 * representation as UCS-4, assuming valid UTF-8 input.
970 * This function does no error checking on the input.
972 * Return value: a pointer to a newly allocated UCS-4 string.
973 * This value must be freed with free().
975 uint32_t *
976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
978 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
982 * stringprep_ucs4_to_utf8:
983 * @str: a UCS-4 encoded string
984 * @len: the maximum length of @str to use. If @len < 0, then
985 * the string is terminated with a 0 character.
986 * @items_read: location to store number of characters read read, or %NULL.
987 * @items_written: location to store number of bytes written or %NULL.
988 * The value here stored does not include the trailing 0
989 * byte.
991 * Convert a string from a 32-bit fixed width representation as UCS-4.
992 * to UTF-8. The result will be terminated with a 0 byte.
994 * Return value: a pointer to a newly allocated UTF-8 string.
995 * This value must be freed with free(). If an
996 * error occurs, %NULL will be returned and
997 * @error set.
999 char *
1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001 size_t * items_read, size_t * items_written)
1003 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004 (glong *) items_written, NULL);
1008 * stringprep_utf8_nfkc_normalize:
1009 * @str: a UTF-8 encoded string.
1010 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1012 * Converts a string into canonical form, standardizing
1013 * such issues as whether a character with an accent
1014 * is represented as a base character and combining
1015 * accent or as a single precomposed character.
1017 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1018 * differences that do not affect the text content, such as the
1019 * above-mentioned accent representation. It standardizes the
1020 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021 * the standard forms (in this case DIGIT THREE). Formatting
1022 * information may be lost but for most text operations such
1023 * characters should be considered the same. It returns a result with
1024 * composed forms rather than a maximally decomposed form.
1026 * Return value: a newly allocated string, that is the
1027 * NFKC normalized form of @str.
1029 char *
1030 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1032 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1036 * stringprep_ucs4_nfkc_normalize:
1037 * @str: a Unicode string.
1038 * @len: length of @str array, or -1 if @str is nul-terminated.
1040 * Converts UCS4 string into UTF-8 and runs
1041 * stringprep_utf8_nfkc_normalize().
1043 * Return value: a newly allocated Unicode string, that is the NFKC
1044 * normalized form of @str.
1046 uint32_t *
1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1049 char *p;
1050 uint32_t *result_wc;
1052 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054 free (p);
1056 return result_wc;