1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
28 #include "stringprep.h"
30 /* This file contains functions from GLIB, including gutf8.c and
31 * gunidecomp.c, all licensed under LGPL and copyright held by:
33 * Copyright (C) 1999, 2000 Tom Tromey
34 * Copyright 2000 Red Hat, Inc.
37 /* Hacks to make syncing with GLIB code easier. */
40 #define guchar unsigned char
43 #define guint unsigned int
44 #define gushort unsigned short
45 #define gint16 int16_t
46 #define guint16 uint16_t
47 #define gunichar uint32_t
49 #define gssize ssize_t
50 #define g_malloc malloc
53 #define g_set_error(a,b,c,d) ((void) 0)
54 #define g_new(struct_type, n_structs) \
55 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
56 # if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
57 # define G_STMT_START (void)(
60 # if (defined (sun) || defined (__sun__))
61 # define G_STMT_START if (1)
62 # define G_STMT_END else (void)0
64 # define G_STMT_START do
65 # define G_STMT_END while (0)
68 #define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
69 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
73 /* Code from GLIB gunicode.h starts here. */
78 G_NORMALIZE_NFD
= G_NORMALIZE_DEFAULT
,
79 G_NORMALIZE_DEFAULT_COMPOSE
,
80 G_NORMALIZE_NFC
= G_NORMALIZE_DEFAULT_COMPOSE
,
82 G_NORMALIZE_NFKD
= G_NORMALIZE_ALL
,
83 G_NORMALIZE_ALL_COMPOSE
,
84 G_NORMALIZE_NFKC
= G_NORMALIZE_ALL_COMPOSE
88 /* Code from GLIB gutf8.c starts here. */
90 #define UTF8_COMPUTE(Char, Mask, Len) \
96 else if ((Char & 0xe0) == 0xc0) \
101 else if ((Char & 0xf0) == 0xe0) \
106 else if ((Char & 0xf8) == 0xf0) \
111 else if ((Char & 0xfc) == 0xf8) \
116 else if ((Char & 0xfe) == 0xfc) \
124 #define UTF8_LENGTH(Char) \
125 ((Char) < 0x80 ? 1 : \
126 ((Char) < 0x800 ? 2 : \
127 ((Char) < 0x10000 ? 3 : \
128 ((Char) < 0x200000 ? 4 : \
129 ((Char) < 0x4000000 ? 5 : 6)))))
132 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
133 (Result) = (Chars)[0] & (Mask); \
134 for ((Count) = 1; (Count) < (Len); ++(Count)) \
136 if (((Chars)[(Count)] & 0xc0) != 0x80) \
142 (Result) |= ((Chars)[(Count)] & 0x3f); \
145 #define UNICODE_VALID(Char) \
146 ((Char) < 0x110000 && \
147 (((Char) & 0xFFFFF800) != 0xD800) && \
148 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
149 ((Char) & 0xFFFE) != 0xFFFE)
152 static const gchar utf8_skip_data
[256] = {
153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
167 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
171 const gchar
*const g_utf8_skip
= utf8_skip_data
;
173 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
177 * @p: pointer to the start of a UTF-8 encoded string.
178 * @max: the maximum number of bytes to examine. If @max
179 * is less than 0, then the string is assumed to be
180 * nul-terminated. If @max is 0, @p will not be examined and
183 * Returns the length of the string in characters.
185 * Return value: the length of the string in characters
188 g_utf8_strlen (const gchar
* p
, gssize max
)
191 const gchar
*start
= p
;
192 g_return_val_if_fail (p
!= NULL
|| max
== 0, 0);
198 p
= g_utf8_next_char (p
);
207 p
= g_utf8_next_char (p
);
209 while (p
- start
< max
&& *p
)
212 p
= g_utf8_next_char (p
);
215 /* only do the last len increment if we got a complete
216 * char (don't count partial chars)
218 if (p
- start
== max
)
227 * @p: a pointer to Unicode character encoded as UTF-8
229 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
230 * If @p does not point to a valid UTF-8 encoded character, results are
231 * undefined. If you are not sure that the bytes are complete
232 * valid Unicode characters, you should use g_utf8_get_char_validated()
235 * Return value: the resulting character
238 g_utf8_get_char (const gchar
* p
)
240 int i
, mask
= 0, len
;
242 unsigned char c
= (unsigned char) *p
;
244 UTF8_COMPUTE (c
, mask
, len
);
246 return (gunichar
) - 1;
247 UTF8_GET (result
, p
, i
, mask
, len
);
254 * @c: a ISO10646 character code
255 * @outbuf: output buffer, must have at least 6 bytes of space.
256 * If %NULL, the length will be computed and returned
257 * and nothing will be written to @outbuf.
259 * Converts a single character to UTF-8.
261 * Return value: number of bytes written
264 g_unichar_to_utf8 (gunichar c
, gchar
* outbuf
)
280 else if (c
< 0x10000)
285 else if (c
< 0x200000)
290 else if (c
< 0x4000000)
303 for (i
= len
- 1; i
> 0; --i
)
305 outbuf
[i
] = (c
& 0x3f) | 0x80;
308 outbuf
[0] = c
| first
;
315 * g_utf8_to_ucs4_fast:
316 * @str: a UTF-8 encoded string
317 * @len: the maximum length of @str to use. If @len < 0, then
318 * the string is nul-terminated.
319 * @items_written: location to store the number of characters in the
322 * Convert a string from UTF-8 to a 32-bit fixed width
323 * representation as UCS-4, assuming valid UTF-8 input.
324 * This function is roughly twice as fast as g_utf8_to_ucs4()
325 * but does no error checking on the input.
327 * Return value: a pointer to a newly allocated UCS-4 string.
328 * This value must be freed with g_free().
331 g_utf8_to_ucs4_fast (const gchar
* str
, glong len
, glong
* items_written
)
338 g_return_val_if_fail (str
!= NULL
, NULL
);
346 p
= g_utf8_next_char (p
);
352 while (p
< str
+ len
&& *p
)
354 p
= g_utf8_next_char (p
);
359 result
= g_new (gunichar
, n_chars
+ 1);
364 for (i
= 0; i
< n_chars
; i
++)
366 gunichar wc
= ((unsigned char *) p
)[0];
401 for (j
= 1; j
< charlen
; j
++)
404 wc
|= ((unsigned char *) p
)[j
] & 0x3f;
421 * @str: a UCS-4 encoded string
422 * @len: the maximum length of @str to use. If @len < 0, then
423 * the string is terminated with a 0 character.
424 * @items_read: location to store number of characters read, or %NULL.
425 * @items_written: location to store number of bytes written or %NULL.
426 * The value here stored does not include the trailing 0
428 * @error: location to store the error occurring, or %NULL to ignore
429 * errors. Any of the errors in #GConvertError other than
430 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
432 * Convert a string from a 32-bit fixed width representation as UCS-4
433 * to UTF-8. The result will be terminated with a 0 byte.
435 * Return value: a pointer to a newly allocated UTF-8 string.
436 * This value must be freed with g_free(). If an
437 * error occurs, %NULL will be returned and
441 g_ucs4_to_utf8 (const gunichar
* str
,
443 glong
* items_read
, glong
* items_written
, GError
** error
)
446 gchar
*result
= NULL
;
451 for (i
= 0; len
< 0 || i
< len
; i
++)
456 if (str
[i
] >= 0x80000000)
461 g_set_error (error
, G_CONVERT_ERROR
,
462 G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
463 _("Character out of range for UTF-8"));
467 result_length
+= UTF8_LENGTH (str
[i
]);
470 result
= g_malloc (result_length
+ 1);
476 while (p
< result
+ result_length
)
477 p
+= g_unichar_to_utf8 (str
[i
++], p
);
482 *items_written
= p
- result
;
491 /* Code from GLIB gunidecomp.c starts here. */
493 #include "gunidecomp.h"
494 #include "gunicomp.h"
496 #define CC_PART1(Page, Char) \
497 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
498 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
499 : (cclass_data[combining_class_table_part1[Page]][Char]))
501 #define CC_PART2(Page, Char) \
502 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
503 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
504 : (cclass_data[combining_class_table_part2[Page]][Char]))
506 #define COMBINING_CLASS(Char) \
507 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
508 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
509 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
510 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
513 /* constants for hangul syllable [de]composition */
521 #define NCount (VCount * TCount)
522 #define SCount (LCount * NCount)
525 * g_unicode_canonical_ordering:
526 * @string: a UCS-4 encoded string.
527 * @len: the maximum length of @string to use.
529 * Computes the canonical ordering of a string in-place.
530 * This rearranges decomposed characters in the string
531 * according to their combining classes. See the Unicode
532 * manual for more information.
535 g_unicode_canonical_ordering (gunichar
* string
, gsize len
)
544 last
= COMBINING_CLASS (string
[0]);
545 for (i
= 0; i
< len
- 1; ++i
)
547 int next
= COMBINING_CLASS (string
[i
+ 1]);
548 if (next
!= 0 && last
> next
)
551 /* Percolate item leftward through string. */
552 for (j
= i
+ 1; j
> 0; --j
)
555 if (COMBINING_CLASS (string
[j
- 1]) <= next
)
558 string
[j
] = string
[j
- 1];
562 /* We're re-entering the loop looking at the old
571 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
572 * r should be null or have sufficient space. Calling with r == NULL will
573 * only calculate the result_len; however, a buffer with space for three
574 * characters will always be big enough. */
576 decompose_hangul (gunichar s
, gunichar
* r
, gsize
* result_len
)
578 gint SIndex
= s
- SBase
;
580 /* not a hangul syllable */
581 if (SIndex
< 0 || SIndex
>= SCount
)
589 gunichar L
= LBase
+ SIndex
/ NCount
;
590 gunichar V
= VBase
+ (SIndex
% NCount
) / TCount
;
591 gunichar T
= TBase
+ SIndex
% TCount
;
610 /* returns a pointer to a null-terminated UTF-8 string */
612 find_decomposition (gunichar ch
, gboolean compat
)
615 int end
= G_N_ELEMENTS (decomp_table
);
617 if (ch
>= decomp_table
[start
].ch
&& ch
<= decomp_table
[end
- 1].ch
)
621 int half
= (start
+ end
) / 2;
622 if (ch
== decomp_table
[half
].ch
)
628 offset
= decomp_table
[half
].compat_offset
;
629 if (offset
== G_UNICODE_NOT_PRESENT_OFFSET
)
630 offset
= decomp_table
[half
].canon_offset
;
634 offset
= decomp_table
[half
].canon_offset
;
635 if (offset
== G_UNICODE_NOT_PRESENT_OFFSET
)
639 return &(decomp_expansion_string
[offset
]);
641 else if (half
== start
)
643 else if (ch
> decomp_table
[half
].ch
)
653 /* L,V => LV and LV,T => LVT */
655 combine_hangul (gunichar a
, gunichar b
, gunichar
* result
)
657 gint LIndex
= a
- LBase
;
658 gint SIndex
= a
- SBase
;
660 gint VIndex
= b
- VBase
;
661 gint TIndex
= b
- TBase
;
663 if (0 <= LIndex
&& LIndex
< LCount
&& 0 <= VIndex
&& VIndex
< VCount
)
665 *result
= SBase
+ (LIndex
* VCount
+ VIndex
) * TCount
;
668 else if (0 <= SIndex
&& SIndex
< SCount
&& (SIndex
% TCount
) == 0
669 && 0 <= TIndex
&& TIndex
<= TCount
)
671 *result
= a
+ TIndex
;
678 #define CI(Page, Char) \
679 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
680 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
681 : (compose_data[compose_table[Page]][Char]))
683 #define COMPOSE_INDEX(Char) \
684 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
687 combine (gunichar a
, gunichar b
, gunichar
* result
)
689 gushort index_a
, index_b
;
691 if (combine_hangul (a
, b
, result
))
694 index_a
= COMPOSE_INDEX (a
);
696 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
698 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
701 compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
708 index_b
= COMPOSE_INDEX (b
);
710 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
713 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
716 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
723 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
724 && index_b
>= COMPOSE_SECOND_START
725 && index_b
< COMPOSE_SECOND_SINGLE_START
)
728 compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
-
729 COMPOSE_SECOND_START
];
742 _g_utf8_normalize_wc (const gchar
* str
, gssize max_len
, GNormalizeMode mode
)
748 gboolean do_compat
= (mode
== G_NORMALIZE_NFKC
|| mode
== G_NORMALIZE_NFKD
);
749 gboolean do_compose
= (mode
== G_NORMALIZE_NFC
|| mode
== G_NORMALIZE_NFKC
);
753 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
756 gunichar wc
= g_utf8_get_char (p
);
758 if (wc
>= 0xac00 && wc
<= 0xd7af)
761 decompose_hangul (wc
, NULL
, &result_len
);
766 decomp
= find_decomposition (wc
, do_compat
);
769 n_wc
+= g_utf8_strlen (decomp
, -1);
774 p
= g_utf8_next_char (p
);
777 wc_buffer
= g_new (gunichar
, n_wc
+ 1);
784 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
786 gunichar wc
= g_utf8_get_char (p
);
789 gsize old_n_wc
= n_wc
;
791 if (wc
>= 0xac00 && wc
<= 0xd7af)
794 decompose_hangul (wc
, wc_buffer
+ n_wc
, &result_len
);
799 decomp
= find_decomposition (wc
, do_compat
);
804 for (pd
= decomp
; *pd
!= '\0'; pd
= g_utf8_next_char (pd
))
805 wc_buffer
[n_wc
++] = g_utf8_get_char (pd
);
808 wc_buffer
[n_wc
++] = wc
;
813 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
817 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
819 last_start
= old_n_wc
;
823 p
= g_utf8_next_char (p
);
828 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
835 /* All decomposed and reordered */
837 if (do_compose
&& n_wc
> 0)
843 for (i
= 0; i
< n_wc
; i
++)
845 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
848 (last_cc
== 0 || last_cc
!= cc
) &&
849 combine (wc_buffer
[last_start
], wc_buffer
[i
],
850 &wc_buffer
[last_start
]))
852 for (j
= i
+ 1; j
< n_wc
; j
++)
853 wc_buffer
[j
- 1] = wc_buffer
[j
];
860 last_cc
= COMBINING_CLASS (wc_buffer
[i
- 1]);
879 * @str: a UTF-8 encoded string.
880 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881 * @mode: the type of normalization to perform.
883 * Converts a string into canonical form, standardizing
884 * such issues as whether a character with an accent
885 * is represented as a base character and combining
886 * accent or as a single precomposed character. You
887 * should generally call g_utf8_normalize() before
888 * comparing two Unicode strings.
890 * The normalization mode %G_NORMALIZE_DEFAULT only
891 * standardizes differences that do not affect the
892 * text content, such as the above-mentioned accent
893 * representation. %G_NORMALIZE_ALL also standardizes
894 * the "compatibility" characters in Unicode, such
895 * as SUPERSCRIPT THREE to the standard forms
896 * (in this case DIGIT THREE). Formatting information
897 * may be lost but for most text operations such
898 * characters should be considered the same.
899 * For example, g_utf8_collate() normalizes
900 * with %G_NORMALIZE_ALL as its first step.
902 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904 * but return a result with composed forms rather
905 * than a maximally decomposed form. This is often
906 * useful if you intend to convert the string to
907 * a legacy encoding or pass it to a system with
908 * less capable Unicode handling.
910 * Return value: a newly allocated string, that is the
911 * normalized form of @str.
914 g_utf8_normalize (const gchar
* str
, gssize len
, GNormalizeMode mode
)
916 gunichar
*result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
919 result
= g_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
, NULL
);
925 /* Public Libidn API starts here. */
928 * stringprep_utf8_to_unichar:
929 * @p: a pointer to Unicode character encoded as UTF-8
931 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
932 * If @p does not point to a valid UTF-8 encoded character, results are
935 * Return value: the resulting character.
938 stringprep_utf8_to_unichar (const char *p
)
940 return g_utf8_get_char (p
);
944 * stringprep_unichar_to_utf8:
945 * @c: a ISO10646 character code
946 * @outbuf: output buffer, must have at least 6 bytes of space.
947 * If %NULL, the length will be computed and returned
948 * and nothing will be written to @outbuf.
950 * Converts a single character to UTF-8.
952 * Return value: number of bytes written.
955 stringprep_unichar_to_utf8 (uint32_t c
, char *outbuf
)
957 return g_unichar_to_utf8 (c
, outbuf
);
961 * stringprep_utf8_to_ucs4:
962 * @str: a UTF-8 encoded string
963 * @len: the maximum length of @str to use. If @len < 0, then
964 * the string is nul-terminated.
965 * @items_written: location to store the number of characters in the
968 * Convert a string from UTF-8 to a 32-bit fixed width
969 * representation as UCS-4, assuming valid UTF-8 input.
970 * This function does no error checking on the input.
972 * Return value: a pointer to a newly allocated UCS-4 string.
973 * This value must be freed with free().
976 stringprep_utf8_to_ucs4 (const char *str
, ssize_t len
, size_t * items_written
)
978 return g_utf8_to_ucs4_fast (str
, (glong
) len
, (glong
*) items_written
);
982 * stringprep_ucs4_to_utf8:
983 * @str: a UCS-4 encoded string
984 * @len: the maximum length of @str to use. If @len < 0, then
985 * the string is terminated with a 0 character.
986 * @items_read: location to store number of characters read, or %NULL.
987 * @items_written: location to store number of bytes written or %NULL.
988 * The value here stored does not include the trailing 0
991 * Convert a string from a 32-bit fixed width representation as UCS-4
992 * to UTF-8. The result will be terminated with a 0 byte.
994 * Return value: a pointer to a newly allocated UTF-8 string.
995 * This value must be freed with free(). If an
996 * error occurs, %NULL will be returned and
1000 stringprep_ucs4_to_utf8 (const uint32_t * str
, ssize_t len
,
1001 size_t * items_read
, size_t * items_written
)
1003 return g_ucs4_to_utf8 (str
, len
, (glong
*) items_read
,
1004 (glong
*) items_written
, NULL
);
1008 * stringprep_utf8_nfkc_normalize:
1009 * @str: a UTF-8 encoded string.
1010 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1012 * Converts a string into canonical form, standardizing
1013 * such issues as whether a character with an accent
1014 * is represented as a base character and combining
1015 * accent or as a single precomposed character.
1017 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1018 * differences that do not affect the text content, such as the
1019 * above-mentioned accent representation. It standardizes the
1020 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021 * the standard forms (in this case DIGIT THREE). Formatting
1022 * information may be lost but for most text operations such
1023 * characters should be considered the same. It returns a result with
1024 * composed forms rather than a maximally decomposed form.
1026 * Return value: a newly allocated string, that is the
1027 * NFKC normalized form of @str.
1030 stringprep_utf8_nfkc_normalize (const char *str
, ssize_t len
)
1032 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);
1036 * stringprep_ucs4_nfkc_normalize:
1037 * @str: a Unicode string.
1038 * @len: length of @str array, or -1 if @str is nul-terminated.
1040 * Converts UCS4 string into UTF-8 and runs
1041 * stringprep_utf8_nfkc_normalize().
1043 * Return value: a newly allocated Unicode string, that is the NFKC
1044 * normalized form of @str.
1047 stringprep_ucs4_nfkc_normalize (uint32_t * str
, ssize_t len
)
1050 uint32_t *result_wc
;
1052 p
= stringprep_ucs4_to_utf8 (str
, len
, 0, 0);
1053 result_wc
= _g_utf8_normalize_wc (p
, -1, G_NORMALIZE_NFKC
);