1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 #include "stringprep.h"
31 /* This file contains functions from GLIB, including gutf8.c and
32 * gunidecomp.c, all licensed under LGPL and copyright held by:
34 * Copyright (C) 1999, 2000 Tom Tromey
35 * Copyright 2000 Red Hat, Inc.
38 /* Hacks to make syncing with GLIB code easier. */
41 #define guchar unsigned char
44 #define guint unsigned int
45 #define gushort unsigned short
46 #define gint16 int16_t
47 #define guint16 uint16_t
48 #define gunichar uint32_t
50 #define gssize ssize_t
51 #define g_malloc malloc
54 #define g_set_error(a,b,c,d) 0
/* g_new:
 * Allocates uninitialised storage for n_structs objects of struct_type
 * through g_malloc and yields the result as a struct_type pointer.
 *
 * The byte count is sizeof (struct_type) * n_structs, computed in gsize.
 * For a very large n_structs that product wraps around, which would hand
 * back a buffer far smaller than the caller believes it has.  The count
 * is therefore range-checked first and NULL is yielded instead of a
 * short buffer.  The check assumes gsize is an unsigned type.
 *
 * n_structs is expanded twice; do not pass an expression with side
 * effects. */
#define g_new(struct_type, n_structs)					\
  ((((gsize) (n_structs)) > ((gsize) -1) / ((gsize) sizeof (struct_type))) \
   ? (struct_type *) NULL						\
   : (struct_type *) g_malloc (((gsize) sizeof (struct_type))		\
			       * ((gsize) (n_structs))))
57 # if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
58 # define G_STMT_START (void)(
61 # if (defined (sun) || defined (__sun__))
62 # define G_STMT_START if (1)
63 # define G_STMT_END else (void)0
65 # define G_STMT_START do
66 # define G_STMT_END while (0)
69 #define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
70 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
74 /* Code from GLIB gunicode.h starts here. */
79 G_NORMALIZE_NFD
= G_NORMALIZE_DEFAULT
,
80 G_NORMALIZE_DEFAULT_COMPOSE
,
81 G_NORMALIZE_NFC
= G_NORMALIZE_DEFAULT_COMPOSE
,
83 G_NORMALIZE_NFKD
= G_NORMALIZE_ALL
,
84 G_NORMALIZE_ALL_COMPOSE
,
85 G_NORMALIZE_NFKC
= G_NORMALIZE_ALL_COMPOSE
89 /* Code from GLIB gutf8.c starts here. */
91 #define UTF8_COMPUTE(Char, Mask, Len) \
97 else if ((Char & 0xe0) == 0xc0) \
102 else if ((Char & 0xf0) == 0xe0) \
107 else if ((Char & 0xf8) == 0xf0) \
112 else if ((Char & 0xfc) == 0xf8) \
117 else if ((Char & 0xfe) == 0xfc) \
/* Number of bytes needed to encode the character code Char in UTF-8.
 * Sequences of up to six bytes are accounted for.  Each threshold is
 * the smallest value that needs that many bytes.  Char is expanded
 * several times, so it must be free of side effects. */
#define UTF8_LENGTH(Char)			\
  ((Char) >= 0x4000000 ? 6			\
   : (Char) >= 0x200000 ? 5			\
   : (Char) >= 0x10000 ? 4			\
   : (Char) >= 0x800 ? 3			\
   : (Char) >= 0x80 ? 2				\
   : 1)
133 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
134 (Result) = (Chars)[0] & (Mask); \
135 for ((Count) = 1; (Count) < (Len); ++(Count)) \
137 if (((Chars)[(Count)] & 0xc0) != 0x80) \
143 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* Non-zero when Char is acceptable as a Unicode character.  A value is
 * rejected when any of the following holds:
 *   - it is 0x110000 or larger;
 *   - it lies in 0xD800..0xDFFF (the surrogate range);
 *   - it lies in 0xFDD0..0xFDEF;
 *   - its low 16 bits are 0xFFFE or 0xFFFF.
 * Char is expanded several times, so it must be free of side effects. */
#define UNICODE_VALID(Char)				\
  (!((Char) >= 0x110000 ||				\
     (((Char) & 0xFFFFF800) == 0xD800) ||		\
     ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||		\
     ((Char) & 0xFFFE) == 0xFFFE))
153 static const gchar utf8_skip_data
[256] = {
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
162 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
166 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
168 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
172 const gchar
*const g_utf8_skip
= utf8_skip_data
;
174 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
178 * @p: pointer to the start of a UTF-8 encoded string.
179 * @max: the maximum number of bytes to examine. If @max
180 * is less than 0, then the string is assumed to be
181 * nul-terminated. If @max is 0, @p will not be examined and
184 * Returns the length of the string in characters.
186 * Return value: the length of the string in characters
189 g_utf8_strlen (const gchar
* p
, gssize max
)
192 const gchar
*start
= p
;
193 g_return_val_if_fail (p
!= NULL
|| max
== 0, 0);
199 p
= g_utf8_next_char (p
);
208 p
= g_utf8_next_char (p
);
210 while (p
- start
< max
&& *p
)
213 p
= g_utf8_next_char (p
);
216 /* only do the last len increment if we got a complete
217 * char (don't count partial chars)
219 if (p
- start
== max
)
228 * @p: a pointer to Unicode character encoded as UTF-8
230 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
231 * If @p does not point to a valid UTF-8 encoded character, results are
232 * undefined. If you are not sure that the bytes are complete
233 * valid Unicode characters, you should use g_utf8_get_char_validated()
236 * Return value: the resulting character
239 g_utf8_get_char (const gchar
* p
)
241 int i
, mask
= 0, len
;
243 unsigned char c
= (unsigned char) *p
;
245 UTF8_COMPUTE (c
, mask
, len
);
247 return (gunichar
) - 1;
248 UTF8_GET (result
, p
, i
, mask
, len
);
255 * @c: an ISO10646 character code
256 * @outbuf: output buffer, must have at least 6 bytes of space.
257 * If %NULL, the length will be computed and returned
258 * and nothing will be written to @outbuf.
260 * Converts a single character to UTF-8.
262 * Return value: number of bytes written
265 g_unichar_to_utf8 (gunichar c
, gchar
* outbuf
)
281 else if (c
< 0x10000)
286 else if (c
< 0x200000)
291 else if (c
< 0x4000000)
304 for (i
= len
- 1; i
> 0; --i
)
306 outbuf
[i
] = (c
& 0x3f) | 0x80;
309 outbuf
[0] = c
| first
;
316 * g_utf8_to_ucs4_fast:
317 * @str: a UTF-8 encoded string
318 * @len: the maximum length of @str to use. If @len < 0, then
319 * the string is nul-terminated.
320 * @items_written: location to store the number of characters in the
323 * Convert a string from UTF-8 to a 32-bit fixed width
324 * representation as UCS-4, assuming valid UTF-8 input.
325 * This function is roughly twice as fast as g_utf8_to_ucs4()
326 * but does no error checking on the input.
328 * Return value: a pointer to a newly allocated UCS-4 string.
329 * This value must be freed with g_free().
332 g_utf8_to_ucs4_fast (const gchar
* str
, glong len
, glong
* items_written
)
339 g_return_val_if_fail (str
!= NULL
, NULL
);
347 p
= g_utf8_next_char (p
);
353 while (p
< str
+ len
&& *p
)
355 p
= g_utf8_next_char (p
);
360 result
= g_new (gunichar
, n_chars
+ 1);
365 for (i
= 0; i
< n_chars
; i
++)
367 gunichar wc
= ((unsigned char *) p
)[0];
402 for (j
= 1; j
< charlen
; j
++)
405 wc
|= ((unsigned char *) p
)[j
] & 0x3f;
422 * @str: a UCS-4 encoded string
423 * @len: the maximum length of @str to use. If @len < 0, then
424 * the string is terminated with a 0 character.
425 * @items_read: location to store number of characters read, or %NULL.
426 * @items_written: location to store number of bytes written or %NULL.
427 * The value here stored does not include the trailing 0
429 * @error: location to store the error occurring, or %NULL to ignore
430 * errors. Any of the errors in #GConvertError other than
431 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
433 * Convert a string from a 32-bit fixed width representation as UCS-4.
434 * to UTF-8. The result will be terminated with a 0 byte.
436 * Return value: a pointer to a newly allocated UTF-8 string.
437 * This value must be freed with g_free(). If an
438 * error occurs, %NULL will be returned and
442 g_ucs4_to_utf8 (const gunichar
* str
,
444 glong
* items_read
, glong
* items_written
, GError
** error
)
447 gchar
*result
= NULL
;
452 for (i
= 0; len
< 0 || i
< len
; i
++)
457 if (str
[i
] >= 0x80000000)
462 g_set_error (error
, G_CONVERT_ERROR
,
463 G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
464 _("Character out of range for UTF-8"));
468 result_length
+= UTF8_LENGTH (str
[i
]);
471 result
= g_malloc (result_length
+ 1);
477 while (p
< result
+ result_length
)
478 p
+= g_unichar_to_utf8 (str
[i
++], p
);
483 *items_written
= p
- result
;
492 /* Code from GLIB gunidecomp.c starts here. */
494 #include "gunidecomp.h"
495 #include "gunicomp.h"
/* Combining-class lookup for one 256-character page.
 *
 * Table[Page] is either a row number of cclass_data, in which case the
 * class is read from cclass_data[row][Char], or a value of at least
 * G_UNICODE_MAX_TABLE_INDEX, in which case the result for the whole
 * page is Table[Page] - G_UNICODE_MAX_TABLE_INDEX. */
#define CC_PAGE_LOOKUP(Table, Page, Char)		\
  (((Table)[Page] < G_UNICODE_MAX_TABLE_INDEX)		\
   ? (cclass_data[(Table)[Page]][Char])			\
   : ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Lookup through combining_class_table_part1. */
#define CC_PART1(Page, Char) \
  CC_PAGE_LOOKUP (combining_class_table_part1, Page, Char)

/* Lookup through combining_class_table_part2. */
#define CC_PART2(Page, Char) \
  CC_PAGE_LOOKUP (combining_class_table_part2, Page, Char)
507 #define COMBINING_CLASS(Char) \
508 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
509 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
510 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
511 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
514 /* constants for hangul syllable [de]composition */
522 #define NCount (VCount * TCount)
523 #define SCount (LCount * NCount)
526 * g_unicode_canonical_ordering:
527 * @string: a UCS-4 encoded string.
528 * @len: the maximum length of @string to use.
530 * Computes the canonical ordering of a string in-place.
531 * This rearranges decomposed characters in the string
532 * according to their combining classes. See the Unicode
533 * manual for more information.
536 g_unicode_canonical_ordering (gunichar
* string
, gsize len
)
545 last
= COMBINING_CLASS (string
[0]);
546 for (i
= 0; i
< len
- 1; ++i
)
548 int next
= COMBINING_CLASS (string
[i
+ 1]);
549 if (next
!= 0 && last
> next
)
552 /* Percolate item leftward through string. */
553 for (j
= i
+ 1; j
> 0; --j
)
556 if (COMBINING_CLASS (string
[j
- 1]) <= next
)
559 string
[j
] = string
[j
- 1];
563 /* We're re-entering the loop looking at the old
572 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
573 * r should be null or have sufficient space. Calling with r == NULL will
574 * only calculate the result_len; however, a buffer with space for three
575 * characters will always be big enough. */
577 decompose_hangul (gunichar s
, gunichar
* r
, gsize
* result_len
)
579 gint SIndex
= s
- SBase
;
581 /* not a hangul syllable */
582 if (SIndex
< 0 || SIndex
>= SCount
)
590 gunichar L
= LBase
+ SIndex
/ NCount
;
591 gunichar V
= VBase
+ (SIndex
% NCount
) / TCount
;
592 gunichar T
= TBase
+ SIndex
% TCount
;
611 /* returns a pointer to a null-terminated UTF-8 string */
613 find_decomposition (gunichar ch
, gboolean compat
)
616 int end
= G_N_ELEMENTS (decomp_table
);
618 if (ch
>= decomp_table
[start
].ch
&& ch
<= decomp_table
[end
- 1].ch
)
622 int half
= (start
+ end
) / 2;
623 if (ch
== decomp_table
[half
].ch
)
629 offset
= decomp_table
[half
].compat_offset
;
630 if (offset
== G_UNICODE_NOT_PRESENT_OFFSET
)
631 offset
= decomp_table
[half
].canon_offset
;
635 offset
= decomp_table
[half
].canon_offset
;
636 if (offset
== G_UNICODE_NOT_PRESENT_OFFSET
)
640 return &(decomp_expansion_string
[offset
]);
642 else if (half
== start
)
644 else if (ch
> decomp_table
[half
].ch
)
654 /* L,V => LV and LV,T => LVT */
656 combine_hangul (gunichar a
, gunichar b
, gunichar
* result
)
658 gint LIndex
= a
- LBase
;
659 gint SIndex
= a
- SBase
;
661 gint VIndex
= b
- VBase
;
662 gint TIndex
= b
- TBase
;
664 if (0 <= LIndex
&& LIndex
< LCount
&& 0 <= VIndex
&& VIndex
< VCount
)
666 *result
= SBase
+ (LIndex
* VCount
+ VIndex
) * TCount
;
669 else if (0 <= SIndex
&& SIndex
< SCount
&& (SIndex
% TCount
) == 0
670 && 0 <= TIndex
&& TIndex
<= TCount
)
672 *result
= a
+ TIndex
;
/* Compose-table lookup for one 256-character page.  compose_table[Page]
 * is either a row number of compose_data, to be indexed with Char, or,
 * when it is at least G_UNICODE_MAX_TABLE_INDEX, a page-wide result
 * stored with that bias added. */
#define CI(Page, Char)						\
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX)		\
   ? (compose_data[compose_table[Page]][Char])			\
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Compose index of Char.  Characters on a page beyond
 * COMPOSE_TABLE_LAST have index 0. */
#define COMPOSE_INDEX(Char)					\
  ((((Char) >> 8) <= (COMPOSE_TABLE_LAST))			\
   ? CI ((Char) >> 8, (Char) & 0xff)				\
   : 0)
688 combine (gunichar a
, gunichar b
, gunichar
* result
)
690 gushort index_a
, index_b
;
692 if (combine_hangul (a
, b
, result
))
695 index_a
= COMPOSE_INDEX (a
);
697 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
699 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
702 compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
709 index_b
= COMPOSE_INDEX (b
);
711 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
714 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
717 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
724 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
725 && index_b
>= COMPOSE_SECOND_START
726 && index_b
< COMPOSE_SECOND_SINGLE_START
)
729 compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
-
730 COMPOSE_SECOND_START
];
743 _g_utf8_normalize_wc (const gchar
* str
, gssize max_len
, GNormalizeMode mode
)
749 gboolean do_compat
= (mode
== G_NORMALIZE_NFKC
|| mode
== G_NORMALIZE_NFKD
);
750 gboolean do_compose
= (mode
== G_NORMALIZE_NFC
|| mode
== G_NORMALIZE_NFKC
);
754 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
757 gunichar wc
= g_utf8_get_char (p
);
759 if (wc
>= 0xac00 && wc
<= 0xd7af)
762 decompose_hangul (wc
, NULL
, &result_len
);
767 decomp
= find_decomposition (wc
, do_compat
);
770 n_wc
+= g_utf8_strlen (decomp
, -1);
775 p
= g_utf8_next_char (p
);
778 wc_buffer
= g_new (gunichar
, n_wc
+ 1);
785 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
787 gunichar wc
= g_utf8_get_char (p
);
790 gsize old_n_wc
= n_wc
;
792 if (wc
>= 0xac00 && wc
<= 0xd7af)
795 decompose_hangul (wc
, wc_buffer
+ n_wc
, &result_len
);
800 decomp
= find_decomposition (wc
, do_compat
);
805 for (pd
= decomp
; *pd
!= '\0'; pd
= g_utf8_next_char (pd
))
806 wc_buffer
[n_wc
++] = g_utf8_get_char (pd
);
809 wc_buffer
[n_wc
++] = wc
;
814 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
818 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
820 last_start
= old_n_wc
;
824 p
= g_utf8_next_char (p
);
829 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
836 /* All decomposed and reordered */
838 if (do_compose
&& n_wc
> 0)
844 for (i
= 0; i
< n_wc
; i
++)
846 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
849 (last_cc
== 0 || last_cc
!= cc
) &&
850 combine (wc_buffer
[last_start
], wc_buffer
[i
],
851 &wc_buffer
[last_start
]))
853 for (j
= i
+ 1; j
< n_wc
; j
++)
854 wc_buffer
[j
- 1] = wc_buffer
[j
];
861 last_cc
= COMBINING_CLASS (wc_buffer
[i
- 1]);
880 * @str: a UTF-8 encoded string.
881 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
882 * @mode: the type of normalization to perform.
884 * Converts a string into canonical form, standardizing
885 * such issues as whether a character with an accent
886 * is represented as a base character and combining
887 * accent or as a single precomposed character. You
888 * should generally call g_utf8_normalize() before
889 * comparing two Unicode strings.
891 * The normalization mode %G_NORMALIZE_DEFAULT only
892 * standardizes differences that do not affect the
893 * text content, such as the above-mentioned accent
894 * representation. %G_NORMALIZE_ALL also standardizes
895 * the "compatibility" characters in Unicode, such
896 * as SUPERSCRIPT THREE to the standard forms
897 * (in this case DIGIT THREE). Formatting information
898 * may be lost but for most text operations such
899 * characters should be considered the same.
900 * For example, g_utf8_collate() normalizes
901 * with %G_NORMALIZE_ALL as its first step.
903 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
904 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
905 * but return a result with composed forms rather
906 * than a maximally decomposed form. This is often
907 * useful if you intend to convert the string to
908 * a legacy encoding or pass it to a system with
909 * less capable Unicode handling.
911 * Return value: a newly allocated string, that is the
912 * normalized form of @str.
915 g_utf8_normalize (const gchar
* str
, gssize len
, GNormalizeMode mode
)
917 gunichar
*result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
920 result
= g_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
, NULL
);
926 /* Public Libidn API starts here. */
929 * stringprep_utf8_to_unichar:
930 * @p: a pointer to Unicode character encoded as UTF-8
932 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
933 * If @p does not point to a valid UTF-8 encoded character, results are
934 * undefined. If you are not sure that the bytes are complete
935 * valid Unicode characters, you should use g_utf8_get_char_validated()
938 * Return value: the resulting character
941 stringprep_utf8_to_unichar (const char *p
)
943 return g_utf8_get_char (p
);
947 * stringprep_unichar_to_utf8:
948 * @c: an ISO10646 character code
949 * @outbuf: output buffer, must have at least 6 bytes of space.
950 * If %NULL, the length will be computed and returned
951 * and nothing will be written to @outbuf.
953 * Converts a single character to UTF-8.
955 * Return value: number of bytes written
958 stringprep_unichar_to_utf8 (uint32_t c
, char *outbuf
)
960 return g_unichar_to_utf8 (c
, outbuf
);
964 * stringprep_utf8_to_ucs4:
965 * @str: a UTF-8 encoded string
966 * @len: the maximum length of @str to use. If @len < 0, then
967 * the string is nul-terminated.
968 * @items_written: location to store the number of characters in the
971 * Convert a string from UTF-8 to a 32-bit fixed width
972 * representation as UCS-4, assuming valid UTF-8 input.
973 * This function does no error checking on the input.
975 * Return value: a pointer to a newly allocated UCS-4 string.
976 * This value must be freed with free().
979 stringprep_utf8_to_ucs4 (const char *str
, ssize_t len
, size_t * items_written
)
981 return g_utf8_to_ucs4_fast (str
, (glong
) len
, (glong
*) items_written
);
985 * stringprep_ucs4_to_utf8:
986 * @str: a UCS-4 encoded string
987 * @len: the maximum length of @str to use. If @len < 0, then
988 * the string is terminated with a 0 character.
989 * @items_read: location to store number of characters read, or %NULL.
990 * @items_written: location to store number of bytes written or %NULL.
991 * The value here stored does not include the trailing 0
994 * Convert a string from a 32-bit fixed width representation as UCS-4.
995 * to UTF-8. The result will be terminated with a 0 byte.
997 * Return value: a pointer to a newly allocated UTF-8 string.
998 * This value must be freed with free(). If an
999 * error occurs, %NULL will be returned and
1003 stringprep_ucs4_to_utf8 (const uint32_t * str
, ssize_t len
,
1004 size_t * items_read
, size_t * items_written
)
1006 return g_ucs4_to_utf8 (str
, len
, (glong
*) items_read
,
1007 (glong
*) items_written
, NULL
);
1011 * stringprep_utf8_nfkc_normalize:
1012 * @str: a UTF-8 encoded string.
1013 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1015 * Converts a string into canonical form, standardizing
1016 * such issues as whether a character with an accent
1017 * is represented as a base character and combining
1018 * accent or as a single precomposed character.
1020 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1021 * differences that do not affect the text content, such as the
1022 * above-mentioned accent representation. It standardizes the
1023 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1024 * the standard forms (in this case DIGIT THREE). Formatting
1025 * information may be lost but for most text operations such
1026 * characters should be considered the same. It returns a result with
1027 * composed forms rather than a maximally decomposed form.
1029 * Return value: a newly allocated string, that is the
1030 * NFKC normalized form of @str.
1033 stringprep_utf8_nfkc_normalize (const char *str
, ssize_t len
)
1035 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);
1039 * stringprep_ucs4_nfkc_normalize:
1040 * @str: a Unicode string.
1041 * @len: length of @str array, or -1 if @str is nul-terminated.
1043 * Converts UCS4 string into UTF-8 and runs
1044 * stringprep_utf8_nfkc_normalize().
1046 * Return value: a newly allocated Unicode string, that is the NFKC
1047 * normalized form of @str.
1050 stringprep_ucs4_nfkc_normalize (uint32_t * str
, ssize_t len
)
1053 uint32_t *result_wc
;
1055 p
= stringprep_ucs4_to_utf8 (str
, len
, 0, 0);
1056 result_wc
= _g_utf8_normalize_wc (p
, -1, G_NORMALIZE_NFKC
);