1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 /* This file contains functions from GLIB including gutf8.c and
25 * gunidecomp.c, all with the following license.
27 * Copyright (C) 1999, 2000 Tom Tromey
28 * Copyright 2000 Red Hat, Inc.
30 * The Gnome Library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Lesser General Public License as
32 * published by the Free Software Foundation; either version 2 of the
33 * License, or (at your option) any later version.
35 * The Gnome Library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Lesser General Public License for more details.
40 * You should have received a copy of the GNU Lesser General Public
41 * License along with the Gnome Library; see the file COPYING.LIB. If not,
42 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
43 * Boston, MA 02111-1307, USA.
49 G_NORMALIZE_NFD
= G_NORMALIZE_DEFAULT
,
50 G_NORMALIZE_DEFAULT_COMPOSE
,
51 G_NORMALIZE_NFC
= G_NORMALIZE_DEFAULT_COMPOSE
,
53 G_NORMALIZE_NFKD
= G_NORMALIZE_ALL
,
54 G_NORMALIZE_ALL_COMPOSE
,
55 G_NORMALIZE_NFKC
= G_NORMALIZE_ALL_COMPOSE
59 #include "gunidecomp.h"
62 #define UTF8_COMPUTE(Char, Mask, Len) \
68 else if ((Char & 0xe0) == 0xc0) \
73 else if ((Char & 0xf0) == 0xe0) \
78 else if ((Char & 0xf8) == 0xf0) \
83 else if ((Char & 0xfc) == 0xf8) \
88 else if ((Char & 0xfe) == 0xfc) \
/*
 * UTF8_LENGTH:
 * @Char: a character code.
 *
 * Expands to the number of bytes (1 to 6) that the UTF-8 encoding of
 * @Char occupies.  The thresholds follow the six-byte form of UTF-8,
 * so every value of 0x4000000 or above yields 6.
 *
 * @Char is evaluated up to five times; do not pass an expression that
 * has side effects.
 */
#define UTF8_LENGTH(Char)               \
  ((Char) < 0x80 ? 1 :                  \
   ((Char) < 0x800 ? 2 :                \
    ((Char) < 0x10000 ? 3 :             \
     ((Char) < 0x200000 ? 4 :           \
      ((Char) < 0x4000000 ? 5 : 6)))))
104 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
105 (Result) = (Chars)[0] & (Mask); \
106 for ((Count) = 1; (Count) < (Len); ++(Count)) \
108 if (((Chars)[(Count)] & 0xc0) != 0x80) \
114 (Result) |= ((Chars)[(Count)] & 0x3f); \
/*
 * UNICODE_VALID:
 * @Char: a character code.
 *
 * Expands to a nonzero value when @Char is below 0x110000, lies
 * outside the range 0xD800..0xDFFF, and is neither 0xFFFE nor 0xFFFF;
 * expands to 0 otherwise.
 *
 * @Char is evaluated several times; do not pass an expression that
 * has side effects.
 */
#define UNICODE_VALID(Char)                     \
  ((Char) < 0x110000 &&                         \
   ((Char) < 0xD800 || (Char) >= 0xE000) &&     \
   (Char) != 0xFFFE && (Char) != 0xFFFF)
122 static const char utf8_skip_data
[256] = {
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
133 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
135 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
137 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
140 static const char *const g_utf8_skip
= utf8_skip_data
;
/*
 * g_utf8_next_char:
 * @p: pointer to the first byte of a UTF-8 encoded character.
 *
 * Expands to a const char * that is @p advanced by the g_utf8_skip
 * entry for the byte at @p.  Only that first byte is read; the
 * following bytes are not inspected.
 *
 * The whole expansion is parenthesized so that a postfix operator
 * applied to the macro (for example g_utf8_next_char (p)[0]) acts on
 * the resulting pointer and not on the inner expression.  @p is
 * evaluated twice; do not pass an expression that has side effects.
 */
#define g_utf8_next_char(p) \
  ((const char *) ((p) + g_utf8_skip[*(const unsigned char *) (p)]))
145 * stringprep_utf8_to_unichar:
146 * @p: a pointer to Unicode character encoded as UTF-8
148 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
149 * If @p does not point to a valid UTF-8 encoded character, results are
152 * Return value: the resulting character
155 stringprep_utf8_to_unichar (const char *p
)
157 int i
, mask
= 0, len
;
159 unsigned char c
= (unsigned char) *p
;
161 UTF8_COMPUTE (c
, mask
, len
);
163 return (uint32_t) -1;
164 UTF8_GET (result
, p
, i
, mask
, len
);
/*
 * CC:
 * @Page: index into combining_class_table.
 * @Char: index within the selected cclass_data row.
 *
 * Two-level table lookup.  When combining_class_table[@Page] is at
 * least G_UNICODE_MAX_TABLE_INDEX, the amount by which it exceeds
 * that constant is the result for every @Char on the page; otherwise
 * the entry selects a row of cclass_data, which is indexed by @Char.
 */
#define CC(Page, Char)                                                  \
  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)          \
   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX)         \
   : (cclass_data[combining_class_table[Page]][Char]))

/*
 * COMBINING_CLASS:
 * @Char: a character code.
 *
 * Expands to 0 when @Char is above G_UNICODE_LAST_CHAR; otherwise to
 * the CC() lookup with the page taken from the bits above the low
 * eight and the in-page index taken from the low eight bits.
 *
 * @Char is evaluated more than once; do not pass an expression that
 * has side effects.
 */
#define COMBINING_CLASS(Char)                                           \
  (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
178 * g_unicode_canonical_ordering:
179 * @string: a UCS-4 encoded string.
180 * @len: the maximum length of @string to use.
182 * Computes the canonical ordering of a string in-place.
183 * This rearranges decomposed characters in the string
184 * according to their combining classes. See the Unicode
185 * manual for more information.
188 g_unicode_canonical_ordering (uint32_t *string
, size_t len
)
197 last
= COMBINING_CLASS (string
[0]);
198 for (i
= 0; i
< len
- 1; ++i
)
200 int next
= COMBINING_CLASS (string
[i
+ 1]);
201 if (next
!= 0 && last
> next
)
204 /* Percolate item leftward through string. */
205 for (j
= i
; j
> 0; --j
)
208 if (COMBINING_CLASS (string
[j
]) <= next
)
211 string
[j
+ 1] = string
[j
];
215 /* We're re-entering the loop looking at the old
224 static const unsigned char *
225 find_decomposition (uint32_t ch
, int compat
)
228 int end
= sizeof (decomp_table
) / sizeof ((decomp_table
)[0]);
230 if (ch
>= decomp_table
[start
].ch
&& ch
<= decomp_table
[end
- 1].ch
)
234 int half
= (start
+ end
) / 2;
235 if (ch
== decomp_table
[half
].ch
)
241 offset
= decomp_table
[half
].compat_offset
;
243 offset
= decomp_table
[half
].canon_offset
;
247 offset
= decomp_table
[half
].canon_offset
;
253 &(decomp_expansion_string
254 [decomp_table
[half
].expansion_offset
+ offset
]);
256 else if (half
== start
)
258 else if (ch
> decomp_table
[half
].ch
)
/*
 * CI:
 * @Page: index into compose_table.
 * @Char: index within the selected compose_data row.
 *
 * Two-level table lookup.  When compose_table[@Page] is at least
 * G_UNICODE_MAX_TABLE_INDEX, the amount by which it exceeds that
 * constant is the result for every @Char on the page; otherwise the
 * entry selects a row of compose_data, which is indexed by @Char.
 */
#define CI(Page, Char)                                          \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)          \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)         \
   : (compose_data[compose_table[Page]][Char]))

/*
 * COMPOSE_INDEX:
 * @Char: a character code.
 *
 * Expands to 0 when @Char is above G_UNICODE_LAST_CHAR; otherwise to
 * the CI() lookup with the page taken from the bits above the low
 * eight and the in-page index taken from the low eight bits.
 *
 * @Char is evaluated more than once; do not pass an expression that
 * has side effects.
 */
#define COMPOSE_INDEX(Char)                                             \
  (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
277 combine (uint32_t a
, uint32_t b
, uint32_t *result
)
279 int index_a
, index_b
;
281 index_a
= COMPOSE_INDEX (a
);
282 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
284 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
287 compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
294 index_b
= COMPOSE_INDEX (b
);
295 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
298 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
301 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
308 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
309 && index_b
>= COMPOSE_SECOND_START
310 && index_b
< COMPOSE_SECOND_SINGLE_START
)
313 compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
-
314 COMPOSE_SECOND_START
];
327 _g_utf8_normalize_wc (const char *str
, ssize_t max_len
, GNormalizeMode mode
)
333 int do_compat
= (mode
== G_NORMALIZE_NFKC
|| mode
== G_NORMALIZE_NFKD
);
334 int do_compose
= (mode
== G_NORMALIZE_NFC
|| mode
== G_NORMALIZE_NFKC
);
338 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
340 uint32_t wc
= stringprep_utf8_to_unichar (p
);
342 const unsigned char *decomp
= find_decomposition (wc
, do_compat
);
347 /* We store as a double-nul terminated string. */
348 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
355 p
= g_utf8_next_char (p
);
358 wc_buffer
= malloc (sizeof (uint32_t) * (n_wc
+ 1));
363 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
365 uint32_t wc
= stringprep_utf8_to_unichar (p
);
366 const unsigned char *decomp
;
368 size_t old_n_wc
= n_wc
;
370 decomp
= find_decomposition (wc
, do_compat
);
375 /* We store as a double-nul terminated string. */
376 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
377 wc_buffer
[n_wc
++] = (decomp
[len
] << 8 | decomp
[len
+ 1]);
380 wc_buffer
[n_wc
++] = wc
;
384 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
388 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
390 last_start
= old_n_wc
;
394 p
= g_utf8_next_char (p
);
399 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
406 /* All decomposed and reordered */
409 if (do_compose
&& n_wc
> 0)
415 for (i
= 0; i
< n_wc
; i
++)
417 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
420 (last_cc
== 0 || last_cc
!= cc
) &&
421 combine (wc_buffer
[last_start
], wc_buffer
[i
],
422 &wc_buffer
[last_start
]))
424 for (j
= i
+ 1; j
< n_wc
; j
++)
425 wc_buffer
[j
- 1] = wc_buffer
[j
];
432 last_cc
= COMBINING_CLASS (wc_buffer
[i
- 1]);
450 * stringprep_unichar_to_utf8:
451 * @c: an ISO 10646 character code
452 * @outbuf: output buffer, must have at least 6 bytes of space.
453 * If %NULL, the length will be computed and returned
454 * and nothing will be written to @outbuf.
456 * Converts a single character to UTF-8.
458 * Return value: number of bytes written
461 stringprep_unichar_to_utf8 (uint32_t c
, char *outbuf
)
477 else if (c
< 0x10000)
482 else if (c
< 0x200000)
487 else if (c
< 0x4000000)
500 for (i
= len
- 1; i
> 0; --i
)
502 outbuf
[i
] = (c
& 0x3f) | 0x80;
505 outbuf
[0] = c
| first
;
512 * stringprep_utf8_to_ucs4:
513 * @str: a UTF-8 encoded string
514 * @len: the maximum length of @str to use. If @len < 0, then
515 * the string is nul-terminated.
516 * @items_written: location to store the number of characters in the
519 * Convert a string from UTF-8 to a 32-bit fixed width
520 * representation as UCS-4, assuming valid UTF-8 input.
521 * This function does no error checking on the input.
523 * Return value: a pointer to a newly allocated UCS-4 string.
524 * This value must be freed with free().
527 stringprep_utf8_to_ucs4 (const char *str
, ssize_t len
, size_t * items_written
)
540 p
= g_utf8_next_char (p
);
546 while (p
< str
+ len
&& *p
)
548 p
= g_utf8_next_char (p
);
553 result
= malloc (sizeof (uint32_t) * (n_chars
+ 1));
556 for (i
= 0; i
< n_chars
; i
++)
558 uint32_t wc
= ((const unsigned char *) p
)[0];
593 for (j
= 1; j
< charlen
; j
++)
596 wc
|= ((const unsigned char *) p
)[j
] & 0x3f;
612 * stringprep_ucs4_to_utf8:
613 * @str: a UCS-4 encoded string
614 * @len: the maximum length of @str to use. If @len < 0, then
615 * the string is terminated with a 0 character.
616 * @items_read: location to store number of characters read, or %NULL.
617 * @items_written: location to store number of bytes written or %NULL.
618 * The value here stored does not include the trailing 0
621 * Convert a string from a 32-bit fixed width representation as UCS-4
622 * to UTF-8. The result will be terminated with a 0 byte.
624 * Return value: a pointer to a newly allocated UTF-8 string.
625 * This value must be freed with free(). If an
626 * error occurs, %NULL will be returned and
630 stringprep_ucs4_to_utf8 (const uint32_t *str
, ssize_t len
,
631 size_t * items_read
, size_t * items_written
)
639 for (i
= 0; len
< 0 || i
< len
; i
++)
644 if (str
[i
] >= 0x80000000)
652 result_length
+= UTF8_LENGTH (str
[i
]);
655 result
= malloc (result_length
+ 1);
659 while (p
< result
+ result_length
)
660 p
+= stringprep_unichar_to_utf8 (str
[i
++], p
);
665 *items_written
= p
- result
;
676 * @str: a UTF-8 encoded string.
677 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
678 * @mode: the type of normalization to perform.
680 * Converts a string into canonical form, standardizing
681 * such issues as whether a character with an accent
682 * is represented as a base character and combining
683 * accent or as a single precomposed character. You
684 * should generally call g_utf8_normalize() before
685 * comparing two Unicode strings.
687 * The normalization mode %G_NORMALIZE_DEFAULT only
688 * standardizes differences that do not affect the
689 * text content, such as the above-mentioned accent
690 * representation. %G_NORMALIZE_ALL also standardizes
691 * the "compatibility" characters in Unicode, such
692 * as SUPERSCRIPT THREE to the standard forms
693 * (in this case DIGIT THREE). Formatting information
694 * may be lost but for most text operations such
695 * characters should be considered the same.
696 * For example, g_utf8_collate() normalizes
697 * with %G_NORMALIZE_ALL as its first step.
699 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
700 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
701 * but return a result with composed forms rather
702 * than a maximally decomposed form. This is often
703 * useful if you intend to convert the string to
704 * a legacy encoding or pass it to a system with
705 * less capable Unicode handling.
707 * Return value: a newly allocated string, that is the
708 * normalized form of @str.
711 g_utf8_normalize (const char *str
, ssize_t len
, GNormalizeMode mode
)
713 uint32_t *result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
716 result
= stringprep_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
);
723 * stringprep_utf8_nfkc_normalize:
724 * @str: a UTF-8 encoded string.
725 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
727 * Converts a string into canonical form, standardizing
728 * such issues as whether a character with an accent
729 * is represented as a base character and combining
730 * accent or as a single precomposed character. You
731 * should generally call g_utf8_normalize() before
732 * comparing two Unicode strings.
734 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
735 * differences that do not affect the text content, such as the
736 * above-mentioned accent representation. It standardizes the
737 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
738 * the standard forms (in this case DIGIT THREE). Formatting
739 * information may be lost but for most text operations such
740 * characters should be considered the same. It returns a result with
741 * composed forms rather than a maximally decomposed form.
743 * Return value: a newly allocated string, that is the
744 * NFKC normalized form of @str.
747 stringprep_utf8_nfkc_normalize (const char *str
, ssize_t len
)
749 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);
753 * stringprep_ucs4_nfkc_normalize:
754 * @str: a Unicode string.
755 * @len: length of @str array, or -1 if @str is nul-terminated.
757 * Converts UCS4 string into UTF-8 and runs
758 * stringprep_utf8_nfkc_normalize().
760 * Return value: a newly allocated Unicode string, that is the NFKC
761 * normalized form of @str.
764 stringprep_ucs4_nfkc_normalize (uint32_t *str
, ssize_t len
)
769 p
= stringprep_ucs4_to_utf8 (str
, len
, 0, 0);
770 result_wc
= _g_utf8_normalize_wc (p
, -1, G_NORMALIZE_NFKC
);