*** empty log message ***
[libidn.git] / nfkc.c
blobe47136154ea737285905d4db4719af7f87af5f95
1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /* This file contains functions from GLIB including gutf8.c and
25 * gunidecomp.c, all with the following license.
27 * Copyright (C) 1999, 2000 Tom Tromey
28 * Copyright 2000 Red Hat, Inc.
30 * The Gnome Library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Lesser General Public License as
32 * published by the Free Software Foundation; either version 2 of the
33 * License, or (at your option) any later version.
35 * The Gnome Library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Lesser General Public License for more details.
40 * You should have received a copy of the GNU Lesser General Public
41 * License along with the Gnome Library; see the file COPYING.LIB. If not,
42 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
43 * Boston, MA 02111-1307, USA.
/* Normalization forms accepted by the normalizer in this file.  Each
   Unicode-style name (NFD, NFC, NFKD, NFKC) is an alias for the
   GLib-style name declared immediately before it.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
59 #include "gunidecomp.h"
60 #include "gunicomp.h"
/* Classify the UTF-8 lead byte Char.  Len receives the total length of
   the sequence in bytes (1..6), or -1 when Char cannot start a
   sequence.  For a valid lead byte, Mask receives the bit mask that
   selects the payload bits of that byte; Mask is left untouched when
   Len is set to -1.
   The body is wrapped in do/while (0) so the macro expands to exactly
   one statement and can be used safely in an unbraced if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
/* Number of bytes (1..6) needed to encode the code point Char in
   UTF-8.  Char may be evaluated more than once.  */
#define UTF8_LENGTH(Char) \
  ((Char) >= 0x4000000 ? 6 : \
   ((Char) >= 0x200000 ? 5 : \
    ((Char) >= 0x10000 ? 4 : \
     ((Char) >= 0x800 ? 3 : \
      ((Char) >= 0x80 ? 2 : 1)))))
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
   Mask selects the payload bits of the lead byte; Count is a scratch
   loop variable.  If any trailing byte is not of the form 10xxxxxx,
   Result is set to -1 and decoding stops.
   The body is wrapped in do/while (0) so the macro expands to exactly
   one statement; the inner `break' only leaves the for loop.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
/* Non-zero unless Char lies at or above 0x110000, inside the
   0xD800..0xDFFF range, or equals 0xFFFE or 0xFFFF.  Char may be
   evaluated more than once.  */
#define UNICODE_VALID(Char) \
  (!((Char) >= 0x110000 \
     || ((Char) >= 0xD800 && (Char) < 0xE000) \
     || (Char) == 0xFFFE \
     || (Char) == 0xFFFF))
/* Byte length of a UTF-8 sequence, indexed by its first byte.  One row
   per high nibble.  Bytes 0x80..0xBF, 0xFE and 0xFF map to 1.  */
static const char utf8_skip_data[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias for the table above, used by g_utf8_next_char.  */
static const char *const g_utf8_skip = utf8_skip_data;

/* Advance p past one UTF-8 sequence, using only its first byte to
   decide how far to step.  p is evaluated twice.  */
#define g_utf8_next_char(p) \
  ((const char *) ((p) + g_utf8_skip[*(const unsigned char *) (p)]))
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 sequence starting at @p into a single code point.
 * The sequence length (1 to 6 bytes) is taken from the first byte.
 * No check for overlong forms or for the resulting code point range
 * is made.
 *
 * Return value: the decoded character, or (unsigned long) -1 when the
 * first byte cannot start a sequence or a following byte is not a
 * continuation byte.
 **/
unsigned long
stringprep_utf8_to_unichar (const char *p)
{
  const unsigned char *bytes = (const unsigned char *) p;
  unsigned char lead = bytes[0];
  unsigned long value;
  int seq_len;
  int payload_mask;
  int k;

  /* Sequence length and lead-byte payload mask from the first byte.  */
  if (lead < 0x80)
    {
      seq_len = 1;
      payload_mask = 0x7f;
    }
  else if ((lead & 0xe0) == 0xc0)
    {
      seq_len = 2;
      payload_mask = 0x1f;
    }
  else if ((lead & 0xf0) == 0xe0)
    {
      seq_len = 3;
      payload_mask = 0x0f;
    }
  else if ((lead & 0xf8) == 0xf0)
    {
      seq_len = 4;
      payload_mask = 0x07;
    }
  else if ((lead & 0xfc) == 0xf8)
    {
      seq_len = 5;
      payload_mask = 0x03;
    }
  else if ((lead & 0xfe) == 0xfc)
    {
      seq_len = 6;
      payload_mask = 0x01;
    }
  else
    return (unsigned long) -1;

  /* Fold in six payload bits from every continuation byte.  */
  value = lead & payload_mask;
  for (k = 1; k < seq_len; k++)
    {
      unsigned char cont = bytes[k];

      if ((cont & 0xc0) != 0x80)
        return (unsigned long) -1;

      value = (value << 6) | (cont & 0x3f);
    }

  return value;
}
/* Look up the combining class for the character at offset Char inside
   256-character page Page.  An entry of combining_class_table at or
   above G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole
   page; a smaller entry is a row index into cclass_data.  */
#define CC(Page, Char) \
  ((combining_class_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table[Page]][Char]) \
   : (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of Char; 0 for anything above G_UNICODE_LAST_CHAR.
   Char is evaluated more than once.  */
#define COMBINING_CLASS(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CC ((Char) >> 8, (Char) & 0xff) : 0)
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  See the Unicode
 * manual for more information.
 *
 * Characters whose combining class is 0 are never moved, and a
 * character is never moved past one with an equal or lower class,
 * so the relative order of equal-class characters is kept.
 * Strings shorter than two characters are left untouched.
 **/
static void
g_unicode_canonical_ordering (unsigned long *string, size_t len)
{
  size_t i;
  int swap = 1;

  /* Nothing to reorder; this also keeps `len - 1' below from wrapping
     around when len is 0.  */
  if (len < 2)
    return;

  while (swap)
    {
      int last;
      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);
          if (next != 0 && last > next)
            {
              size_t j;
              /* Percolate item leftward through string.  Start at
                 i + 1 and compare with the left neighbour so that
                 position 0 takes part in the exchange as well.  */
              for (j = i + 1; j > 0; --j)
                {
                  unsigned long t;
                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again.  */
              next = last;
            }
          last = next;
        }
    }
}
224 static const unsigned char *
225 find_decomposition (unsigned long ch, int compat)
227 int start = 0;
228 int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
230 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
232 while (1)
234 int half = (start + end) / 2;
235 if (ch == decomp_table[half].ch)
237 int offset;
239 if (compat)
241 offset = decomp_table[half].compat_offset;
242 if (offset == 0xff)
243 offset = decomp_table[half].canon_offset;
245 else
247 offset = decomp_table[half].canon_offset;
248 if (offset == 0xff)
249 return NULL;
252 return
253 &(decomp_expansion_string
254 [decomp_table[half].expansion_offset + offset]);
256 else if (half == start)
257 break;
258 else if (ch > decomp_table[half].ch)
259 start = half;
260 else
261 end = half;
265 return NULL;
/* Look up the composition index for the character at offset Char
   inside 256-character page Page.  An entry of compose_table at or
   above G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole
   page; a smaller entry is a row index into compose_data.  */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of Char; 0 for anything above G_UNICODE_LAST_CHAR.
   Char is evaluated more than once.  */
#define COMPOSE_INDEX(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CI ((Char) >> 8, (Char) & 0xff) : 0)
276 static int
277 combine (unsigned long a, unsigned long b, unsigned long *result)
279 int index_a, index_b;
281 index_a = COMPOSE_INDEX (a);
282 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
284 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
286 *result =
287 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
288 return 1;
290 else
291 return 0;
294 index_b = COMPOSE_INDEX (b);
295 if (index_b >= COMPOSE_SECOND_SINGLE_START)
297 if (a ==
298 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
300 *result =
301 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
302 return 1;
304 else
305 return 0;
308 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
309 && index_b >= COMPOSE_SECOND_START
310 && index_a < COMPOSE_SECOND_SINGLE_START)
312 unsigned long res =
313 compose_array[index_a - COMPOSE_FIRST_START][index_b -
314 COMPOSE_SECOND_START];
316 if (res)
318 *result = res;
319 return 1;
323 return 0;
326 static unsigned long *
327 _g_utf8_normalize_wc (const char *str, ssize_t max_len, GNormalizeMode mode)
329 size_t n_wc;
330 unsigned long *wc_buffer;
331 const char *p;
332 size_t last_start;
333 int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
334 int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
336 n_wc = 0;
337 p = str;
338 while ((max_len < 0 || p < str + max_len) && *p)
340 unsigned long wc = stringprep_utf8_to_unichar (p);
342 const unsigned char *decomp = find_decomposition (wc, do_compat);
344 if (decomp)
346 int len;
347 /* We store as a double-nul terminated string. */
348 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
350 n_wc += len / 2;
352 else
353 n_wc++;
355 p = g_utf8_next_char (p);
358 wc_buffer = malloc (sizeof (unsigned long) * (n_wc + 1));
360 last_start = 0;
361 n_wc = 0;
362 p = str;
363 while ((max_len < 0 || p < str + max_len) && *p)
365 unsigned long wc = stringprep_utf8_to_unichar (p);
366 const unsigned char *decomp;
367 int cc;
368 size_t old_n_wc = n_wc;
370 decomp = find_decomposition (wc, do_compat);
372 if (decomp)
374 int len;
375 /* We store as a double-nul terminated string. */
376 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
377 wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
379 else
380 wc_buffer[n_wc++] = wc;
382 if (n_wc > 0)
384 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
386 if (cc == 0)
388 g_unicode_canonical_ordering (wc_buffer + last_start,
389 n_wc - last_start);
390 last_start = old_n_wc;
394 p = g_utf8_next_char (p);
397 if (n_wc > 0)
399 g_unicode_canonical_ordering (wc_buffer + last_start,
400 n_wc - last_start);
401 last_start = n_wc;
404 wc_buffer[n_wc] = 0;
406 /* All decomposed and reordered */
409 if (do_compose && n_wc > 0)
411 size_t i, j;
412 int last_cc = 0;
413 last_start = 0;
415 for (i = 0; i < n_wc; i++)
417 int cc = COMBINING_CLASS (wc_buffer[i]);
419 if (i > 0 &&
420 (last_cc == 0 || last_cc != cc) &&
421 combine (wc_buffer[last_start], wc_buffer[i],
422 &wc_buffer[last_start]))
424 for (j = i + 1; j < n_wc; j++)
425 wc_buffer[j - 1] = wc_buffer[j];
426 n_wc--;
427 i--;
429 if (i == last_start)
430 last_cc = 0;
431 else
432 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
434 continue;
437 if (cc == 0)
438 last_start = i;
440 last_cc = cc;
444 wc_buffer[n_wc] = 0;
446 return wc_buffer;
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes the single character @c as UTF-8, using between one and
 * six bytes.  No terminating zero byte is written.
 *
 * Return value: number of bytes written (or that would be written
 * when @outbuf is %NULL).
 **/
int
stringprep_unichar_to_utf8 (unsigned long c, char *outbuf)
{
  int nbytes;
  int lead;
  int pos;

  /* Pick the sequence length and the marker bits of its first byte.  */
  if (c < 0x80)
    {
      nbytes = 1;
      lead = 0;
    }
  else if (c < 0x800)
    {
      nbytes = 2;
      lead = 0xc0;
    }
  else if (c < 0x10000)
    {
      nbytes = 3;
      lead = 0xe0;
    }
  else if (c < 0x200000)
    {
      nbytes = 4;
      lead = 0xf0;
    }
  else if (c < 0x4000000)
    {
      nbytes = 5;
      lead = 0xf8;
    }
  else
    {
      nbytes = 6;
      lead = 0xfc;
    }

  if (!outbuf)
    return nbytes;

  /* Fill continuation bytes from the end, six bits at a time; what is
     left of c afterwards belongs in the first byte.  */
  pos = nbytes;
  while (--pos > 0)
    {
      outbuf[pos] = (c & 0x3f) | 0x80;
      c >>= 6;
    }
  outbuf[0] = c | lead;

  return nbytes;
}
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4, assuming valid UTF-8 input.
 * This function does no error checking on the input.
 * A zero byte in @str ends the conversion even when @len is
 * non-negative.
 *
 * Return value: a pointer to a newly allocated UCS-4 string,
 *               terminated by a 0 character, or %NULL if memory
 *               allocation fails (in which case @items_written is
 *               not modified).
 *               This value must be freed with free().
 **/
unsigned long *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  int j, charlen;
  unsigned long *result;
  int n_chars, i;
  const char *p;

  /* First pass: count characters so the result can be sized.  */
  p = str;
  n_chars = 0;
  while ((len < 0 || p < str + len) && *p)
    {
      p = g_utf8_next_char (p);
      ++n_chars;
    }

  result = malloc (sizeof (unsigned long) * (n_chars + 1));
  if (!result)
    return NULL;

  /* Second pass: decode each character.  */
  p = str;
  for (i = 0; i < n_chars; i++)
    {
      unsigned long wc = ((const unsigned char *) p)[0];

      if (wc < 0x80)
        {
          result[i] = wc;
          p++;
        }
      else
        {
          /* The first byte gives the sequence length and the number
             of payload bits it carries itself.  */
          if (wc < 0xe0)
            {
              charlen = 2;
              wc &= 0x1f;
            }
          else if (wc < 0xf0)
            {
              charlen = 3;
              wc &= 0x0f;
            }
          else if (wc < 0xf8)
            {
              charlen = 4;
              wc &= 0x07;
            }
          else if (wc < 0xfc)
            {
              charlen = 5;
              wc &= 0x03;
            }
          else
            {
              charlen = 6;
              wc &= 0x01;
            }

          for (j = 1; j < charlen; j++)
            {
              wc <<= 6;
              wc |= ((const unsigned char *) p)[j] & 0x3f;
            }

          result[i] = wc;
          p += charlen;
        }
    }
  result[i] = 0;

  if (items_written)
    *items_written = i;

  return result;
}
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4.
 * to UTF-8. The result will be terminated with a 0 byte.
 * A 0 character in @str ends the conversion even when @len is
 * non-negative.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free().
 *               If @str contains a value of 0x80000000 or above,
 *               %NULL is returned and @items_read is set to the
 *               index of that value.  If memory allocation fails,
 *               %NULL is returned and neither @items_read nor
 *               @items_written is modified.
 **/
char *
stringprep_ucs4_to_utf8 (const unsigned long *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  /* First pass: validate the input and add up the encoded size.  */
  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
        break;

      /* Not encodable in six bytes; err_out reports the position.  */
      if (str[i] >= 0x80000000)
        goto err_out;

      result_length += UTF8_LENGTH (str[i]);
    }

  result = malloc (result_length + 1);
  if (!result)
    return NULL;
  p = result;

  /* Second pass: encode until the computed size has been produced.  */
  i = 0;
  while (p < result + result_length)
    p += stringprep_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
675 * g_utf8_normalize:
676 * @str: a UTF-8 encoded string.
677 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
678 * @mode: the type of normalization to perform.
680 * Converts a string into canonical form, standardizing
681 * such issues as whether a character with an accent
682 * is represented as a base character and combining
683 * accent or as a single precomposed character. You
684 * should generally call g_utf8_normalize() before
685 * comparing two Unicode strings.
687 * The normalization mode %G_NORMALIZE_DEFAULT only
688 * standardizes differences that do not affect the
689 * text content, such as the above-mentioned accent
690 * representation. %G_NORMALIZE_ALL also standardizes
691 * the "compatibility" characters in Unicode, such
692 * as SUPERSCRIPT THREE to the standard forms
693 * (in this case DIGIT THREE). Formatting information
694 * may be lost but for most text operations such
695 * characters should be considered the same.
696 * For example, g_utf8_collate() normalizes
697 * with %G_NORMALIZE_ALL as its first step.
699 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
700 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
701 * but returned a result with composed forms rather
702 * than a maximally decomposed form. This is often
703 * useful if you intend to convert the string to
704 * a legacy encoding or pass it to a system with
705 * less capable Unicode handling.
707 * Return value: a newly allocated string, that is the
708 * normalized form of @str.
710 static char *
711 g_utf8_normalize (const char *str, ssize_t len, GNormalizeMode mode)
713 unsigned long *result_wc = _g_utf8_normalize_wc (str, len, mode);
714 char *result;
716 result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
717 free (result_wc);
719 return result;
723 * stringprep_utf8_nfkc_normalize:
724 * @str: a UTF-8 encoded string.
725 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
727 * Converts a string into canonical form, standardizing
728 * such issues as whether a character with an accent
729 * is represented as a base character and combining
730 * accent or as a single precomposed character. You
731 * should generally call g_utf8_normalize() before
732 * comparing two Unicode strings.
734 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
735 * differences that do not affect the text content, such as the
736 * above-mentioned accent representation. It standardizes the
737 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
738 * the standard forms (in this case DIGIT THREE). Formatting
739 * information may be lost but for most text operations such
740 * characters should be considered the same. It returns a result with
741 * composed forms rather than a maximally decomposed form.
743 * Return value: a newly allocated string, that is the
744 * NFKC normalized form of @str.
746 char *
747 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
749 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
753 * stringprep_ucs4_nfkc_normalize:
754 * @str: a Unicode string.
755 * @len: length of @str array, or -1 if @str is nul-terminated.
757 * Converts UCS4 string into UTF-8 and runs
758 * stringprep_utf8_nfkc_normalize().
760 * Return value: a newly allocated Unicode string, that is the NFKC
761 * normalized form of @str.
763 unsigned long *
764 stringprep_ucs4_nfkc_normalize (unsigned long *str, ssize_t len)
766 char *p;
767 unsigned long *result_wc;
769 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
770 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
771 free (p);
773 return result_wc;