Added copying conditions.
[libidn.git] / nfkc.c
blob6014358aac3919be55e1a5fe7f595c40cd31cf37
/* nfkc.c	unicode normalization utilities
 * Copyright (C) 2002  Simon Josefsson
 *
 * This file is part of libstringprep.
 *
 * Libstringprep is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libstringprep is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libstringprep; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all with the following license.
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
/* Normalization form selector.  Each G_NORMALIZE_NF* name is an alias
   that shares the value of the older name listed just before it.  In
   this file the *_COMPOSE / NFC / NFKC values request a composition
   pass, and the ALL* / NFKD / NFKC values request compatibility
   decompositions. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
61 #include "gunidecomp.h"
62 #include "gunicomp.h"
64 #include <stdlib.h>
/* Classify the UTF-8 lead byte Char.  Len receives the length in bytes
   (1..6) of the sequence that Char introduces, and Mask the bit mask
   that selects the payload bits of the lead byte.  If Char cannot start
   a sequence (0x80..0xBF, 0xFE, 0xFF), Len is set to -1 and Mask is
   left untouched.

   Char is evaluated several times and should be an unsigned value.
   The body is wrapped in do/while (0) so the macro expands to exactly
   one statement and is safe in an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
/* Number of bytes (1..6) that the code point Char occupies when written
   as UTF-8.  Char is evaluated several times; values below zero fall
   into the first branch and yield 1. */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1 :                          \
   ((Char) < 0x800 ? 2 :                        \
    ((Char) < 0x10000 ? 3 :                     \
     ((Char) < 0x200000 ? 4 :                   \
      ((Char) < 0x4000000 ? 5 : 6)))))
/* Decode one UTF-8 sequence of Len bytes starting at Chars into Result.
   Mask selects the payload bits of the lead byte (as produced by
   UTF8_COMPUTE); Count is a scratch loop variable.  Each following byte
   must have the form 10xxxxxx; at the first one that does not, Result
   is set to -1 and decoding stops.

   The `break' leaves the inner for loop, not the enclosing
   do/while (0), which only exists to make the macro a single
   statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
/* True when Char is below 0x110000, is not in the surrogate range
   0xD800..0xDFFF, and is neither 0xFFFE nor 0xFFFF.  Char is evaluated
   several times.  No lower bound is tested, so a negative value of a
   signed type is accepted. */
#define UNICODE_VALID(Char)                     \
  ((Char) < 0x110000 &&                         \
   ((Char) < 0xD800 || (Char) >= 0xE000) &&     \
   (Char) != 0xFFFE && (Char) != 0xFFFF)
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
   Bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF) map to 1
   so that a scan always moves forward. */
static const char utf8_skip_data[256] = {
  /* 0x00..0xBF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0..0xDF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0..0xFF */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

static const char *const g_utf8_skip = utf8_skip_data;

/* Advance P past the UTF-8 sequence it points at, using only the lead
   byte to decide how far to move.  The continuation bytes are not
   inspected, so on a truncated sequence the result can point beyond
   the string's terminating NUL.  P is evaluated twice. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * The first byte decides how many bytes are consumed (up to six).
 * Overlong forms and out-of-range values are not rejected.
 *
 * Return value: the resulting character, or -1 if the first byte
 * cannot start a UTF-8 sequence or one of the bytes that should follow
 * it is not a continuation byte.
 **/
long
stringprep_utf8_to_unichar (const char *p)
{
  int i, mask = 0, len;
  long result;
  unsigned char c = (unsigned char) *p;

  /* Derive the sequence length and lead-byte mask from the first byte. */
  UTF8_COMPUTE (c, mask, len);
  if (len == -1)
    return (long) -1;

  /* Fold the continuation bytes in; sets result to -1 on a bad byte. */
  UTF8_GET (result, p, i, mask, len);

  return result;
}
/* Two-level lookup of a combining class.  combining_class_table is
   indexed by the high bits of the code point (Page).  An entry at or
   above G_UNICODE_MAX_TABLE_INDEX stores the class directly, offset by
   that constant; a smaller entry selects the row of cclass_data that
   is then indexed by the low byte (Char). */
#define CC(Page, Char)                                          \
  ((combining_class_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table[(Page)]][(Char)]))

/* Combining class of the code point Char; 0 for anything outside
   0..G_UNICODE_LAST_CHAR.  The lower bound matters because the UTF-8
   decoder in this file reports malformed input as -1, which would
   otherwise become a negative page index.  Char is evaluated several
   times. */
#define COMBINING_CLASS(Char)                                   \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR))               \
   ? 0 : CC ((Char) >> 8, (Char) & 0xff))
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes: a character with a non-zero
 * class is moved left past every preceding character whose class is
 * greater.  Characters of class 0 are never moved and nothing is moved
 * across them.  Strings shorter than two characters are left alone.
 **/
static void
g_unicode_canonical_ordering (long *string, size_t len)
{
  size_t i;
  int swap = 1;

  /* Nothing to reorder; also keeps `len - 1' below from wrapping
     around when len is 0. */
  if (len < 2)
    return;

  /* Repeat until a full pass makes no exchange. */
  while (swap)
    {
      int last;
      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);
          if (next != 0 && last > next)
            {
              size_t j;
              /* Percolate item leftward through string.  The walk
                 goes all the way down to the pair (0, 1) so that an
                 out-of-order mark can also pass the first element. */
              for (j = i + 1; j > 0; --j)
                {
                  long t;
                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again. */
              next = last;
            }
          last = next;
        }
    }
}
228 static const unsigned char *
229 find_decomposition (long ch, int compat)
231 int start = 0;
232 int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
234 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
236 while (1)
238 int half = (start + end) / 2;
239 if (ch == decomp_table[half].ch)
241 int offset;
243 if (compat)
245 offset = decomp_table[half].compat_offset;
246 if (offset == 0xff)
247 offset = decomp_table[half].canon_offset;
249 else
251 offset = decomp_table[half].canon_offset;
252 if (offset == 0xff)
253 return NULL;
256 return
257 &(decomp_expansion_string
258 [decomp_table[half].expansion_offset + offset]);
260 else if (half == start)
261 break;
262 else if (ch > decomp_table[half].ch)
263 start = half;
264 else
265 end = half;
269 return NULL;
/* Two-level lookup of a composition index, laid out like the combining
   class tables: compose_table is indexed by the high bits of the code
   point (Page); an entry at or above G_UNICODE_MAX_TABLE_INDEX stores
   the value directly, offset by that constant, and a smaller entry
   selects the row of compose_data that is indexed by the low byte
   (Char). */
#define CI(Page, Char)                                  \
  ((compose_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[(Page)]][(Char)]))

/* Composition index of the code point Char; 0 for anything outside
   0..G_UNICODE_LAST_CHAR.  The lower bound keeps the -1 that the UTF-8
   decoder in this file returns for malformed input from being used as
   a negative page index.  Char is evaluated several times. */
#define COMPOSE_INDEX(Char)                             \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR))       \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
280 static int
281 combine (long a, long b, long *result)
283 int index_a, index_b;
285 index_a = COMPOSE_INDEX (a);
286 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
288 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
290 *result =
291 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
292 return 1;
294 else
295 return 0;
298 index_b = COMPOSE_INDEX (b);
299 if (index_b >= COMPOSE_SECOND_SINGLE_START)
301 if (a ==
302 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
304 *result =
305 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
306 return 1;
308 else
309 return 0;
312 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
313 && index_b >= COMPOSE_SECOND_START
314 && index_a < COMPOSE_SECOND_SINGLE_START)
316 long res =
317 compose_array[index_a - COMPOSE_FIRST_START][index_b -
318 COMPOSE_SECOND_START];
320 if (res)
322 *result = res;
323 return 1;
327 return 0;
330 static long *
331 _g_utf8_normalize_wc (const char *str, int max_len, GNormalizeMode mode)
333 size_t n_wc;
334 long *wc_buffer;
335 const char *p;
336 size_t last_start;
337 int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
338 int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
340 n_wc = 0;
341 p = str;
342 while ((max_len < 0 || p < str + max_len) && *p)
344 long wc = stringprep_utf8_to_unichar (p);
346 const unsigned char *decomp = find_decomposition (wc, do_compat);
348 if (decomp)
350 int len;
351 /* We store as a double-nul terminated string. */
352 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
354 n_wc += len / 2;
356 else
357 n_wc++;
359 p = g_utf8_next_char (p);
362 wc_buffer = malloc (sizeof (long) * (n_wc + 1));
364 last_start = 0;
365 n_wc = 0;
366 p = str;
367 while ((max_len < 0 || p < str + max_len) && *p)
369 long wc = stringprep_utf8_to_unichar (p);
370 const unsigned char *decomp;
371 int cc;
372 size_t old_n_wc = n_wc;
374 decomp = find_decomposition (wc, do_compat);
376 if (decomp)
378 int len;
379 /* We store as a double-nul terminated string. */
380 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
381 wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
383 else
384 wc_buffer[n_wc++] = wc;
386 if (n_wc > 0)
388 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
390 if (cc == 0)
392 g_unicode_canonical_ordering (wc_buffer + last_start,
393 n_wc - last_start);
394 last_start = old_n_wc;
398 p = g_utf8_next_char (p);
401 if (n_wc > 0)
403 g_unicode_canonical_ordering (wc_buffer + last_start,
404 n_wc - last_start);
405 last_start = n_wc;
408 wc_buffer[n_wc] = 0;
410 /* All decomposed and reordered */
413 if (do_compose && n_wc > 0)
415 size_t i, j;
416 int last_cc = 0;
417 last_start = 0;
419 for (i = 0; i < n_wc; i++)
421 int cc = COMBINING_CLASS (wc_buffer[i]);
423 if (i > 0 &&
424 (last_cc == 0 || last_cc != cc) &&
425 combine (wc_buffer[last_start], wc_buffer[i],
426 &wc_buffer[last_start]))
428 for (j = i + 1; j < n_wc; j++)
429 wc_buffer[j - 1] = wc_buffer[j];
430 n_wc--;
431 i--;
433 if (i == last_start)
434 last_cc = 0;
435 else
436 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
438 continue;
441 if (cc == 0)
442 last_start = i;
444 last_cc = cc;
448 wc_buffer[n_wc] = 0;
450 return wc_buffer;
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  No terminating zero byte is
 * written.  @c is not validated: any value below 0x80, including a
 * negative one, is treated as a one-byte character.
 *
 * Return value: number of bytes written (or that would be written
 * when @outbuf is %NULL), between 1 and 6.
 **/
int
stringprep_unichar_to_utf8 (long c, char *outbuf)
{
  int len = 0;
  int first;			/* marker bits of the lead byte */
  int i;

  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
    {
      /* Fill from the last byte backwards, six payload bits at a
         time; what is left of c afterwards belongs to the lead
         byte. */
      for (i = len - 1; i > 0; --i)
        {
          outbuf[i] = (c & 0x3f) | 0x80;
          c >>= 6;
        }
      outbuf[0] = c | first;
    }

  return len;
}
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a fixed width representation as
 * UCS-4, stored in an array of long, assuming valid UTF-8 input.
 * This function does no error checking on the input: the length of
 * every character is taken from its first byte alone, so a malformed
 * or truncated sequence can make it read beyond the end of @str.
 *
 * Return value: a pointer to a newly allocated, zero-terminated UCS-4
 *               string.  This value must be freed with free().
 *               %NULL is returned, and @items_written is left
 *               untouched, if memory cannot be allocated.
 **/
long *
stringprep_utf8_to_ucs4 (const char *str, int len, int *items_written)
{
  int j, charlen;
  long *result;
  int n_chars, i;
  const char *p;

  /* Count the characters first so the result can be allocated in one
     piece. */
  p = str;
  n_chars = 0;
  if (len < 0)
    {
      while (*p)
        {
          p = g_utf8_next_char (p);
          ++n_chars;
        }
    }
  else
    {
      while (p < str + len && *p)
        {
          p = g_utf8_next_char (p);
          ++n_chars;
        }
    }

  result = malloc (sizeof (long) * (n_chars + 1));
  if (result == NULL)
    return NULL;

  p = str;
  for (i = 0; i < n_chars; i++)
    {
      long wc = ((const unsigned char *) p)[0];

      if (wc < 0x80)
        {
          result[i] = wc;
          p++;
        }
      else
        {
          /* The first byte gives the sequence length; keep only its
             payload bits. */
          if (wc < 0xe0)
            {
              charlen = 2;
              wc &= 0x1f;
            }
          else if (wc < 0xf0)
            {
              charlen = 3;
              wc &= 0x0f;
            }
          else if (wc < 0xf8)
            {
              charlen = 4;
              wc &= 0x07;
            }
          else if (wc < 0xfc)
            {
              charlen = 5;
              wc &= 0x03;
            }
          else
            {
              charlen = 6;
              wc &= 0x01;
            }

          for (j = 1; j < charlen; j++)
            {
              wc <<= 6;
              wc |= ((const unsigned char *) p)[j] & 0x3f;
            }

          result[i] = wc;
          p += charlen;
        }
    }
  result[i] = 0;

  if (items_written)
    *items_written = i;

  return result;
}
/* This one is kept around for binary backwards compatibility with
   library version CURRENT=1.  It forwards its arguments unchanged to
   stringprep_utf8_to_ucs4() and returns that function's result. */
long *
stringprep_utf8_to_ucs4_fast (const char *str, int len, int *items_written)
{
  return stringprep_utf8_to_ucs4 (str, len, items_written);
}
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 *              On failure this is the number of characters examined
 *              before the problem was found.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.  It is only set on success.
 *
 * Convert a string from a fixed width representation as UCS-4, stored
 * in an array of long, to UTF-8.  The result will be terminated with
 * a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free().  %NULL is
 *               returned if @str contains a value that is negative or
 *               larger than 0x7FFFFFFF, or if memory cannot be
 *               allocated.
 **/
char *
stringprep_ucs4_to_utf8 (const long *str,
                         int len, int *items_read, int *items_written)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  /* First pass: validate and add up the encoded length. */
  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
        break;

      /* Only 0..0x7FFFFFFF can be written as UTF-8.  Both bounds are
         spelled out so the test gives the same answer whether long is
         32 or 64 bits wide. */
      if (str[i] < 0 || str[i] > 0x7fffffffL)
        goto err_out;

      result_length += UTF8_LENGTH (str[i]);
    }

  result = malloc (result_length + 1);
  if (result == NULL)
    goto err_out;
  p = result;

  /* Second pass: encode.  The loop is driven by the byte count, so it
     consumes exactly the characters measured above. */
  i = 0;
  while (p < result + result_length)
    p += stringprep_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = (int) (p - result);

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
691 * g_utf8_normalize:
692 * @str: a UTF-8 encoded string.
693 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
694 * @mode: the type of normalization to perform.
696 * Converts a string into canonical form, standardizing
697 * such issues as whether a character with an accent
698 * is represented as a base character and combining
699 * accent or as a single precomposed character. You
700 * should generally call g_utf8_normalize() before
701 * comparing two Unicode strings.
703 * The normalization mode %G_NORMALIZE_DEFAULT only
704 * standardizes differences that do not affect the
705 * text content, such as the above-mentioned accent
706 * representation. %G_NORMALIZE_ALL also standardizes
707 * the "compatibility" characters in Unicode, such
708 * as SUPERSCRIPT THREE to the standard forms
709 * (in this case DIGIT THREE). Formatting information
710 * may be lost but for most text operations such
711 * characters should be considered the same.
712 * For example, g_utf8_collate() normalizes
713 * with %G_NORMALIZE_ALL as its first step.
715 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
716 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
717 * but returned a result with composed forms rather
718 * than a maximally decomposed form. This is often
719 * useful if you intend to convert the string to
720 * a legacy encoding or pass it to a system with
721 * less capable Unicode handling.
723 * Return value: a newly allocated string, that is the
724 * normalized form of @str.
726 static char *
727 g_utf8_normalize (const char *str, int len, GNormalizeMode mode)
729 long *result_wc = _g_utf8_normalize_wc (str, len, mode);
730 char *result;
732 result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
733 free (result_wc);
735 return result;
738 char *
739 stringprep_utf8_nfkc_normalize (const char *str, int len)
741 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
744 long *
745 stringprep_ucs4_nfkc_normalize (long *str, int len)
747 char *p;
748 long *result_wc;
750 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
751 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
752 free (p);
754 return result_wc;