Add memset.c, an AC_LIBOBJ.
[libidn.git] / nfkc.c
blobe55878b0afc34d0e58dd098bb854440ca7e360cc
/* nfkc.c	unicode normalization utilities
 * Copyright (C) 2002  Simon Josefsson
 *
 * This file is part of libstringprep.
 *
 * Libstringprep is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libstringprep is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libstringprep; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
22 #include "internal.h"
/* This file contains functions from GLIB including gutf8.c and
 * gunidecomp.c, all with the following license.
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
/* Normalization forms accepted by the normalizer in this file.  Each
   G_NORMALIZE_NF* name is an alias for the enumerator declared just
   before it.  */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
59 #include "gunidecomp.h"
60 #include "gunicomp.h"
/* Classify the UTF-8 lead byte Char.  On return Len holds the length of
   the sequence it introduces (1..6) and Mask the bits of the lead byte
   that carry payload; if Char cannot start a sequence, Len is set to -1
   and Mask is left untouched.  Char is evaluated several times, so pass
   a plain variable.  The do/while wrapper makes the expansion a single
   statement, so it is safe in an unbraced if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  do									      \
    {									      \
      if ((Char) < 128)							      \
	{								      \
	  (Len) = 1;							      \
	  (Mask) = 0x7f;						      \
	}								      \
      else if (((Char) & 0xe0) == 0xc0)					      \
	{								      \
	  (Len) = 2;							      \
	  (Mask) = 0x1f;						      \
	}								      \
      else if (((Char) & 0xf0) == 0xe0)					      \
	{								      \
	  (Len) = 3;							      \
	  (Mask) = 0x0f;						      \
	}								      \
      else if (((Char) & 0xf8) == 0xf0)					      \
	{								      \
	  (Len) = 4;							      \
	  (Mask) = 0x07;						      \
	}								      \
      else if (((Char) & 0xfc) == 0xf8)					      \
	{								      \
	  (Len) = 5;							      \
	  (Mask) = 0x03;						      \
	}								      \
      else if (((Char) & 0xfe) == 0xfc)					      \
	{								      \
	  (Len) = 6;							      \
	  (Mask) = 0x01;						      \
	}								      \
      else								      \
	(Len) = -1;							      \
    }									      \
  while (0)
/* Number of bytes (1..6) used to encode the code point Char in UTF-8.
   Char is evaluated more than once.  */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))
/* Decode the Len-byte UTF-8 sequence at Chars into Result, using Mask
   (as produced by UTF8_COMPUTE) to strip the length bits from the lead
   byte.  Count is a scratch loop counter.  If any trailing byte is not
   of the form 10xxxxxx, Result is set to -1 and decoding stops at that
   byte.  The do/while wrapper makes the expansion a single statement.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
/* Nonzero if Char is below 0x110000, outside the surrogate range
   0xD800..0xDFFF, and neither 0xFFFE nor 0xFFFF.  Char is evaluated
   more than once.  */
#define UNICODE_VALID(Char)                   \
  ((Char) < 0x110000 &&                       \
   ((Char) < 0xD800 || (Char) >= 0xE000) &&   \
   (Char) != 0xFFFE && (Char) != 0xFFFF)
/* Length in bytes of the UTF-8 sequence introduced by each possible
   lead byte.  Bytes that cannot start a sequence (0x80..0xBF, 0xFE,
   0xFF) map to 1 so that a scanner always makes progress.  */
static const char utf8_skip_data[256] = {
  /* 0x00..0xBF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0..0xDF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0..0xEF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0..0xFF */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

static const char *const g_utf8_skip = utf8_skip_data;

/* Pointer to the byte following the UTF-8 sequence that starts at p,
   judged from the lead byte alone.  The trailing bytes are not
   inspected, so on malformed input the result may lie beyond the end of
   the string.  */
#define g_utf8_next_char(p) \
  (char *)((p) + g_utf8_skip[*(const unsigned char *)(p)])
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * Only the lead byte and the 10xxxxxx form of the trailing bytes are
 * checked; overlong forms and out-of-range values are not detected.
 *
 * Return value: the resulting character, or -1 if the first byte
 * cannot start a UTF-8 sequence or a trailing byte is malformed.
 **/
long
stringprep_utf8_to_unichar (const char *p)
{
  int i, mask = 0, len;
  long result;
  unsigned char c = (unsigned char) *p;

  UTF8_COMPUTE (c, mask, len);
  if (len == -1)
    return (long) -1;
  UTF8_GET (result, p, i, mask, len);

  return result;
}
/* Look up Char within page Page of the combining class tables.  A page
   entry of at least G_UNICODE_MAX_TABLE_INDEX stores the value for the
   whole page directly (offset by that constant); any other entry is a
   row index into cclass_data.  */
#define CC(Page, Char) \
  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table[Page]][Char]))

/* Combining class of the code point Char (a signed integer expression).
   Values outside 0..G_UNICODE_LAST_CHAR yield 0; the lower bound matters
   because stringprep_utf8_to_unichar() reports malformed input as -1,
   which would otherwise index the page table out of bounds.  */
#define COMBINING_CLASS(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CC((Char) >> 8, (Char) & 0xff))
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  See the Unicode
 * manual for more information.
 *
 * Characters of combining class 0 are never moved, and no character is
 * moved across one.
 **/
static void
g_unicode_canonical_ordering (long *string, size_t len)
{
  size_t i;
  int swap = 1;

  /* Nothing to reorder; this also keeps the unsigned "len - 1" below
     from wrapping around when len is 0.  */
  if (len < 2)
    return;

  while (swap)
    {
      int last;
      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
	{
	  int next = COMBINING_CLASS (string[i + 1]);
	  if (next != 0 && last > next)
	    {
	      size_t j;
	      /* Percolate item leftward through string.  The index runs
		 down to 1 so that the element at position 0 takes part
		 in the exchange as well.  */
	      for (j = i + 1; j > 0; --j)
		{
		  long t;
		  if (COMBINING_CLASS (string[j - 1]) <= next)
		    break;
		  t = string[j];
		  string[j] = string[j - 1];
		  string[j - 1] = t;
		  swap = 1;
		}
	      /* We're re-entering the loop looking at the old
		 character again.  */
	      next = last;
	    }
	  last = next;
	}
    }
}
224 static const unsigned char *
225 find_decomposition (long ch, int compat)
227 int start = 0;
228 int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
230 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
232 while (1)
234 int half = (start + end) / 2;
235 if (ch == decomp_table[half].ch)
237 int offset;
239 if (compat)
241 offset = decomp_table[half].compat_offset;
242 if (offset == 0xff)
243 offset = decomp_table[half].canon_offset;
245 else
247 offset = decomp_table[half].canon_offset;
248 if (offset == 0xff)
249 return NULL;
252 return
253 &(decomp_expansion_string
254 [decomp_table[half].expansion_offset + offset]);
256 else if (half == start)
257 break;
258 else if (ch > decomp_table[half].ch)
259 start = half;
260 else
261 end = half;
265 return NULL;
/* Look up Char within page Page of the composition tables.  A page
   entry of at least G_UNICODE_MAX_TABLE_INDEX stores the value for the
   whole page directly (offset by that constant); any other entry is a
   row index into compose_data.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of the code point Char (a signed integer
   expression).  Values outside 0..G_UNICODE_LAST_CHAR yield 0; the
   lower bound keeps a negative value (such as the -1 used to report
   malformed UTF-8) from indexing the page table out of bounds.  */
#define COMPOSE_INDEX(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CI((Char) >> 8, (Char) & 0xff))
276 static int
277 combine (long a, long b, long *result)
279 int index_a, index_b;
281 index_a = COMPOSE_INDEX (a);
282 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
284 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
286 *result =
287 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
288 return 1;
290 else
291 return 0;
294 index_b = COMPOSE_INDEX (b);
295 if (index_b >= COMPOSE_SECOND_SINGLE_START)
297 if (a ==
298 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
300 *result =
301 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
302 return 1;
304 else
305 return 0;
308 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
309 && index_b >= COMPOSE_SECOND_START
310 && index_a < COMPOSE_SECOND_SINGLE_START)
312 long res =
313 compose_array[index_a - COMPOSE_FIRST_START][index_b -
314 COMPOSE_SECOND_START];
316 if (res)
318 *result = res;
319 return 1;
323 return 0;
326 static long *
327 _g_utf8_normalize_wc (const char *str, int max_len, GNormalizeMode mode)
329 size_t n_wc;
330 long *wc_buffer;
331 const char *p;
332 size_t last_start;
333 int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
334 int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
336 n_wc = 0;
337 p = str;
338 while ((max_len < 0 || p < str + max_len) && *p)
340 long wc = stringprep_utf8_to_unichar (p);
342 const unsigned char *decomp = find_decomposition (wc, do_compat);
344 if (decomp)
346 int len;
347 /* We store as a double-nul terminated string. */
348 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
350 n_wc += len / 2;
352 else
353 n_wc++;
355 p = g_utf8_next_char (p);
358 wc_buffer = malloc (sizeof (long) * (n_wc + 1));
360 last_start = 0;
361 n_wc = 0;
362 p = str;
363 while ((max_len < 0 || p < str + max_len) && *p)
365 long wc = stringprep_utf8_to_unichar (p);
366 const unsigned char *decomp;
367 int cc;
368 size_t old_n_wc = n_wc;
370 decomp = find_decomposition (wc, do_compat);
372 if (decomp)
374 int len;
375 /* We store as a double-nul terminated string. */
376 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
377 wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
379 else
380 wc_buffer[n_wc++] = wc;
382 if (n_wc > 0)
384 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
386 if (cc == 0)
388 g_unicode_canonical_ordering (wc_buffer + last_start,
389 n_wc - last_start);
390 last_start = old_n_wc;
394 p = g_utf8_next_char (p);
397 if (n_wc > 0)
399 g_unicode_canonical_ordering (wc_buffer + last_start,
400 n_wc - last_start);
401 last_start = n_wc;
404 wc_buffer[n_wc] = 0;
406 /* All decomposed and reordered */
409 if (do_compose && n_wc > 0)
411 size_t i, j;
412 int last_cc = 0;
413 last_start = 0;
415 for (i = 0; i < n_wc; i++)
417 int cc = COMBINING_CLASS (wc_buffer[i]);
419 if (i > 0 &&
420 (last_cc == 0 || last_cc != cc) &&
421 combine (wc_buffer[last_start], wc_buffer[i],
422 &wc_buffer[last_start]))
424 for (j = i + 1; j < n_wc; j++)
425 wc_buffer[j - 1] = wc_buffer[j];
426 n_wc--;
427 i--;
429 if (i == last_start)
430 last_cc = 0;
431 else
432 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
434 continue;
437 if (cc == 0)
438 last_start = i;
440 last_cc = cc;
444 wc_buffer[n_wc] = 0;
446 return wc_buffer;
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  No terminating zero byte is
 * written.
 *
 * Return value: number of bytes written
 **/
int
stringprep_unichar_to_utf8 (long c, char *outbuf)
{
  int len = 0;
  int first;
  int i;

  /* FIRST holds the length marker bits of the lead byte.  */
  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
    {
      /* Fill the trailing bytes from the end, six payload bits each;
	 whatever remains of C goes into the lead byte.  */
      for (i = len - 1; i > 0; --i)
	{
	  outbuf[i] = (char) ((c & 0x3f) | 0x80);
	  c >>= 6;
	}
      outbuf[0] = (char) (c | first);
    }

  return len;
}
/* Length in bytes (1..6) of the UTF-8 sequence announced by the lead
   byte LEAD.  Bytes that cannot start a sequence (0x80..0xBF, 0xFE,
   0xFF) count as a sequence of one byte.  */
static size_t
ucs4_utf8_seq_len (unsigned char lead)
{
  if (lead < 0xc0)
    return 1;
  if (lead < 0xe0)
    return 2;
  if (lead < 0xf0)
    return 3;
  if (lead < 0xf8)
    return 4;
  if (lead < 0xfc)
    return 5;
  if (lead < 0xfe)
    return 6;
  return 1;
}

/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width
 * representation as UCS-4, assuming valid UTF-8 input.
 * Trailing bytes are not validated.  A byte that cannot start a
 * sequence, or a sequence cut short by the end of the input, is
 * converted to U+FFFD; the function never reads beyond the
 * terminating zero byte or beyond @len bytes.
 *
 * Return value: a pointer to a newly allocated, zero-terminated UCS-4
 *               string, or %NULL if memory could not be allocated (in
 *               which case @items_written is not modified).  This value
 *               must be freed with free().
 **/
long *
stringprep_utf8_to_ucs4 (const char *str, int len, int *items_written)
{
  const unsigned char *s = (const unsigned char *) str;
  size_t n_bytes = 0;
  size_t n_chars = 0;
  size_t pos, i;
  long *result;

  /* Establish how many bytes may be looked at.  */
  while ((len < 0 || n_bytes < (size_t) len) && s[n_bytes])
    n_bytes++;

  /* Count the characters, using the same sequence lengths as the
     decoding loop below so that the two can never disagree.  */
  for (pos = 0; pos < n_bytes; n_chars++)
    {
      size_t want = ucs4_utf8_seq_len (s[pos]);
      size_t left = n_bytes - pos;

      pos += (want < left) ? want : left;
    }

  result = malloc (sizeof (long) * (n_chars + 1));
  if (!result)
    return NULL;

  pos = 0;
  for (i = 0; i < n_chars; i++)
    {
      unsigned char lead = s[pos];
      size_t want = ucs4_utf8_seq_len (lead);
      size_t left = n_bytes - pos;
      long wc;

      if (lead < 0x80)
	wc = lead;
      else if (want == 1 || want > left)
	{
	  /* Invalid lead byte or truncated sequence.  */
	  wc = 0xFFFD;
	  if (want > left)
	    want = left;
	}
      else
	{
	  size_t j;

	  /* A lead byte of a WANT-byte sequence carries 7 - WANT
	     payload bits.  */
	  wc = lead & (0x7f >> want);
	  for (j = 1; j < want; j++)
	    {
	      wc <<= 6;
	      wc |= s[pos + j] & 0x3f;
	    }
	}

      result[i] = wc;
      pos += want;
    }
  result[i] = 0;

  if (items_written)
    *items_written = (int) i;

  return result;
}
/* This one is kept around for binary backwards compatibility with
   library version CURRENT=1.  It simply forwards its arguments to
   stringprep_utf8_to_ucs4() and returns that function's result.  */
long *
stringprep_utf8_to_ucs4_fast (const char *str, int len, int *items_written)
{
  return stringprep_utf8_to_ucs4 (str, len, items_written);
}
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 *              If an invalid character is met, this is its index.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.  It is only stored on success.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free().  If @str
 *               contains a value that is negative or above 0x7fffffff,
 *               or if memory cannot be allocated, %NULL is returned.
 **/
char *
stringprep_ucs4_to_utf8 (const long *str,
			 int len, int *items_read, int *items_written)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  /* First pass: validate and add up the encoded length.  */
  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
	break;

      /* The unsigned comparison rejects negative values as well,
	 whatever the width of long.  */
      if ((unsigned long) str[i] > 0x7fffffffUL)
	goto err_out;

      result_length += UTF8_LENGTH (str[i]);
    }

  result = malloc (result_length + 1);
  if (!result)
    {
      i = 0;			/* nothing was converted */
      goto err_out;
    }
  p = result;

  /* Second pass: encode until the computed length is reached.  */
  i = 0;
  while (p < result + result_length)
    p += stringprep_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
687 * g_utf8_normalize:
688 * @str: a UTF-8 encoded string.
689 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
690 * @mode: the type of normalization to perform.
692 * Converts a string into canonical form, standardizing
693 * such issues as whether a character with an accent
694 * is represented as a base character and combining
695 * accent or as a single precomposed character. You
696 * should generally call g_utf8_normalize() before
697 * comparing two Unicode strings.
699 * The normalization mode %G_NORMALIZE_DEFAULT only
700 * standardizes differences that do not affect the
701 * text content, such as the above-mentioned accent
702 * representation. %G_NORMALIZE_ALL also standardizes
703 * the "compatibility" characters in Unicode, such
704 * as SUPERSCRIPT THREE to the standard forms
705 * (in this case DIGIT THREE). Formatting information
706 * may be lost but for most text operations such
707 * characters should be considered the same.
708 * For example, g_utf8_collate() normalizes
709 * with %G_NORMALIZE_ALL as its first step.
711 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
712 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
713 * but returned a result with composed forms rather
714 * than a maximally decomposed form. This is often
715 * useful if you intend to convert the string to
716 * a legacy encoding or pass it to a system with
717 * less capable Unicode handling.
719 * Return value: a newly allocated string, that is the
720 * normalized form of @str.
722 static char *
723 g_utf8_normalize (const char *str, int len, GNormalizeMode mode)
725 long *result_wc = _g_utf8_normalize_wc (str, len, mode);
726 char *result;
728 result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
729 free (result_wc);
731 return result;
734 char *
735 stringprep_utf8_nfkc_normalize (const char *str, int len)
737 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
740 long *
741 stringprep_ucs4_nfkc_normalize (long *str, int len)
743 char *p;
744 long *result_wc;
746 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
747 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
748 free (p);
750 return result_wc;