AC_PROG_RANLIB obsoleted by libtool.
[libidn.git] / nfkc.c
blobe552d04eb9bed1e0c9bd8d6c7d8a398af8e85d0a
/* nfkc.c	unicode normalization utilities
 * Copyright (C) 2002  Simon Josefsson
 *
 * This file is part of libstringprep.
 *
 * Libstringprep is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libstringprep is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libstringprep; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
/* This file contains functions from GLIB including gutf8.c and
 * gunidecomp.c, all with the following license.
 *
 * Copyright (C) 1999, 2000 Tom Tromey
 * Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/* Normalization forms selectable in this file.  Each G_NORMALIZE_NF* name
   is an alias for the enumerator listed immediately before it, so the four
   distinct values are 0 (NFD), 1 (NFC), 2 (NFKD) and 3 (NFKC).  */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;

61 #include "gunidecomp.h"
62 #include "gunicomp.h"
64 #include <stdlib.h>
/* Classify the UTF-8 lead byte Char.  On return Len holds the length in
   bytes (1-6) of the sequence that Char introduces and Mask holds the mask
   that selects the payload bits of the lead byte.  If Char cannot start a
   sequence, Len is set to -1 and Mask is left untouched.

   Char is evaluated more than once, so pass a side-effect free expression.
   The body is wrapped in do/while (0) so the macro behaves as exactly one
   statement, e.g. as the unbraced branch of an if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/* Number of bytes (1-6) needed to encode code point Char in UTF-8.
   Char is evaluated more than once.  */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the Len-byte UTF-8 sequence at Chars into Result.  Mask selects
   the payload bits of the lead byte (as produced by UTF8_COMPUTE) and Count
   is a scratch integer lvalue used as the byte index.  If a trailing byte
   does not have the form 10xxxxxx, Result is set to -1 and decoding stops.

   Wrapped in do/while (0) so the macro expands to exactly one statement;
   the "break" below leaves the inner for loop only.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/* True when Char is below 0x110000, lies outside the surrogate range
   0xD800-0xDFFF, and is neither 0xFFFE nor 0xFFFF.  Char is evaluated more
   than once.  */
#define UNICODE_VALID(Char) \
  ((Char) < 0x110000 && \
   ((Char) < 0xD800 || (Char) >= 0xE000) && \
   (Char) != 0xFFFE && (Char) != 0xFFFF)

/* Number of bytes to step over for each possible first byte of a UTF-8
   sequence.  Bytes 0x80-0xBF and 0xFE/0xFF, which cannot start a sequence,
   are given a step of 1.  */
static const char utf8_skip_data[256] = {
  /* 0x00-0xBF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0-0xDF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0-0xEF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0-0xFF */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias through which the table above is indexed.  */
static const char *const g_utf8_skip = utf8_skip_data;

/* Pointer to the position reached by stepping over the byte at p by the
   amount the skip table gives for it.  p is evaluated twice.  The result is
   a plain "char *" even when p points to const data.  */
#define g_utf8_next_char(p) \
  ((char *) ((p) + g_utf8_skip[*(const unsigned char *) (p)]))

/**
 * g_utf8_get_char:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * The sequence length is taken from the first byte alone; the function
 * reads that many bytes from @p without knowing where the buffer ends, so
 * @p must point to a complete sequence.
 *
 * Return value: the resulting character, or -1 if the first byte cannot
 * start a sequence or a following byte is not a continuation byte.
 **/
static long
g_utf8_get_char (const char *p)
{
  int i, mask = 0, len;
  long result;
  unsigned char c = (unsigned char) *p;

  UTF8_COMPUTE (c, mask, len);
  if (len == -1)
    return (long) -1;
  UTF8_GET (result, p, i, mask, len);

  return result;
}

/* Look up entry Char (0-255) of 256-entry page Page.  A page table value
   at or above G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the
   whole page (value minus G_UNICODE_MAX_TABLE_INDEX); any other value is a
   row index into cclass_data.  */
#define CC(Page, Char) \
  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table[Page]][Char]))

/* Combining class of code point Char.  Values outside
   0..G_UNICODE_LAST_CHAR yield 0.  The lower bound matters because
   g_utf8_get_char() reports malformed input as -1, which would otherwise
   index the page table with a negative page number.  Char is evaluated
   more than once.  */
#define COMBINING_CLASS(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CC((Char) >> 8, (Char) & 0xff))

/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  See the Unicode
 * manual for more information.
 *
 * Characters of combining class 0 are never moved, and a character is
 * never moved past one whose class is less than or equal to its own.
 * Strings shorter than two characters are left alone.
 **/
static void
g_unicode_canonical_ordering (long *string, size_t len)
{
  size_t i;
  int swap = 1;

  /* Nothing to reorder.  This also keeps the unsigned "len - 1" below
     from wrapping around when len is 0.  */
  if (len < 2)
    return;

  /* Repeat full passes until one completes without exchanging anything.  */
  while (swap)
    {
      int last;
      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);
          if (next != 0 && last > next)
            {
              size_t j;
              /* Percolate item leftward through string.  j is the item's
                 current position; comparing against string[j - 1] lets
                 it travel all the way to index 0.  */
              for (j = i + 1; j > 0; --j)
                {
                  long t;
                  if (COMBINING_CLASS (string[j - 1]) <= next)
                    break;
                  t = string[j];
                  string[j] = string[j - 1];
                  string[j - 1] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again.  */
              next = last;
            }
          last = next;
        }
    }
}

230 static const unsigned char *
231 find_decomposition (long ch, int compat)
233 int start = 0;
234 int end = sizeof (decomp_table) / sizeof ((decomp_table)[0]);
236 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
238 while (1)
240 int half = (start + end) / 2;
241 if (ch == decomp_table[half].ch)
243 int offset;
245 if (compat)
247 offset = decomp_table[half].compat_offset;
248 if (offset == 0xff)
249 offset = decomp_table[half].canon_offset;
251 else
253 offset = decomp_table[half].canon_offset;
254 if (offset == 0xff)
255 return NULL;
258 return
259 &(decomp_expansion_string
260 [decomp_table[half].expansion_offset + offset]);
262 else if (half == start)
263 break;
264 else if (ch > decomp_table[half].ch)
265 start = half;
266 else
267 end = half;
271 return NULL;
/* Look up entry Char (0-255) of 256-entry page Page.  A page table value
   at or above G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the
   whole page (value minus G_UNICODE_MAX_TABLE_INDEX); any other value is a
   row index into compose_data.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point Char.  Values outside
   0..G_UNICODE_LAST_CHAR yield 0; the lower bound keeps a negative code
   point from indexing the page table out of bounds.  Char is evaluated
   more than once.  */
#define COMPOSE_INDEX(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CI((Char) >> 8, (Char) & 0xff))

282 static int
283 combine (long a, long b, long * result)
285 int index_a, index_b;
287 index_a = COMPOSE_INDEX (a);
288 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
290 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
292 *result =
293 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
294 return 1;
296 else
297 return 0;
300 index_b = COMPOSE_INDEX (b);
301 if (index_b >= COMPOSE_SECOND_SINGLE_START)
303 if (a ==
304 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
306 *result =
307 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
308 return 1;
310 else
311 return 0;
314 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
315 && index_b >= COMPOSE_SECOND_START
316 && index_a < COMPOSE_SECOND_SINGLE_START)
318 long res =
319 compose_array[index_a - COMPOSE_FIRST_START][index_b -
320 COMPOSE_SECOND_START];
322 if (res)
324 *result = res;
325 return 1;
329 return 0;
332 static long *
333 _g_utf8_normalize_wc (const char * str, int max_len, GNormalizeMode mode)
335 size_t n_wc;
336 long *wc_buffer;
337 const char *p;
338 size_t last_start;
339 int do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
340 int do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
342 n_wc = 0;
343 p = str;
344 while ((max_len < 0 || p < str + max_len) && *p)
346 long wc = g_utf8_get_char (p);
348 const unsigned char *decomp = find_decomposition (wc, do_compat);
350 if (decomp)
352 int len;
353 /* We store as a double-nul terminated string. */
354 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
356 n_wc += len / 2;
358 else
359 n_wc++;
361 p = g_utf8_next_char (p);
364 wc_buffer = malloc (sizeof(long) * (n_wc + 1));
366 last_start = 0;
367 n_wc = 0;
368 p = str;
369 while ((max_len < 0 || p < str + max_len) && *p)
371 long wc = g_utf8_get_char (p);
372 const unsigned char *decomp;
373 int cc;
374 size_t old_n_wc = n_wc;
376 decomp = find_decomposition (wc, do_compat);
378 if (decomp)
380 int len;
381 /* We store as a double-nul terminated string. */
382 for (len = 0; (decomp[len] || decomp[len + 1]); len += 2)
383 wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
385 else
386 wc_buffer[n_wc++] = wc;
388 if (n_wc > 0)
390 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
392 if (cc == 0)
394 g_unicode_canonical_ordering (wc_buffer + last_start,
395 n_wc - last_start);
396 last_start = old_n_wc;
400 p = g_utf8_next_char (p);
403 if (n_wc > 0)
405 g_unicode_canonical_ordering (wc_buffer + last_start,
406 n_wc - last_start);
407 last_start = n_wc;
410 wc_buffer[n_wc] = 0;
412 /* All decomposed and reordered */
415 if (do_compose && n_wc > 0)
417 size_t i, j;
418 int last_cc = 0;
419 last_start = 0;
421 for (i = 0; i < n_wc; i++)
423 int cc = COMBINING_CLASS (wc_buffer[i]);
425 if (i > 0 &&
426 (last_cc == 0 || last_cc != cc) &&
427 combine (wc_buffer[last_start], wc_buffer[i],
428 &wc_buffer[last_start]))
430 for (j = i + 1; j < n_wc; j++)
431 wc_buffer[j - 1] = wc_buffer[j];
432 n_wc--;
433 i--;
435 if (i == last_start)
436 last_cc = 0;
437 else
438 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
440 continue;
443 if (cc == 0)
444 last_start = i;
446 last_cc = cc;
450 wc_buffer[n_wc] = 0;
452 return wc_buffer;
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  No nul terminator is written.
 *
 * Return value: number of bytes written, or the number of bytes that
 * would be written when @outbuf is %NULL (always between 1 and 6).
 **/
int
stringprep_unichar_to_utf8 (long c, char *outbuf)
{
  int len = 0;
  int first;
  int i;

  /* Choose the sequence length and the marker bits of the lead byte.  */
  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
    {
      /* Emit continuation bytes from last to first, six bits at a time;
         what remains of c goes into the lead byte.  */
      for (i = len - 1; i > 0; --i)
        {
          outbuf[i] = (c & 0x3f) | 0x80;
          c >>= 6;
        }
      outbuf[0] = c | first;
    }

  return len;
}

/**
 * stringprep_utf8_to_ucs4_fast:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a fixed width representation with one
 * array element per character, assuming valid UTF-8 input.  No error
 * checking is done on the input: the length of each sequence is taken
 * from its first byte alone, so malformed input can make the function
 * read beyond the end of @str.
 *
 * Return value: a pointer to a newly allocated, zero-terminated string.
 *               This value must be freed with free().  %NULL is returned,
 *               and @items_written is left untouched, if memory cannot
 *               be allocated.
 **/
long *
stringprep_utf8_to_ucs4_fast (const char *str,
                              int len,
                              int *items_written)
{
  int j, charlen;
  long *result;
  int n_chars, i;
  const char *p;

  /* First pass: count characters to size the result.  */
  p = str;
  n_chars = 0;
  if (len < 0)
    {
      while (*p)
        {
          p = g_utf8_next_char (p);
          ++n_chars;
        }
    }
  else
    {
      while (p < str + len && *p)
        {
          p = g_utf8_next_char (p);
          ++n_chars;
        }
    }

  result = malloc (sizeof (long) * (n_chars + 1));
  if (!result)
    return NULL;

  /* Second pass: decode each character.  */
  p = str;
  for (i = 0; i < n_chars; i++)
    {
      long wc = ((const unsigned char *) p)[0];

      if (wc < 0x80)
        {
          result[i] = wc;
          p++;
        }
      else
        {
          /* The first byte gives the sequence length; keep only its
             payload bits.  */
          if (wc < 0xe0)
            {
              charlen = 2;
              wc &= 0x1f;
            }
          else if (wc < 0xf0)
            {
              charlen = 3;
              wc &= 0x0f;
            }
          else if (wc < 0xf8)
            {
              charlen = 4;
              wc &= 0x07;
            }
          else if (wc < 0xfc)
            {
              charlen = 5;
              wc &= 0x03;
            }
          else
            {
              charlen = 6;
              wc &= 0x01;
            }

          for (j = 1; j < charlen; j++)
            {
              wc <<= 6;
              wc |= ((const unsigned char *) p)[j] & 0x3f;
            }

          result[i] = wc;
          p += charlen;
        }
    }
  result[i] = 0;

  if (items_written)
    *items_written = i;

  return result;
}

/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 *              On failure it receives the number of characters examined
 *              before the failure.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.  It is only stored on success.
 *
 * Convert a string from a fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free().  %NULL is returned
 *               if a character is 0x80000000 or above, or if memory
 *               cannot be allocated.
 **/
char *
stringprep_ucs4_to_utf8 (const long *str,
                         int len, int *items_read, int *items_written)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  /* First pass: validate and add up the encoded length.  */
  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
        break;

      if (str[i] >= 0x80000000)
        goto err_out;           /* err_out records i in *items_read */

      result_length += UTF8_LENGTH (str[i]);
    }

  result = malloc (result_length + 1);
  if (!result)
    goto err_out;
  p = result;

  /* Second pass: encode until the computed length has been produced.  */
  i = 0;
  while (p < result + result_length)
    p += stringprep_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = p - result;

err_out:
  if (items_read)
    *items_read = i;

  return result;
}

687 * g_utf8_normalize:
688 * @str: a UTF-8 encoded string.
689 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
690 * @mode: the type of normalization to perform.
692 * Converts a string into canonical form, standardizing
693 * such issues as whether a character with an accent
694 * is represented as a base character and combining
695 * accent or as a single precomposed character. You
696 * should generally call g_utf8_normalize() before
697 * comparing two Unicode strings.
699 * The normalization mode %G_NORMALIZE_DEFAULT only
700 * standardizes differences that do not affect the
701 * text content, such as the above-mentioned accent
702 * representation. %G_NORMALIZE_ALL also standardizes
703 * the "compatibility" characters in Unicode, such
704 * as SUPERSCRIPT THREE to the standard forms
705 * (in this case DIGIT THREE). Formatting information
706 * may be lost but for most text operations such
707 * characters should be considered the same.
708 * For example, g_utf8_collate() normalizes
709 * with %G_NORMALIZE_ALL as its first step.
711 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
712 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
713 * but returned a result with composed forms rather
714 * than a maximally decomposed form. This is often
715 * useful if you intend to convert the string to
716 * a legacy encoding or pass it to a system with
717 * less capable Unicode handling.
719 * Return value: a newly allocated string, that is the
720 * normalized form of @str.
722 static char *
723 g_utf8_normalize (const char * str, int len, GNormalizeMode mode)
725 long *result_wc = _g_utf8_normalize_wc (str, len, mode);
726 char *result;
728 result = stringprep_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
729 free (result_wc);
731 return result;
734 char *
735 stringprep_utf8_nfkc_normalize (const char *str, int len)
737 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
740 long *
741 stringprep_ucs4_nfkc_normalize (long *str, int len)
743 char *p;
744 long *result_wc;
746 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
747 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
748 free(p);
750 return result_wc;