1 /* nfkc.c unicode normalization utilities
2 * Copyright (C) 2002 Simon Josefsson
4 * This file is part of libstringprep.
6 * Libstringprep is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libstringprep is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with libstringprep; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 /* This file contains functions from GLIB including gutf8.c and
27 * gunidecomp.c, all with the following license.
29 * Copyright (C) 1999, 2000 Tom Tromey
30 * Copyright 2000 Red Hat, Inc.
32 * The Gnome Library is free software; you can redistribute it and/or
33 * modify it under the terms of the GNU Lesser General Public License as
34 * published by the Free Software Foundation; either version 2 of the
35 * License, or (at your option) any later version.
37 * The Gnome Library is distributed in the hope that it will be useful,
38 * but WITHOUT ANY WARRANTY; without even the implied warranty of
39 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
40 * Lesser General Public License for more details.
42 * You should have received a copy of the GNU Lesser General Public
43 * License along with the Gnome Library; see the file COPYING.LIB. If not,
44 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
45 * Boston, MA 02111-1307, USA.
51 G_NORMALIZE_NFD
= G_NORMALIZE_DEFAULT
,
52 G_NORMALIZE_DEFAULT_COMPOSE
,
53 G_NORMALIZE_NFC
= G_NORMALIZE_DEFAULT_COMPOSE
,
55 G_NORMALIZE_NFKD
= G_NORMALIZE_ALL
,
56 G_NORMALIZE_ALL_COMPOSE
,
57 G_NORMALIZE_NFKC
= G_NORMALIZE_ALL_COMPOSE
61 #include "gunidecomp.h"
66 #define UTF8_COMPUTE(Char, Mask, Len) \
72 else if ((Char & 0xe0) == 0xc0) \
77 else if ((Char & 0xf0) == 0xe0) \
82 else if ((Char & 0xf8) == 0xf0) \
87 else if ((Char & 0xfc) == 0xf8) \
92 else if ((Char & 0xfe) == 0xfc) \
/* UTF8_LENGTH:
 * @Char: a character code.  The argument is evaluated up to five times,
 *        so pass an expression without side effects.
 *
 * Number of bytes needed to store @Char in UTF-8 using the 1-6 byte
 * scheme: 1 below 0x80, 2 below 0x800, 3 below 0x10000, 4 below
 * 0x200000, 5 below 0x4000000 and 6 for anything else.
 */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))
108 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
109 (Result) = (Chars)[0] & (Mask); \
110 for ((Count) = 1; (Count) < (Len); ++(Count)) \
112 if (((Chars)[(Count)] & 0xc0) != 0x80) \
118 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* UNICODE_VALID:
 * @Char: a character code.  The argument is evaluated several times,
 *        so pass an expression without side effects.
 *
 * Non-zero when @Char lies in 0 .. 0x10FFFF, is not in the surrogate
 * range 0xD800 .. 0xDFFF, and is neither 0xFFFE nor 0xFFFF.
 *
 * The explicit lower bound matters because this file carries code points
 * in signed `long' variables: without it a negative value would satisfy
 * every comparison below and be reported as valid.  (For an unsigned
 * argument the lower bound is simply always true.)
 */
#define UNICODE_VALID(Char) \
  ((Char) >= 0 && \
   (Char) < 0x110000 && \
   ((Char) < 0xD800 || (Char) >= 0xE000) && \
   (Char) != 0xFFFE && (Char) != 0xFFFF)
126 static const char utf8_skip_data
[256] = {
127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
133 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
135 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
137 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
139 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
141 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
144 static const char *const g_utf8_skip
= utf8_skip_data
;
/* g_utf8_next_char:
 * @p: pointer to the first byte of a UTF-8 encoded character.  The
 *     argument is evaluated twice.
 *
 * Evaluates to a `char *' equal to @p advanced by the g_utf8_skip[]
 * entry of the byte stored at @p.
 *
 * The whole expansion is parenthesized so that a postfix operator written
 * after the macro call applies to the resulting pointer, not to the
 * operand of the cast.  The byte is read through a const-qualified
 * pointer so that looking it up does not strip `const' from @p.
 */
#define g_utf8_next_char(p) \
  ((char *) ((p) + g_utf8_skip[*(const unsigned char *) (p)]))
149 * stringprep_utf8_to_unichar:
150 * @p: a pointer to Unicode character encoded as UTF-8
152 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
153 * If @p does not point to a valid UTF-8 encoded character, results are undefined.
156 * Return value: the resulting character
159 stringprep_utf8_to_unichar (const char *p
)
161 int i
, mask
= 0, len
;
163 unsigned char c
= (unsigned char) *p
;
165 UTF8_COMPUTE (c
, mask
, len
);
168 UTF8_GET (result
, p
, i
, mask
, len
);
/* CC:
 * @Page: high part of the character code (code >> 8)
 * @Char: low byte of the character code (code & 0xff)
 *
 * Two-level table lookup.  A combining_class_table[] entry that is at
 * least G_UNICODE_MAX_TABLE_INDEX describes a whole page sharing a single
 * value (the entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry
 * selects the row of cclass_data[] that is then indexed by @Char.
 */
#define CC(Page, Char) \
  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table[Page]][Char]))

/* COMBINING_CLASS:
 * @Char: a character code.  The argument is evaluated several times.
 *
 * Value looked up for @Char via CC(), or 0 when @Char is outside
 * 0 .. G_UNICODE_LAST_CHAR.  The lower bound is checked explicitly
 * because code points are held in signed `long' here; a negative value
 * would otherwise pass the upper-bound test and index
 * combining_class_table[] with a negative page number.
 */
#define COMBINING_CLASS(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CC ((Char) >> 8, (Char) & 0xff))
182 * g_unicode_canonical_ordering:
183 * @string: a UCS-4 encoded string.
184 * @len: the maximum length of @string to use.
186 * Computes the canonical ordering of a string in-place.
187 * This rearranges decomposed characters in the string
188 * according to their combining classes. See the Unicode
189 * manual for more information.
192 g_unicode_canonical_ordering (long *string
, size_t len
)
201 last
= COMBINING_CLASS (string
[0]);
202 for (i
= 0; i
< len
- 1; ++i
)
204 int next
= COMBINING_CLASS (string
[i
+ 1]);
205 if (next
!= 0 && last
> next
)
208 /* Percolate item leftward through string. */
209 for (j
= i
; j
> 0; --j
)
212 if (COMBINING_CLASS (string
[j
]) <= next
)
215 string
[j
+ 1] = string
[j
];
219 /* We're re-entering the loop looking at the old
228 static const unsigned char *
229 find_decomposition (long ch
, int compat
)
232 int end
= sizeof (decomp_table
) / sizeof ((decomp_table
)[0]);
234 if (ch
>= decomp_table
[start
].ch
&& ch
<= decomp_table
[end
- 1].ch
)
238 int half
= (start
+ end
) / 2;
239 if (ch
== decomp_table
[half
].ch
)
245 offset
= decomp_table
[half
].compat_offset
;
247 offset
= decomp_table
[half
].canon_offset
;
251 offset
= decomp_table
[half
].canon_offset
;
257 &(decomp_expansion_string
258 [decomp_table
[half
].expansion_offset
+ offset
]);
260 else if (half
== start
)
262 else if (ch
> decomp_table
[half
].ch
)
/* CI:
 * @Page: high part of the character code (code >> 8)
 * @Char: low byte of the character code (code & 0xff)
 *
 * Two-level table lookup.  A compose_table[] entry that is at least
 * G_UNICODE_MAX_TABLE_INDEX describes a whole page sharing a single
 * value (the entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry
 * selects the row of compose_data[] that is then indexed by @Char.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* COMPOSE_INDEX:
 * @Char: a character code.  The argument is evaluated several times.
 *
 * Value looked up for @Char via CI(), or 0 when @Char is outside
 * 0 .. G_UNICODE_LAST_CHAR.  The lower bound is checked explicitly
 * because code points are held in signed `long' here; a negative value
 * would otherwise pass the upper-bound test and index compose_table[]
 * with a negative page number.
 */
#define COMPOSE_INDEX(Char) \
  (((Char) < 0 || (Char) > (G_UNICODE_LAST_CHAR)) \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
281 combine (long a
, long b
, long *result
)
283 int index_a
, index_b
;
285 index_a
= COMPOSE_INDEX (a
);
286 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
288 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
291 compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
298 index_b
= COMPOSE_INDEX (b
);
299 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
302 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
305 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
312 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
313 && index_b
>= COMPOSE_SECOND_START
314 && index_a
< COMPOSE_SECOND_SINGLE_START
)
317 compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
-
318 COMPOSE_SECOND_START
];
331 _g_utf8_normalize_wc (const char *str
, int max_len
, GNormalizeMode mode
)
337 int do_compat
= (mode
== G_NORMALIZE_NFKC
|| mode
== G_NORMALIZE_NFKD
);
338 int do_compose
= (mode
== G_NORMALIZE_NFC
|| mode
== G_NORMALIZE_NFKC
);
342 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
344 long wc
= stringprep_utf8_to_unichar (p
);
346 const unsigned char *decomp
= find_decomposition (wc
, do_compat
);
351 /* We store as a double-nul terminated string. */
352 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
359 p
= g_utf8_next_char (p
);
362 wc_buffer
= malloc (sizeof (long) * (n_wc
+ 1));
367 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
369 long wc
= stringprep_utf8_to_unichar (p
);
370 const unsigned char *decomp
;
372 size_t old_n_wc
= n_wc
;
374 decomp
= find_decomposition (wc
, do_compat
);
379 /* We store as a double-nul terminated string. */
380 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
381 wc_buffer
[n_wc
++] = (decomp
[len
] << 8 | decomp
[len
+ 1]);
384 wc_buffer
[n_wc
++] = wc
;
388 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
392 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
394 last_start
= old_n_wc
;
398 p
= g_utf8_next_char (p
);
403 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
410 /* All decomposed and reordered */
413 if (do_compose
&& n_wc
> 0)
419 for (i
= 0; i
< n_wc
; i
++)
421 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
424 (last_cc
== 0 || last_cc
!= cc
) &&
425 combine (wc_buffer
[last_start
], wc_buffer
[i
],
426 &wc_buffer
[last_start
]))
428 for (j
= i
+ 1; j
< n_wc
; j
++)
429 wc_buffer
[j
- 1] = wc_buffer
[j
];
436 last_cc
= COMBINING_CLASS (wc_buffer
[i
- 1]);
455 * @c: an ISO 10646 character code
456 * @outbuf: output buffer, must have at least 6 bytes of space.
457 * If %NULL, the length will be computed and returned
458 * and nothing will be written to @outbuf.
460 * Converts a single character to UTF-8.
462 * Return value: number of bytes written
465 stringprep_unichar_to_utf8 (long c
, char *outbuf
)
481 else if (c
< 0x10000)
486 else if (c
< 0x200000)
491 else if (c
< 0x4000000)
504 for (i
= len
- 1; i
> 0; --i
)
506 outbuf
[i
] = (c
& 0x3f) | 0x80;
509 outbuf
[0] = c
| first
;
516 * stringprep_utf8_to_ucs4:
517 * @str: a UTF-8 encoded string
518 * @len: the maximum length of @str to use. If @len < 0, then
519 * the string is nul-terminated.
520 * @items_written: location to store the number of characters in the
523 * Convert a string from UTF-8 to a 32-bit fixed width
524 * representation as UCS-4, assuming valid UTF-8 input.
525 * This function does no error checking on the input.
527 * Return value: a pointer to a newly allocated UCS-4 string.
528 * This value must be freed with free().
531 stringprep_utf8_to_ucs4 (const char *str
, int len
, int *items_written
)
544 p
= g_utf8_next_char (p
);
550 while (p
< str
+ len
&& *p
)
552 p
= g_utf8_next_char (p
);
557 result
= malloc (sizeof (long) * (n_chars
+ 1));
560 for (i
= 0; i
< n_chars
; i
++)
562 long wc
= ((unsigned char *) p
)[0];
597 for (j
= 1; j
< charlen
; j
++)
600 wc
|= ((unsigned char *) p
)[j
] & 0x3f;
615 /* This one is kept around for binary backwards compatibility with
616 library version CURRENT=1. */
618 stringprep_utf8_to_ucs4_fast (const char *str
, int len
, int *items_written
)
620 return stringprep_utf8_to_ucs4 (str
, len
, items_written
);
626 * @str: a UCS-4 encoded string
627 * @len: the maximum length of @str to use. If @len < 0, then
628 * the string is terminated with a 0 character.
629 * @items_read: location to store number of characters read, or %NULL.
630 * @items_written: location to store number of bytes written or %NULL.
631 * The value here stored does not include the trailing 0
633 * @error: location to store the error occurring, or %NULL to ignore
634 * errors. Any of the errors in #GConvertError other than
635 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
637 * Convert a string from a 32-bit fixed width representation as UCS-4
638 * to UTF-8. The result will be terminated with a 0 byte.
640 * Return value: a pointer to a newly allocated UTF-8 string.
641 * This value must be freed with free(). If an
642 * error occurs, %NULL will be returned and
646 stringprep_ucs4_to_utf8 (const long *str
,
647 int len
, int *items_read
, int *items_written
)
655 for (i
= 0; len
< 0 || i
< len
; i
++)
660 if (str
[i
] >= 0x80000000)
668 result_length
+= UTF8_LENGTH (str
[i
]);
671 result
= malloc (result_length
+ 1);
675 while (p
< result
+ result_length
)
676 p
+= stringprep_unichar_to_utf8 (str
[i
++], p
);
681 *items_written
= p
- result
;
692 * @str: a UTF-8 encoded string.
693 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
694 * @mode: the type of normalization to perform.
696 * Converts a string into canonical form, standardizing
697 * such issues as whether a character with an accent
698 * is represented as a base character and combining
699 * accent or as a single precomposed character. You
700 * should generally call g_utf8_normalize() before
701 * comparing two Unicode strings.
703 * The normalization mode %G_NORMALIZE_DEFAULT only
704 * standardizes differences that do not affect the
705 * text content, such as the above-mentioned accent
706 * representation. %G_NORMALIZE_ALL also standardizes
707 * the "compatibility" characters in Unicode, such
708 * as SUPERSCRIPT THREE to the standard forms
709 * (in this case DIGIT THREE). Formatting information
710 * may be lost but for most text operations such
711 * characters should be considered the same.
712 * For example, g_utf8_collate() normalizes
713 * with %G_NORMALIZE_ALL as its first step.
715 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
716 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
717 * but return a result with composed forms rather
718 * than a maximally decomposed form. This is often
719 * useful if you intend to convert the string to
720 * a legacy encoding or pass it to a system with
721 * less capable Unicode handling.
723 * Return value: a newly allocated string, that is the
724 * normalized form of @str.
727 g_utf8_normalize (const char *str
, int len
, GNormalizeMode mode
)
729 long *result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
732 result
= stringprep_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
);
739 stringprep_utf8_nfkc_normalize (const char *str
, int len
)
741 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);
745 stringprep_ucs4_nfkc_normalize (long *str
, int len
)
750 p
= stringprep_ucs4_to_utf8 (str
, len
, 0, 0);
751 result_wc
= _g_utf8_normalize_wc (p
, -1, G_NORMALIZE_NFKC
);