/* nfkc.c	unicode normalization utilities
 * Copyright (C) 2002  Simon Josefsson
 *
 * This file is part of libstringprep.
 *
 * Libstringprep is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libstringprep is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libstringprep; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all with the following license.
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
/* Normalization modes accepted by g_utf8_normalize() and
   _g_utf8_normalize_wc().  Each G_NORMALIZE_NF* name is an alias for
   the enumerator declared immediately before it, so the enum has four
   distinct values.  The "ALL" modes are the ones the normalizer treats
   as compatibility modes; the "COMPOSE" modes are the ones for which
   it runs the composition pass.  */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
#include <stdlib.h>

#include "gunidecomp.h"
/* Classify the UTF-8 lead byte Char.  On return Len holds the length
   of the sequence it introduces (1..6) and Mask the bits of the lead
   byte that carry payload.  If Char cannot start a sequence, Len is
   set to -1 and Mask is left untouched.  Wrapped in do/while (0) so
   the macro behaves as a single statement.  Char is evaluated more
   than once.  */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
/* Number of bytes (1..6) needed to encode the code point Char in
   UTF-8.  Char is evaluated more than once, so it must be free of
   side effects.  */
#define UTF8_LENGTH(Char) \
  ((Char) >= 0x4000000 ? 6 : \
   ((Char) >= 0x200000 ? 5 : \
    ((Char) >= 0x10000 ? 4 : \
     ((Char) >= 0x800 ? 3 : \
      ((Char) >= 0x80 ? 2 : 1)))))
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
   Mask selects the payload bits of the lead byte (see UTF8_COMPUTE);
   Count is a scratch loop variable.  If a trailing byte does not have
   the 10xxxxxx form, Result is set to -1 and decoding stops.  Wrapped
   in do/while (0) so the macro behaves as a single statement.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
/* Non-zero when Char is below 0x110000, is not in the surrogate range
   0xD800..0xDFFF, and is neither 0xFFFE nor 0xFFFF.  Char is evaluated
   more than once.  */
#define UNICODE_VALID(Char) \
  (!((Char) >= 0x110000 || \
     ((Char) >= 0xD800 && (Char) < 0xE000) || \
     (Char) == 0xFFFE || (Char) == 0xFFFF))
/* Length in bytes of the UTF-8 sequence introduced by each possible
   lead byte.  Bytes 0x00..0xBF map to 1, 0xC0..0xDF to 2, 0xE0..0xEF
   to 3, 0xF0..0xF7 to 4, 0xF8..0xFB to 5 and 0xFC..0xFD to 6.  The
   bytes 0xFE and 0xFF map to 1 so that stepping with
   g_utf8_next_char() always makes forward progress.  */
static const char utf8_skip_data[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias through which the skip table is read.  */
static const char *const g_utf8_skip = utf8_skip_data;

/* Pointer to the character following the one at p, found by adding
   the skip-table entry for the lead byte *p.  The whole expansion is
   parenthesized so the result can be indexed or dereferenced
   directly.  The result is cast to plain char *, so constness of p is
   not preserved.  */
#define g_utf8_next_char(p) \
  ((char *) ((p) + g_utf8_skip[*(unsigned char *) (p)]))
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * The lead byte selects a sequence length of 1 to 6 bytes; each
 * following byte must have the form 10xxxxxx.  Bytes are examined as
 * unsigned values, so the result does not depend on the signedness of
 * plain char.  No check is made for overlong encodings or for code
 * points outside the Unicode range.
 *
 * Return value: the resulting character, or (unsigned long) -1 if the
 * lead byte cannot start a sequence or a following byte is not a
 * continuation byte.
 **/
unsigned long
stringprep_utf8_to_unichar (const char *p)
{
  const unsigned char *s = (const unsigned char *) p;
  unsigned long result;
  int len;
  int i;

  if (s[0] < 0x80)
    {
      len = 1;
      result = s[0];
    }
  else if ((s[0] & 0xe0) == 0xc0)
    {
      len = 2;
      result = s[0] & 0x1f;
    }
  else if ((s[0] & 0xf0) == 0xe0)
    {
      len = 3;
      result = s[0] & 0x0f;
    }
  else if ((s[0] & 0xf8) == 0xf0)
    {
      len = 4;
      result = s[0] & 0x07;
    }
  else if ((s[0] & 0xfc) == 0xf8)
    {
      len = 5;
      result = s[0] & 0x03;
    }
  else if ((s[0] & 0xfe) == 0xfc)
    {
      len = 6;
      result = s[0] & 0x01;
    }
  else
    return (unsigned long) -1;

  for (i = 1; i < len; ++i)
    {
      /* Stops at the first non-continuation byte, which includes the
         terminating NUL of a truncated string.  */
      if ((s[i] & 0xc0) != 0x80)
        return (unsigned long) -1;

      result = (result << 6) | (unsigned long) (s[i] & 0x3f);
    }

  return result;
}
/* Look up the entry for (Page, Char) in the two-level table.  A page
   entry below G_UNICODE_MAX_TABLE_INDEX is an index into cclass_data;
   any other page entry encodes a single value for the whole page,
   offset by G_UNICODE_MAX_TABLE_INDEX.  */
#define CC(Page, Char) \
  ((combining_class_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table[Page]][Char]) \
   : (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Table entry for Char, or 0 when Char lies beyond
   G_UNICODE_LAST_CHAR.  Char is evaluated more than once.  */
#define COMBINING_CLASS(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CC((Char) >> 8, (Char) & 0xff) : 0)
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.
 * This rearranges decomposed characters in the string
 * according to their combining classes.  See the Unicode
 * manual for more information.
 *
 * Strings shorter than two characters are returned unchanged.
 **/
void
g_unicode_canonical_ordering (unsigned long *string, size_t len)
{
  size_t i;
  int swap = 1;

  /* Nothing to reorder.  This also keeps the unsigned "len - 1" below
     from wrapping around when len is 0.  */
  if (len < 2)
    return;

  /* Repeat full passes until one completes without moving anything.  */
  while (swap)
    {
      int last;

      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
        {
          int next = COMBINING_CLASS (string[i + 1]);

          if (next != 0 && last > next)
            {
              size_t j;

              /* Percolate item leftward through string.  */
              for (j = i; j > 0; --j)
                {
                  unsigned long t;

                  if (COMBINING_CLASS (string[j]) <= next)
                    break;
                  t = string[j + 1];
                  string[j + 1] = string[j];
                  string[j] = t;
                  swap = 1;
                }
              /* We're re-entering the loop looking at the old
                 character again.  */
              next = last;
            }
          last = next;
        }
    }
}
224 static const unsigned char *
225 find_decomposition (unsigned long ch
, int compat
)
228 int end
= sizeof (decomp_table
) / sizeof ((decomp_table
)[0]);
230 if (ch
>= decomp_table
[start
].ch
&& ch
<= decomp_table
[end
- 1].ch
)
234 int half
= (start
+ end
) / 2;
235 if (ch
== decomp_table
[half
].ch
)
241 offset
= decomp_table
[half
].compat_offset
;
243 offset
= decomp_table
[half
].canon_offset
;
247 offset
= decomp_table
[half
].canon_offset
;
253 &(decomp_expansion_string
254 [decomp_table
[half
].expansion_offset
+ offset
]);
256 else if (half
== start
)
258 else if (ch
> decomp_table
[half
].ch
)
/* Look up the entry for (Page, Char) in the two-level table.  A page
   entry below G_UNICODE_MAX_TABLE_INDEX is an index into
   compose_data; any other page entry encodes a single value for the
   whole page, offset by G_UNICODE_MAX_TABLE_INDEX.  */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Table entry for Char, or 0 when Char lies beyond
   G_UNICODE_LAST_CHAR.  Char is evaluated more than once.  */
#define COMPOSE_INDEX(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CI((Char) >> 8, (Char) & 0xff) : 0)
277 combine (unsigned long a
, unsigned long b
, unsigned long *result
)
279 int index_a
, index_b
;
281 index_a
= COMPOSE_INDEX (a
);
282 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
284 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
287 compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
294 index_b
= COMPOSE_INDEX (b
);
295 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
298 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
301 compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
308 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
309 && index_b
>= COMPOSE_SECOND_START
310 && index_a
< COMPOSE_SECOND_SINGLE_START
)
313 compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
-
314 COMPOSE_SECOND_START
];
326 static unsigned long *
327 _g_utf8_normalize_wc (const char *str
, int max_len
, GNormalizeMode mode
)
330 unsigned long *wc_buffer
;
333 int do_compat
= (mode
== G_NORMALIZE_NFKC
|| mode
== G_NORMALIZE_NFKD
);
334 int do_compose
= (mode
== G_NORMALIZE_NFC
|| mode
== G_NORMALIZE_NFKC
);
338 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
340 unsigned long wc
= stringprep_utf8_to_unichar (p
);
342 const unsigned char *decomp
= find_decomposition (wc
, do_compat
);
347 /* We store as a double-nul terminated string. */
348 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
355 p
= g_utf8_next_char (p
);
358 wc_buffer
= malloc (sizeof (unsigned long) * (n_wc
+ 1));
363 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
365 unsigned long wc
= stringprep_utf8_to_unichar (p
);
366 const unsigned char *decomp
;
368 size_t old_n_wc
= n_wc
;
370 decomp
= find_decomposition (wc
, do_compat
);
375 /* We store as a double-nul terminated string. */
376 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]); len
+= 2)
377 wc_buffer
[n_wc
++] = (decomp
[len
] << 8 | decomp
[len
+ 1]);
380 wc_buffer
[n_wc
++] = wc
;
384 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
388 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
390 last_start
= old_n_wc
;
394 p
= g_utf8_next_char (p
);
399 g_unicode_canonical_ordering (wc_buffer
+ last_start
,
406 /* All decomposed and reordered */
409 if (do_compose
&& n_wc
> 0)
415 for (i
= 0; i
< n_wc
; i
++)
417 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
420 (last_cc
== 0 || last_cc
!= cc
) &&
421 combine (wc_buffer
[last_start
], wc_buffer
[i
],
422 &wc_buffer
[last_start
]))
424 for (j
= i
+ 1; j
< n_wc
; j
++)
425 wc_buffer
[j
- 1] = wc_buffer
[j
];
432 last_cc
= COMBINING_CLASS (wc_buffer
[i
- 1]);
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  No terminating NUL byte is
 * written.
 *
 * Return value: number of bytes written, or that would be written
 * when @outbuf is %NULL (1 to 6).
 **/
int
stringprep_unichar_to_utf8 (unsigned long c, char *outbuf)
{
  unsigned int len = 0;
  int first;
  int i;

  /* Pick the sequence length and the marker bits of the lead byte.  */
  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
    {
      /* Fill the continuation bytes from the end, six bits each.  */
      for (i = len - 1; i > 0; --i)
        {
          outbuf[i] = (char) (unsigned char) ((c & 0x3f) | 0x80);
          c >>= 6;
        }
      /* Only the low eight bits are kept for the lead byte.  */
      outbuf[0] = (char) (unsigned char) ((c | (unsigned long) first) & 0xff);
    }

  return len;
}
/* Number of bytes occupied by the character starting at P.  The
   nominal length is taken from the lead byte (1..6) and then clipped
   so that the span never crosses END (when END is not NULL) nor a NUL
   byte.  P must point at a non-NUL byte before END.  */
static int
utf8_char_span (const char *p, const char *end)
{
  unsigned char lead = (unsigned char) *p;
  int nominal;
  int n;

  if (lead < 0x80)
    nominal = 1;
  else if (lead < 0xe0)
    nominal = 2;
  else if (lead < 0xf0)
    nominal = 3;
  else if (lead < 0xf8)
    nominal = 4;
  else if (lead < 0xfc)
    nominal = 5;
  else
    nominal = 6;

  for (n = 1; n < nominal; n++)
    if ((end != NULL && p + n >= end) || p[n] == '\0')
      break;

  return n;
}

/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a fixed width representation as
 * UCS-4, assuming valid UTF-8 input.  This function does no error
 * checking on the input, but it never reads beyond the terminating
 * NUL byte or beyond @len bytes: a sequence cut short by either is
 * decoded from the bytes that are present.
 *
 * Return value: a pointer to a newly allocated, zero-terminated UCS-4
 *               string, or %NULL if memory could not be allocated (in
 *               which case @items_written is not modified).  This
 *               value must be freed with free().
 **/
unsigned long *
stringprep_utf8_to_ucs4 (const char *str, int len, int *items_written)
{
  const char *end = (len < 0) ? NULL : str + len;
  const char *p;
  unsigned long *result;
  int n_chars = 0;
  int i;

  /* Count the characters using the same span rule as the decoder, so
     both passes always agree on where each character ends.  */
  for (p = str; (end == NULL || p < end) && *p; p += utf8_char_span (p, end))
    ++n_chars;

  result = malloc (sizeof *result * ((size_t) n_chars + 1));
  if (result == NULL)
    return NULL;

  p = str;
  for (i = 0; i < n_chars; i++)
    {
      unsigned long wc = (unsigned char) p[0];
      int span = utf8_char_span (p, end);
      int j;

      /* Strip the length marker bits from a multi-byte lead byte.  */
      if (wc >= 0x80)
        {
          if (wc < 0xe0)
            wc &= 0x1f;
          else if (wc < 0xf0)
            wc &= 0x0f;
          else if (wc < 0xf8)
            wc &= 0x07;
          else if (wc < 0xfc)
            wc &= 0x03;
          else
            wc &= 0x01;
        }

      for (j = 1; j < span; j++)
        {
          wc <<= 6;
          wc |= (unsigned char) p[j] & 0x3f;
        }

      result[i] = wc;
      p += span;
    }
  result[i] = 0;

  if (items_written)
    *items_written = i;

  return result;
}
/* This one is kept around for binary backwards compatibility with
   library version CURRENT=1.  It forwards its arguments unchanged to
   stringprep_utf8_to_ucs4() and returns that function's result.  */
unsigned long *
stringprep_utf8_to_ucs4_fast (const char *str, int len, int *items_written)
{
  return stringprep_utf8_to_ucs4 (str, len, items_written);
}
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 *
 * Convert a string from a fixed width representation as UCS-4
 * to UTF-8.  The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with free().  If a character
 *               at or above 0x80000000 is found, %NULL is returned and
 *               @items_read is set to the index of that character.
 *               %NULL is also returned if memory could not be
 *               allocated, with @items_read set to 0.  In both cases
 *               @items_written is not modified.
 **/
char *
stringprep_ucs4_to_utf8 (const unsigned long *str,
                         int len, int *items_read, int *items_written)
{
  int result_length;
  char *result = NULL;
  char *p;
  int i;

  /* First pass: validate and measure the encoded length.  */
  result_length = 0;
  for (i = 0; len < 0 || i < len; i++)
    {
      if (!str[i])
        break;

      /* Not representable in six UTF-8 bytes; i stays at the index of
         the offending character.  */
      if (str[i] >= 0x80000000)
        goto err_out;

      result_length += UTF8_LENGTH (str[i]);
    }

  result = malloc ((size_t) result_length + 1);
  if (result == NULL)
    {
      i = 0;
      goto err_out;
    }

  /* Second pass: encode until the measured length is filled.  */
  i = 0;
  p = result;
  while (p < result + result_length)
    p += stringprep_unichar_to_utf8 (str[i++], p);

  *p = '\0';

  if (items_written)
    *items_written = (int) (p - result);

err_out:
  if (items_read)
    *items_read = i;

  return result;
}
688 * @str: a UTF-8 encoded string.
689 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
690 * @mode: the type of normalization to perform.
692 * Converts a string into canonical form, standardizing
693 * such issues as whether a character with an accent
694 * is represented as a base character and combining
695 * accent or as a single precomposed character. You
696 * should generally call g_utf8_normalize() before
697 * comparing two Unicode strings.
699 * The normalization mode %G_NORMALIZE_DEFAULT only
700 * standardizes differences that do not affect the
701 * text content, such as the above-mentioned accent
702 * representation. %G_NORMALIZE_ALL also standardizes
703 * the "compatibility" characters in Unicode, such
704 * as SUPERSCRIPT THREE to the standard forms
705 * (in this case DIGIT THREE). Formatting information
706 * may be lost but for most text operations such
707 * characters should be considered the same.
708 * For example, g_utf8_collate() normalizes
709 * with %G_NORMALIZE_ALL as its first step.
711 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
712 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
713 * but returned a result with composed forms rather
714 * than a maximally decomposed form. This is often
715 * useful if you intend to convert the string to
716 * a legacy encoding or pass it to a system with
717 * less capable Unicode handling.
719 * Return value: a newly allocated string, that is the
720 * normalized form of @str.
723 g_utf8_normalize (const char *str
, int len
, GNormalizeMode mode
)
725 unsigned long *result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
728 result
= stringprep_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
);
735 stringprep_utf8_nfkc_normalize (const char *str
, int len
)
737 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);
741 stringprep_ucs4_nfkc_normalize (unsigned long *str
, int len
)
744 unsigned long *result_wc
;
746 p
= stringprep_ucs4_to_utf8 (str
, len
, 0, 0);
747 result_wc
= _g_utf8_normalize_wc (p
, -1, G_NORMALIZE_NFKC
);