1 /* Normalization forms (composition and decomposition) of Unicode strings.
2 Copyright (C) 2001-2002, 2009-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation; either version 2.1 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
26 #if @HAVE_UNISTRING_WOE32DLL_H@
27 # include <unistring/woe32dll.h>
29 # define LIBUNISTRING_DLL_VARIABLE
40 All functions prefixed with u8_ operate on UTF-8 encoded strings.
41 Their unit is an uint8_t (1 byte).
43 All functions prefixed with u16_ operate on UTF-16 encoded strings.
44 Their unit is an uint16_t (a 2-byte word).
46 All functions prefixed with u32_ operate on UCS-4 encoded strings.
47 Their unit is an uint32_t (a 4-byte word).
49 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly n units.
52 Functions returning a string result take a (resultbuf, lengthp) argument
53 pair. If resultbuf is not NULL and the result fits into *lengthp units,
54 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
55 allocated string is returned. In both cases, *lengthp is set to the
56 length (number of units) of the returned string. In case of error,
57 NULL is returned and errno is set. */
62 UC_DECOMP_CANONICAL
,/* Canonical decomposition. */
63 UC_DECOMP_FONT
, /* <font> A font variant (e.g. a blackletter form). */
64 UC_DECOMP_NOBREAK
, /* <noBreak> A no-break version of a space or hyphen. */
65 UC_DECOMP_INITIAL
, /* <initial> An initial presentation form (Arabic). */
66 UC_DECOMP_MEDIAL
, /* <medial> A medial presentation form (Arabic). */
67 UC_DECOMP_FINAL
, /* <final> A final presentation form (Arabic). */
68 UC_DECOMP_ISOLATED
,/* <isolated> An isolated presentation form (Arabic). */
69 UC_DECOMP_CIRCLE
, /* <circle> An encircled form. */
70 UC_DECOMP_SUPER
, /* <super> A superscript form. */
71 UC_DECOMP_SUB
, /* <sub> A subscript form. */
72 UC_DECOMP_VERTICAL
,/* <vertical> A vertical layout presentation form. */
73 UC_DECOMP_WIDE
, /* <wide> A wide (or zenkaku) compatibility character. */
74 UC_DECOMP_NARROW
, /* <narrow> A narrow (or hankaku) compatibility character. */
75 UC_DECOMP_SMALL
, /* <small> A small variant form (CNS compatibility). */
76 UC_DECOMP_SQUARE
, /* <square> A CJK squared font variant. */
77 UC_DECOMP_FRACTION
,/* <fraction> A vulgar fraction form. */
78 UC_DECOMP_COMPAT
/* <compat> Otherwise unspecified compatibility character. */
81 /* Maximum size of decomposition of a single Unicode character.
This is the minimum number of elements required of the DECOMPOSITION arrays
passed to uc_decomposition and uc_canonical_decomposition. */
82 #define UC_DECOMPOSITION_MAX_LENGTH 32
84 /* Return the character decomposition mapping of a Unicode character.
85 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH ucs4_t elements.
87 When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
88 filled and N is returned. Otherwise -1 is returned.
UC is the character to look up.
NOTE(review): *DECOMP_TAG presumably receives one of the UC_DECOMP_* values;
confirm against the implementation. */
90 uc_decomposition (ucs4_t uc
, int *decomp_tag
, ucs4_t
*decomposition
);
92 /* Return the canonical character decomposition mapping of a Unicode character.
93 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH ucs4_t elements.
95 When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
96 returned. Otherwise -1 is returned.
UC is the character to look up. Unlike uc_decomposition, no decomposition
tag is reported. */
98 uc_canonical_decomposition (ucs4_t uc
, ucs4_t
*decomposition
);
101 /* Attempt to combine the Unicode characters uc1, uc2.
102 uc1 is known to have canonical combining class 0.
103 Return the combination of uc1 and uc2, if it exists.
105 Not all decompositions can be recombined using this function. See the
106 Unicode file CompositionExclusions.txt for details. */
108 uc_composition (ucs4_t uc1
, ucs4_t uc2
)
112 /* An object of type uninorm_t denotes a Unicode normalization form.
uninorm_t is a pointer to a const struct unicode_normalization_form; the
structure is only forward-declared in the part of the header shown here. */
113 struct unicode_normalization_form
;
114 typedef const struct unicode_normalization_form
*uninorm_t
;
116 /* UNINORM_NFD: Normalization form D: canonical decomposition.
The macro expands to the address of the uninorm_nfd object, i.e. a pointer
to a const struct unicode_normalization_form, which is what uninorm_t is. */
117 extern @GNULIB_UNINORM_NFD_DLL_VARIABLE@
const struct unicode_normalization_form uninorm_nfd
;
118 #define UNINORM_NFD (&uninorm_nfd)
120 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
121 canonical composition.
The macro expands to the address of the uninorm_nfc object, i.e. a pointer
to a const struct unicode_normalization_form, which is what uninorm_t is. */
122 extern @GNULIB_UNINORM_NFC_DLL_VARIABLE@
const struct unicode_normalization_form uninorm_nfc
;
123 #define UNINORM_NFC (&uninorm_nfc)
125 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
The macro expands to the address of the uninorm_nfkd object, i.e. a pointer
to a const struct unicode_normalization_form, which is what uninorm_t is. */
126 extern @GNULIB_UNINORM_NFKD_DLL_VARIABLE@
const struct unicode_normalization_form uninorm_nfkd
;
127 #define UNINORM_NFKD (&uninorm_nfkd)
129 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
130 canonical composition.
The macro expands to the address of the uninorm_nfkc object, i.e. a pointer
to a const struct unicode_normalization_form, which is what uninorm_t is. */
131 extern @GNULIB_UNINORM_NFKC_DLL_VARIABLE@
const struct unicode_normalization_form uninorm_nfkc
;
132 #define UNINORM_NFKC (&uninorm_nfkc)
134 /* Test whether a normalization form does compatibility decomposition.
The macro reads the object NF points to as an unsigned int and yields
bit 0 of that value (0 or 1). NF is evaluated exactly once.
NOTE(review): this presumes that struct unicode_normalization_form begins
with an unsigned int holding these flag bits; confirm against the
structure's definition, which is not visible here. */
135 #define uninorm_is_compat_decomposing(nf) \
136 ((* (const unsigned int *) (nf) >> 0) & 1)
138 /* Test whether a normalization form includes canonical composition.
The macro reads the object NF points to as an unsigned int and yields
bit 1 of that value (0 or 1). NF is evaluated exactly once.
NOTE(review): this presumes that struct unicode_normalization_form begins
with an unsigned int holding these flag bits; confirm against the
structure's definition, which is not visible here. */
139 #define uninorm_is_composing(nf) \
140 ((* (const unsigned int *) (nf) >> 1) & 1)
142 /* Return the decomposing variant of a normalization form.
143 This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */
145 uninorm_decomposing_form (uninorm_t nf
)
149 /* Return the specified normalization form of a string.
NF selects the normalization form and (S, N) is the input string.
The u8_, u16_ and u32_ variants operate on UTF-8, UTF-16 and UCS-4
strings respectively, so N and *LENGTHP count units of the matching type.
NOTE(review): the return types are not visible here; the (RESULTBUF,
LENGTHP) pair presumably follows the convention stated near the top of this
header for functions returning a string result (RESULTBUF is used when it
is not NULL and large enough, otherwise a freshly allocated string is
returned; NULL with errno set on error). Confirm against the full
declarations. */
151 u8_normalize (uninorm_t nf
, const uint8_t *s
, size_t n
,
152 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
154 u16_normalize (uninorm_t nf
, const uint16_t *s
, size_t n
,
155 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
157 u32_normalize (uninorm_t nf
, const uint32_t *s
, size_t n
,
158 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
161 /* Compare S1 and S2, ignoring differences in normalization.
162 NF must be either UNINORM_NFD or UNINORM_NFKD.
163 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
164 return 0. Upon failure, return -1 with errno set.
(S1, N1) and (S2, N2) are the two strings to compare. The u8_, u16_ and
u32_ variants operate on UTF-8, UTF-16 and UCS-4 strings respectively. */
166 u8_normcmp (const uint8_t *s1
, size_t n1
, const uint8_t *s2
, size_t n2
,
167 uninorm_t nf
, int *resultp
);
169 u16_normcmp (const uint16_t *s1
, size_t n1
, const uint16_t *s2
, size_t n2
,
170 uninorm_t nf
, int *resultp
);
172 u32_normcmp (const uint32_t *s1
, size_t n1
, const uint32_t *s2
, size_t n2
,
173 uninorm_t nf
, int *resultp
);
176 /* Converts the string S of length N to a NUL-terminated byte sequence, in such
177 a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
178 equivalent to comparing S1 and S2 with uN_normcoll().
179 NF must be either UNINORM_NFC or UNINORM_NFKC.
The u8_, u16_ and u32_ variants take UTF-8, UTF-16 and UCS-4 input
respectively; RESULTBUF is a char buffer in all three.
NOTE(review): the return types are not visible here; the (RESULTBUF,
LENGTHP) pair presumably follows the convention stated near the top of this
header for functions returning a string result. Confirm against the full
declarations. */
181 u8_normxfrm (const uint8_t *s
, size_t n
, uninorm_t nf
,
182 char *resultbuf
, size_t *lengthp
);
184 u16_normxfrm (const uint16_t *s
, size_t n
, uninorm_t nf
,
185 char *resultbuf
, size_t *lengthp
);
187 u32_normxfrm (const uint32_t *s
, size_t n
, uninorm_t nf
,
188 char *resultbuf
, size_t *lengthp
);
191 /* Compare S1 and S2, ignoring differences in normalization, using the
192 collation rules of the current locale.
193 NF must be either UNINORM_NFC or UNINORM_NFKC.
194 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
195 return 0. Upon failure, return -1 with errno set.
(S1, N1) and (S2, N2) are the two strings to compare. The u8_, u16_ and
u32_ variants operate on UTF-8, UTF-16 and UCS-4 strings respectively. */
197 u8_normcoll (const uint8_t *s1
, size_t n1
, const uint8_t *s2
, size_t n2
,
198 uninorm_t nf
, int *resultp
);
200 u16_normcoll (const uint16_t *s1
, size_t n1
, const uint16_t *s2
, size_t n2
,
201 uninorm_t nf
, int *resultp
);
203 u32_normcoll (const uint32_t *s1
, size_t n1
, const uint32_t *s2
, size_t n2
,
204 uninorm_t nf
, int *resultp
);
207 /* Normalization of a stream of Unicode characters.
209 A "stream of Unicode characters" is essentially a function that accepts an
210 ucs4_t argument repeatedly, optionally combined with a function that
211 "flushes" the stream. */
213 /* Data type of a stream of Unicode characters that normalizes its input
214 according to a given normalization form and passes the normalized character
215 sequence to the encapsulated stream of Unicode characters.
The structure is only forward-declared in the part of the header shown
here; filters are handled through struct uninorm_filter pointers by the
uninorm_filter_* functions. */
216 struct uninorm_filter
;
218 /* Bring data buffered in the filter to its destination, the encapsulated
219 stream, then close and free the filter.
220 Return 0 if successful, or -1 with errno set upon failure.
Since the filter is freed by this call, FILTER must not be used afterwards. */
222 uninorm_filter_free (struct uninorm_filter
*filter
);
224 /* Create and return a normalization filter for Unicode characters.
225 The pair (stream_func, stream_data) is the encapsulated stream.
226 stream_func (stream_data, uc) receives the Unicode character uc
227 and returns 0 if successful, or -1 with errno set upon failure.
228 Return the new filter, or NULL with errno set upon failure.
NF is the normalization form the filter applies.
NOTE(review): the stream_data parameter named above does not appear in the
parameter list as shown here; a line of the declaration seems to be
missing. Verify against the full header.
NOTE(review): the _GL_ATTRIBUTE_DEALLOC annotation presumably marks
uninorm_filter_free as the function that releases the returned filter;
confirm against the macro's definition. */
229 extern struct uninorm_filter
*
230 uninorm_filter_create (uninorm_t nf
,
231 int (*stream_func
) (void *stream_data
, ucs4_t uc
),
233 _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free
, 1);
235 /* Stuff a Unicode character into a normalizing filter.
236 Return 0 if successful, or -1 with errno set upon failure.
FILTER is the filter to write to and UC the character to add. The filter
may hold data in a buffer; uninorm_filter_flush and uninorm_filter_free are
the functions documented as bringing buffered data to the encapsulated
stream. */
238 uninorm_filter_write (struct uninorm_filter
*filter
, ucs4_t uc
);
240 /* Bring data buffered in the filter to its destination, the encapsulated stream.
242 Return 0 if successful, or -1 with errno set upon failure.
243 Note! If after calling this function, additional characters are written
244 into the filter, the resulting character sequence in the encapsulated stream
245 will not necessarily be normalized. */
247 uninorm_filter_flush (struct uninorm_filter
*filter
);
255 #endif /* _UNINORM_H */