1 /* Unicode character case mappings.
2 Copyright (C) 2002, 2009-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
35 /* ========================================================================= */
37 /* Character case mappings.
38 These mappings are locale and context independent.
39 WARNING! These functions are not sufficient for languages such as German.
40 Better use the functions below that treat an entire string at once and are language aware. */
43 /* Return the uppercase mapping of a Unicode character.
   UC is the character to map.
   NOTE(review): the return type and the closing ';' are not visible in this
   declaration; it appears truncated -- confirm against the upstream header. */
45 uc_toupper (ucs4_t uc
)
48 /* Return the lowercase mapping of a Unicode character.
   UC is the character to map.
   NOTE(review): the return type and the closing ';' are not visible in this
   declaration; it appears truncated -- confirm against the upstream header. */
50 uc_tolower (ucs4_t uc
)
53 /* Return the titlecase mapping of a Unicode character.
   UC is the character to map.
   NOTE(review): the return type and the closing ';' are not visible in this
   declaration; it appears truncated -- confirm against the upstream header. */
55 uc_totitle (ucs4_t uc
)
58 /* ========================================================================= */
60 /* String case mappings. */
62 /* These functions are locale dependent. The iso639_language argument
63 identifies the language (e.g. "tr" for Turkish). NULL means to use
64 locale independent case mappings. */
66 /* Return the ISO 639 language code of the current locale.
67 Return "" if it is unknown, or in the "C" locale.
   The function takes no arguments.
   NOTE(review): the return type and the closing ';' are not visible in this
   declaration; it appears truncated -- confirm against the upstream header. */
69 uc_locale_language (void)
74 /* All functions prefixed with u8_ operate on UTF-8 encoded strings.
75 Their unit is an uint8_t (1 byte).
77 All functions prefixed with u16_ operate on UTF-16 encoded strings.
78 Their unit is an uint16_t (a 2-byte word).
80 All functions prefixed with u32_ operate on UCS-4 encoded strings.
81 Their unit is an uint32_t (a 4-byte word).
83 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly n units.
86 Functions returning a string result take a (resultbuf, lengthp) argument
87 pair. If resultbuf is not NULL and the result fits into *lengthp units,
88 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
89 allocated string is returned. In both cases, *lengthp is set to the
90 length (number of units) of the returned string. In case of error,
91 NULL is returned and errno is set. */
93 /* Return the uppercase mapping of a string.
94 The nf argument identifies the normalization form to apply after the
95 case-mapping. It can also be NULL, for no normalization.
   (s, n) is the input string.  iso639_language identifies the language, or
   is NULL for locale independent case mappings.  The result is delivered
   through the (resultbuf, lengthp) pair.
   NOTE(review): neither a return type nor an nf parameter is visible in the
   prototypes below; they appear truncated -- confirm against the upstream
   header. */
/* Variant for strings of uint8_t units. */
97 u8_toupper (const uint8_t *s
, size_t n
, const char *iso639_language
,
99 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint16_t units. */
101 u16_toupper (const uint16_t *s
, size_t n
, const char *iso639_language
,
103 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint32_t units. */
105 u32_toupper (const uint32_t *s
, size_t n
, const char *iso639_language
,
107 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
109 /* Return the lowercase mapping of a string.
110 The nf argument identifies the normalization form to apply after the
111 case-mapping. It can also be NULL, for no normalization.
   (s, n) is the input string.  iso639_language identifies the language, or
   is NULL for locale independent case mappings.  The result is delivered
   through the (resultbuf, lengthp) pair.
   NOTE(review): neither a return type nor an nf parameter is visible in the
   prototypes below; they appear truncated -- confirm against the upstream
   header. */
/* Variant for strings of uint8_t units. */
113 u8_tolower (const uint8_t *s
, size_t n
, const char *iso639_language
,
115 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint16_t units. */
117 u16_tolower (const uint16_t *s
, size_t n
, const char *iso639_language
,
119 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint32_t units. */
121 u32_tolower (const uint32_t *s
, size_t n
, const char *iso639_language
,
123 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
125 /* Return the titlecase mapping of a string.
126 The nf argument identifies the normalization form to apply after the
127 case-mapping. It can also be NULL, for no normalization.
   (s, n) is the input string.  iso639_language identifies the language, or
   is NULL for locale independent case mappings.  The result is delivered
   through the (resultbuf, lengthp) pair.
   NOTE(review): neither a return type nor an nf parameter is visible in the
   prototypes below; they appear truncated -- confirm against the upstream
   header. */
/* Variant for strings of uint8_t units. */
129 u8_totitle (const uint8_t *s
, size_t n
, const char *iso639_language
,
131 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint16_t units. */
133 u16_totitle (const uint16_t *s
, size_t n
, const char *iso639_language
,
135 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* Variant for strings of uint32_t units. */
137 u32_totitle (const uint32_t *s
, size_t n
, const char *iso639_language
,
139 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
/* The case-mapping context given by a prefix string.
   Values of this type are passed to and returned from the casing functions
   by value.  */
typedef struct casing_prefix_context
{
  /* These fields are private, undocumented.  */
  uint32_t last_char_except_ignorable;
  uint32_t last_char_normal_or_above;
}
casing_prefix_context_t;
149 /* The case-mapping context of the empty prefix string.
   Declared here as a read-only object with external linkage; its definition
   is not part of this header. */
150 extern const casing_prefix_context_t unicase_empty_prefix_context
;
151 /* Return the case-mapping context of a given prefix string.
   (s, n) is the prefix string.  The three variants differ only in the unit
   type of s (uint8_t, uint16_t, uint32_t).  The context is returned by
   value. */
152 extern casing_prefix_context_t
153 u8_casing_prefix_context (const uint8_t *s
, size_t n
);
154 extern casing_prefix_context_t
155 u16_casing_prefix_context (const uint16_t *s
, size_t n
);
156 extern casing_prefix_context_t
157 u32_casing_prefix_context (const uint32_t *s
, size_t n
);
158 /* Return the case-mapping context of the prefix concat(A, S), given the
159 case-mapping context of the prefix A.
   (s, n) is the string S and a_context is the context of A.  The three
   variants differ only in the unit type of s.  The combined context is
   returned by value. */
160 extern casing_prefix_context_t
161 u8_casing_prefixes_context (const uint8_t *s
, size_t n
,
162 casing_prefix_context_t a_context
);
163 extern casing_prefix_context_t
164 u16_casing_prefixes_context (const uint16_t *s
, size_t n
,
165 casing_prefix_context_t a_context
);
166 extern casing_prefix_context_t
167 u32_casing_prefixes_context (const uint32_t *s
, size_t n
,
168 casing_prefix_context_t a_context
);
170 /* The case-mapping context given by a suffix string.
   NOTE(review): the braces around the member list are not visible here, and
   the embedded numbering skips two lines before the typedef name, so a
   member may be missing as well -- confirm against the upstream header
   before relying on this layout. */
171 typedef struct casing_suffix_context
173 /* These fields are private, undocumented. */
174 uint32_t first_char_except_ignorable
;
177 casing_suffix_context_t
;
178 /* The case-mapping context of the empty suffix string.
   Declared here as a read-only object with external linkage; its definition
   is not part of this header. */
179 extern const casing_suffix_context_t unicase_empty_suffix_context
;
180 /* Return the case-mapping context of a given suffix string.
   (s, n) is the suffix string.  The three variants differ only in the unit
   type of s (uint8_t, uint16_t, uint32_t).  The context is returned by
   value. */
181 extern casing_suffix_context_t
182 u8_casing_suffix_context (const uint8_t *s
, size_t n
);
183 extern casing_suffix_context_t
184 u16_casing_suffix_context (const uint16_t *s
, size_t n
);
185 extern casing_suffix_context_t
186 u32_casing_suffix_context (const uint32_t *s
, size_t n
);
187 /* Return the case-mapping context of the suffix concat(S, A), given the
188 case-mapping context of the suffix A.
   (s, n) is the string S and a_context is the context of A.  The three
   variants differ only in the unit type of s.  The combined context is
   returned by value. */
189 extern casing_suffix_context_t
190 u8_casing_suffixes_context (const uint8_t *s
, size_t n
,
191 casing_suffix_context_t a_context
);
192 extern casing_suffix_context_t
193 u16_casing_suffixes_context (const uint16_t *s
, size_t n
,
194 casing_suffix_context_t a_context
);
195 extern casing_suffix_context_t
196 u32_casing_suffixes_context (const uint32_t *s
, size_t n
,
197 casing_suffix_context_t a_context
);
199 /* Return the uppercase mapping of a string that is surrounded by a prefix and a suffix. */
/* Uppercase mapping of a string with explicit surrounding context, for
   strings of uint8_t, uint16_t and uint32_t units.
   (s, n) is the string to map.  prefix_context and suffix_context are the
   case-mapping contexts of the text before and after it.  iso639_language
   identifies the language, or is NULL for locale independent case mappings.
   The result is delivered through the (resultbuf, lengthp) pair.
   NOTE(review): no return type is visible, and the embedded numbering skips a
   line between iso639_language and resultbuf; the prototypes appear
   truncated -- confirm against the upstream header. */
202 u8_ct_toupper (const uint8_t *s
, size_t n
,
203 casing_prefix_context_t prefix_context
,
204 casing_suffix_context_t suffix_context
,
205 const char *iso639_language
,
207 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
209 u16_ct_toupper (const uint16_t *s
, size_t n
,
210 casing_prefix_context_t prefix_context
,
211 casing_suffix_context_t suffix_context
,
212 const char *iso639_language
,
214 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
216 u32_ct_toupper (const uint32_t *s
, size_t n
,
217 casing_prefix_context_t prefix_context
,
218 casing_suffix_context_t suffix_context
,
219 const char *iso639_language
,
221 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
223 /* Return the lowercase mapping of a string that is surrounded by a prefix and a suffix. */
/* Lowercase mapping of a string with explicit surrounding context, for
   strings of uint8_t, uint16_t and uint32_t units.
   (s, n) is the string to map.  prefix_context and suffix_context are the
   case-mapping contexts of the text before and after it.  iso639_language
   identifies the language, or is NULL for locale independent case mappings.
   The result is delivered through the (resultbuf, lengthp) pair.
   NOTE(review): no return type is visible, and the embedded numbering skips a
   line between iso639_language and resultbuf; the prototypes appear
   truncated -- confirm against the upstream header. */
226 u8_ct_tolower (const uint8_t *s
, size_t n
,
227 casing_prefix_context_t prefix_context
,
228 casing_suffix_context_t suffix_context
,
229 const char *iso639_language
,
231 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
233 u16_ct_tolower (const uint16_t *s
, size_t n
,
234 casing_prefix_context_t prefix_context
,
235 casing_suffix_context_t suffix_context
,
236 const char *iso639_language
,
238 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
240 u32_ct_tolower (const uint32_t *s
, size_t n
,
241 casing_prefix_context_t prefix_context
,
242 casing_suffix_context_t suffix_context
,
243 const char *iso639_language
,
245 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
247 /* Return the titlecase mapping of a string that is surrounded by a prefix and a suffix. */
/* Titlecase mapping of a string with explicit surrounding context, for
   strings of uint8_t, uint16_t and uint32_t units.
   (s, n) is the string to map.  prefix_context and suffix_context are the
   case-mapping contexts of the text before and after it.  iso639_language
   identifies the language, or is NULL for locale independent case mappings.
   The result is delivered through the (resultbuf, lengthp) pair.
   NOTE(review): no return type is visible, and the embedded numbering skips a
   line between iso639_language and resultbuf; the prototypes appear
   truncated -- confirm against the upstream header. */
250 u8_ct_totitle (const uint8_t *s
, size_t n
,
251 casing_prefix_context_t prefix_context
,
252 casing_suffix_context_t suffix_context
,
253 const char *iso639_language
,
255 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
257 u16_ct_totitle (const uint16_t *s
, size_t n
,
258 casing_prefix_context_t prefix_context
,
259 casing_suffix_context_t suffix_context
,
260 const char *iso639_language
,
262 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
264 u32_ct_totitle (const uint32_t *s
, size_t n
,
265 casing_prefix_context_t prefix_context
,
266 casing_suffix_context_t suffix_context
,
267 const char *iso639_language
,
269 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
271 /* Return the case folded string.
272 Comparing uN_casefold (S1) and uN_casefold (S2) with uN_cmp2() is equivalent
273 to comparing S1 and S2 with uN_casecmp().
274 The nf argument identifies the normalization form to apply after the
275 case-mapping. It can also be NULL, for no normalization.
   (s, n) is the input string.  iso639_language identifies the language, or
   is NULL for locale independent case mappings.  The result is delivered
   through the (resultbuf, lengthp) pair.
   NOTE(review): neither a return type nor an nf parameter is visible in the
   prototypes below; they appear truncated -- confirm against the upstream
   header. */
277 u8_casefold (const uint8_t *s
, size_t n
, const char *iso639_language
,
279 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
281 u16_casefold (const uint16_t *s
, size_t n
, const char *iso639_language
,
283 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
285 u32_casefold (const uint32_t *s
, size_t n
, const char *iso639_language
,
287 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
288 /* Likewise, for a string that is surrounded by a prefix and a suffix.
   (s, n) is the string to fold.  prefix_context and suffix_context are the
   case-mapping contexts of the text before and after it.  iso639_language
   identifies the language, or is NULL for locale independent case mappings.
   The result is delivered through the (resultbuf, lengthp) pair.
   NOTE(review): no return type is visible, and the embedded numbering skips a
   line between iso639_language and resultbuf; the prototypes appear
   truncated -- confirm against the upstream header. */
290 u8_ct_casefold (const uint8_t *s
, size_t n
,
291 casing_prefix_context_t prefix_context
,
292 casing_suffix_context_t suffix_context
,
293 const char *iso639_language
,
295 uint8_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
297 u16_ct_casefold (const uint16_t *s
, size_t n
,
298 casing_prefix_context_t prefix_context
,
299 casing_suffix_context_t suffix_context
,
300 const char *iso639_language
,
302 uint16_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
304 u32_ct_casefold (const uint32_t *s
, size_t n
,
305 casing_prefix_context_t prefix_context
,
306 casing_suffix_context_t suffix_context
,
307 const char *iso639_language
,
309 uint32_t *_UC_RESTRICT resultbuf
, size_t *lengthp
);
311 /* Compare S1 and S2, ignoring differences in case and normalization.
312 The nf argument identifies the normalization form to apply after the
313 case-mapping. It can also be NULL, for no normalization.
314 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
315 return 0. Upon failure, return -1 with errno set.
   (s1, n1) and (s2, n2) are the two strings.  iso639_language identifies the
   language, or is NULL for locale independent case mappings.  The u8_, u16_
   and u32_ variants take uint8_t, uint16_t and uint32_t units; ulc_casecmp
   takes its strings as const char *.
   NOTE(review): the return type is not visible in the prototypes below --
   confirm against the upstream header. */
317 u8_casecmp (const uint8_t *s1
, size_t n1
,
318 const uint8_t *s2
, size_t n2
,
319 const char *iso639_language
, uninorm_t nf
, int *resultp
);
321 u16_casecmp (const uint16_t *s1
, size_t n1
,
322 const uint16_t *s2
, size_t n2
,
323 const char *iso639_language
, uninorm_t nf
, int *resultp
);
325 u32_casecmp (const uint32_t *s1
, size_t n1
,
326 const uint32_t *s2
, size_t n2
,
327 const char *iso639_language
, uninorm_t nf
, int *resultp
);
329 ulc_casecmp (const char *s1
, size_t n1
,
330 const char *s2
, size_t n2
,
331 const char *iso639_language
, uninorm_t nf
, int *resultp
);
333 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
334 a way that comparing uN_casexfrm (S1) and uN_casexfrm (S2) with the gnulib
335 function memcmp2() is equivalent to comparing S1 and S2 with uN_casecoll().
336 NF must be either UNINORM_NFC, UNINORM_NFKC, or NULL for no normalization.
   (s, n) is the input string.  iso639_language identifies the language, or
   is NULL for locale independent case mappings.  The result buffer is a char
   buffer in every variant, delivered through the (resultbuf, lengthp) pair.
   ulc_casexfrm takes its input as const char *.
   NOTE(review): neither a return type nor an NF parameter is visible in the
   prototypes below; they appear truncated -- confirm against the upstream
   header. */
338 u8_casexfrm (const uint8_t *s
, size_t n
, const char *iso639_language
,
340 char *_UC_RESTRICT resultbuf
, size_t *lengthp
);
342 u16_casexfrm (const uint16_t *s
, size_t n
, const char *iso639_language
,
344 char *_UC_RESTRICT resultbuf
, size_t *lengthp
);
346 u32_casexfrm (const uint32_t *s
, size_t n
, const char *iso639_language
,
348 char *_UC_RESTRICT resultbuf
, size_t *lengthp
);
350 ulc_casexfrm (const char *s
, size_t n
, const char *iso639_language
,
352 char *_UC_RESTRICT resultbuf
, size_t *lengthp
);
354 /* Compare S1 and S2, ignoring differences in case and normalization, using the
355 collation rules of the current locale.
356 The nf argument identifies the normalization form to apply after the
357 case-mapping. It must be either UNINORM_NFC or UNINORM_NFKC. It can also
358 be NULL, for no normalization.
359 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
360 return 0. Upon failure, return -1 with errno set.
   (s1, n1) and (s2, n2) are the two strings.  iso639_language identifies the
   language, or is NULL for locale independent case mappings.  The u8_, u16_
   and u32_ variants take uint8_t, uint16_t and uint32_t units; ulc_casecoll
   takes its strings as const char *.
   NOTE(review): the return type is not visible in the prototypes below --
   confirm against the upstream header. */
362 u8_casecoll (const uint8_t *s1
, size_t n1
,
363 const uint8_t *s2
, size_t n2
,
364 const char *iso639_language
, uninorm_t nf
, int *resultp
);
366 u16_casecoll (const uint16_t *s1
, size_t n1
,
367 const uint16_t *s2
, size_t n2
,
368 const char *iso639_language
, uninorm_t nf
, int *resultp
);
370 u32_casecoll (const uint32_t *s1
, size_t n1
,
371 const uint32_t *s2
, size_t n2
,
372 const char *iso639_language
, uninorm_t nf
, int *resultp
);
374 ulc_casecoll (const char *s1
, size_t n1
,
375 const char *s2
, size_t n2
,
376 const char *iso639_language
, uninorm_t nf
, int *resultp
);
379 /* Set *RESULTP to true if mapping NFD(S) to upper case is a no-op, or to false
380 otherwise, and return 0. Upon failure, return -1 with errno set.
   (s, n) is the string S.  iso639_language identifies the language, or is
   NULL for locale independent case mappings.
   NOTE(review): each prototype below stops after the iso639_language
   parameter; the return type, the RESULTP parameter and the closing ");" are
   not visible -- confirm against the upstream header. */
382 u8_is_uppercase (const uint8_t *s
, size_t n
,
383 const char *iso639_language
,
386 u16_is_uppercase (const uint16_t *s
, size_t n
,
387 const char *iso639_language
,
390 u32_is_uppercase (const uint32_t *s
, size_t n
,
391 const char *iso639_language
,
394 /* Set *RESULTP to true if mapping NFD(S) to lower case is a no-op, or to false
395 otherwise, and return 0. Upon failure, return -1 with errno set.
   (s, n) is the string S.  iso639_language identifies the language, or is
   NULL for locale independent case mappings.
   NOTE(review): each prototype below stops after the iso639_language
   parameter; the return type, the RESULTP parameter and the closing ");" are
   not visible -- confirm against the upstream header. */
397 u8_is_lowercase (const uint8_t *s
, size_t n
,
398 const char *iso639_language
,
401 u16_is_lowercase (const uint16_t *s
, size_t n
,
402 const char *iso639_language
,
405 u32_is_lowercase (const uint32_t *s
, size_t n
,
406 const char *iso639_language
,
409 /* Set *RESULTP to true if mapping NFD(S) to title case is a no-op, or to false
410 otherwise, and return 0. Upon failure, return -1 with errno set.
   (s, n) is the string S.  iso639_language identifies the language, or is
   NULL for locale independent case mappings.
   NOTE(review): each prototype below stops after the iso639_language
   parameter; the return type, the RESULTP parameter and the closing ");" are
   not visible -- confirm against the upstream header. */
412 u8_is_titlecase (const uint8_t *s
, size_t n
,
413 const char *iso639_language
,
416 u16_is_titlecase (const uint16_t *s
, size_t n
,
417 const char *iso639_language
,
420 u32_is_titlecase (const uint32_t *s
, size_t n
,
421 const char *iso639_language
,
424 /* Set *RESULTP to true if applying case folding to NFD(S) is a no-op, or to
425 false otherwise, and return 0. Upon failure, return -1 with errno set.
   (s, n) is the string S.  iso639_language identifies the language, or is
   NULL for locale independent case mappings.
   NOTE(review): each prototype below stops after the iso639_language
   parameter; the return type, the RESULTP parameter and the closing ");" are
   not visible -- confirm against the upstream header. */
427 u8_is_casefolded (const uint8_t *s
, size_t n
,
428 const char *iso639_language
,
431 u16_is_casefolded (const uint16_t *s
, size_t n
,
432 const char *iso639_language
,
435 u32_is_casefolded (const uint32_t *s
, size_t n
,
436 const char *iso639_language
,
439 /* Set *RESULTP to true if case matters for S, that is, if mapping NFD(S) to
440 either upper case or lower case or title case is not a no-op.
441 Set *RESULTP to false if NFD(S) maps to itself under the upper case mapping,
442 under the lower case mapping, and under the title case mapping; in other
443 words, when NFD(S) consists entirely of caseless characters.
444 Upon failure, return -1 with errno set.
   (s, n) is the string S.  iso639_language identifies the language, or is
   NULL for locale independent case mappings.
   NOTE(review): each prototype below stops after the iso639_language
   parameter; the return type, the RESULTP parameter and the closing ");" are
   not visible -- confirm against the upstream header. */
446 u8_is_cased (const uint8_t *s
, size_t n
,
447 const char *iso639_language
,
450 u16_is_cased (const uint16_t *s
, size_t n
,
451 const char *iso639_language
,
454 u32_is_cased (const uint32_t *s
, size_t n
,
455 const char *iso639_language
,
459 /* ========================================================================= */
465 #endif /* _UNICASE_H */