unistr/u{8,16,32}-uctomb: Avoid possible trouble with huge strings.
[gnulib.git] / lib / unictype.in.h
blob005b0eabdd8836e2fa5ca053c83f9868c4c23a43
1 /* Unicode character classification and properties.
2 Copyright (C) 2002, 2005-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #ifndef _UNICTYPE_H
18 #define _UNICTYPE_H
20 #include "unitypes.h"
22 /* Get bool. */
23 #include <stdbool.h>
25 /* Get size_t, NULL. */
26 #include <stddef.h>
28 #ifdef __cplusplus
29 extern "C" {
30 #endif
32 /* ========================================================================= */
34 /* Field 1 of Unicode Character Database: Character name.
35 See "uniname.h". */
37 /* ========================================================================= */
39 /* Field 2 of Unicode Character Database: General category. */
41 /* Data type denoting a General category value. This is not just a bitmask,
42 but rather a bitmask and a pointer to the lookup table, so that programs
43 that use only the predefined bitmasks (i.e. don't combine bitmasks with &
44 and |) don't have a link-time dependency towards the big general table. */
45 typedef struct
47 uint32_t bitmask : 31; /* category bits; presumably an OR of UC_CATEGORY_MASK_* values - TODO confirm */
48 /*bool*/ unsigned int generic : 1; /* selects which member of 'lookup' is in use */
49 union
51 const void *table; /* when generic is 0 */
52 bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
53 } lookup;
55 uc_general_category_t;
57 /* Bits and bit masks denoting General category values. UnicodeData-3.2.0.html
58 says a 32-bit integer will always suffice to represent them.
59 These bit masks can only be used with the uc_is_general_category_withtable
60 function. */
/* Each one-letter mask (L, M, N, P, S, Z, C) is the bitwise OR of the
   two-letter subcategory masks that follow it; LC covers Lu, Ll and Lt. */
61 enum
63 UC_CATEGORY_MASK_L = 0x0000001f, /* Lu | Ll | Lt | Lm | Lo */
64 UC_CATEGORY_MASK_LC = 0x00000007, /* Lu | Ll | Lt */
65 UC_CATEGORY_MASK_Lu = 0x00000001,
66 UC_CATEGORY_MASK_Ll = 0x00000002,
67 UC_CATEGORY_MASK_Lt = 0x00000004,
68 UC_CATEGORY_MASK_Lm = 0x00000008,
69 UC_CATEGORY_MASK_Lo = 0x00000010,
70 UC_CATEGORY_MASK_M = 0x000000e0, /* Mn | Mc | Me */
71 UC_CATEGORY_MASK_Mn = 0x00000020,
72 UC_CATEGORY_MASK_Mc = 0x00000040,
73 UC_CATEGORY_MASK_Me = 0x00000080,
74 UC_CATEGORY_MASK_N = 0x00000700, /* Nd | Nl | No */
75 UC_CATEGORY_MASK_Nd = 0x00000100,
76 UC_CATEGORY_MASK_Nl = 0x00000200,
77 UC_CATEGORY_MASK_No = 0x00000400,
78 UC_CATEGORY_MASK_P = 0x0003f800, /* Pc | Pd | Ps | Pe | Pi | Pf | Po */
79 UC_CATEGORY_MASK_Pc = 0x00000800,
80 UC_CATEGORY_MASK_Pd = 0x00001000,
81 UC_CATEGORY_MASK_Ps = 0x00002000,
82 UC_CATEGORY_MASK_Pe = 0x00004000,
83 UC_CATEGORY_MASK_Pi = 0x00008000,
84 UC_CATEGORY_MASK_Pf = 0x00010000,
85 UC_CATEGORY_MASK_Po = 0x00020000,
86 UC_CATEGORY_MASK_S = 0x003c0000, /* Sm | Sc | Sk | So */
87 UC_CATEGORY_MASK_Sm = 0x00040000,
88 UC_CATEGORY_MASK_Sc = 0x00080000,
89 UC_CATEGORY_MASK_Sk = 0x00100000,
90 UC_CATEGORY_MASK_So = 0x00200000,
91 UC_CATEGORY_MASK_Z = 0x01c00000, /* Zs | Zl | Zp */
92 UC_CATEGORY_MASK_Zs = 0x00400000,
93 UC_CATEGORY_MASK_Zl = 0x00800000,
94 UC_CATEGORY_MASK_Zp = 0x01000000,
95 UC_CATEGORY_MASK_C = 0x3e000000, /* Cc | Cf | Cs | Co | Cn */
96 UC_CATEGORY_MASK_Cc = 0x02000000,
97 UC_CATEGORY_MASK_Cf = 0x04000000,
98 UC_CATEGORY_MASK_Cs = 0x08000000,
99 UC_CATEGORY_MASK_Co = 0x10000000,
100 UC_CATEGORY_MASK_Cn = 0x20000000 /* highest bit used (bit 29); fits the 31-bit bitmask field of uc_general_category_t */
103 /* Predefined General category values. */
104 extern const uc_general_category_t UC_CATEGORY_L;
105 extern const uc_general_category_t UC_CATEGORY_LC;
106 extern const uc_general_category_t UC_CATEGORY_Lu;
107 extern const uc_general_category_t UC_CATEGORY_Ll;
108 extern const uc_general_category_t UC_CATEGORY_Lt;
109 extern const uc_general_category_t UC_CATEGORY_Lm;
110 extern const uc_general_category_t UC_CATEGORY_Lo;
111 extern const uc_general_category_t UC_CATEGORY_M;
112 extern const uc_general_category_t UC_CATEGORY_Mn;
113 extern const uc_general_category_t UC_CATEGORY_Mc;
114 extern const uc_general_category_t UC_CATEGORY_Me;
115 extern const uc_general_category_t UC_CATEGORY_N;
116 extern const uc_general_category_t UC_CATEGORY_Nd;
117 extern const uc_general_category_t UC_CATEGORY_Nl;
118 extern const uc_general_category_t UC_CATEGORY_No;
119 extern const uc_general_category_t UC_CATEGORY_P;
120 extern const uc_general_category_t UC_CATEGORY_Pc;
121 extern const uc_general_category_t UC_CATEGORY_Pd;
122 extern const uc_general_category_t UC_CATEGORY_Ps;
123 extern const uc_general_category_t UC_CATEGORY_Pe;
124 extern const uc_general_category_t UC_CATEGORY_Pi;
125 extern const uc_general_category_t UC_CATEGORY_Pf;
126 extern const uc_general_category_t UC_CATEGORY_Po;
127 extern const uc_general_category_t UC_CATEGORY_S;
128 extern const uc_general_category_t UC_CATEGORY_Sm;
129 extern const uc_general_category_t UC_CATEGORY_Sc;
130 extern const uc_general_category_t UC_CATEGORY_Sk;
131 extern const uc_general_category_t UC_CATEGORY_So;
132 extern const uc_general_category_t UC_CATEGORY_Z;
133 extern const uc_general_category_t UC_CATEGORY_Zs;
134 extern const uc_general_category_t UC_CATEGORY_Zl;
135 extern const uc_general_category_t UC_CATEGORY_Zp;
136 extern const uc_general_category_t UC_CATEGORY_C;
137 extern const uc_general_category_t UC_CATEGORY_Cc;
138 extern const uc_general_category_t UC_CATEGORY_Cf;
139 extern const uc_general_category_t UC_CATEGORY_Cs;
140 extern const uc_general_category_t UC_CATEGORY_Co;
141 extern const uc_general_category_t UC_CATEGORY_Cn;
142 /* Non-public. */
143 extern const uc_general_category_t _UC_CATEGORY_NONE;
145 /* Alias names for predefined General category values. */
146 #define UC_LETTER UC_CATEGORY_L
147 #define UC_CASED_LETTER UC_CATEGORY_LC
148 #define UC_UPPERCASE_LETTER UC_CATEGORY_Lu
149 #define UC_LOWERCASE_LETTER UC_CATEGORY_Ll
150 #define UC_TITLECASE_LETTER UC_CATEGORY_Lt
151 #define UC_MODIFIER_LETTER UC_CATEGORY_Lm
152 #define UC_OTHER_LETTER UC_CATEGORY_Lo
153 #define UC_MARK UC_CATEGORY_M
154 #define UC_NON_SPACING_MARK UC_CATEGORY_Mn
155 #define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc
156 #define UC_ENCLOSING_MARK UC_CATEGORY_Me
157 #define UC_NUMBER UC_CATEGORY_N
158 #define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd
159 #define UC_LETTER_NUMBER UC_CATEGORY_Nl
160 #define UC_OTHER_NUMBER UC_CATEGORY_No
161 #define UC_PUNCTUATION UC_CATEGORY_P
162 #define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc
163 #define UC_DASH_PUNCTUATION UC_CATEGORY_Pd
164 #define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
165 #define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
166 #define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
167 #define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf
168 #define UC_OTHER_PUNCTUATION UC_CATEGORY_Po
169 #define UC_SYMBOL UC_CATEGORY_S
170 #define UC_MATH_SYMBOL UC_CATEGORY_Sm
171 #define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc
172 #define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk
173 #define UC_OTHER_SYMBOL UC_CATEGORY_So
174 #define UC_SEPARATOR UC_CATEGORY_Z
175 #define UC_SPACE_SEPARATOR UC_CATEGORY_Zs
176 #define UC_LINE_SEPARATOR UC_CATEGORY_Zl
177 #define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp
178 #define UC_OTHER UC_CATEGORY_C
179 #define UC_CONTROL UC_CATEGORY_Cc
180 #define UC_FORMAT UC_CATEGORY_Cf
181 #define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */
182 #define UC_PRIVATE_USE UC_CATEGORY_Co
183 #define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */
185 /* Return the union of two general categories.
186 This corresponds to the unions of the two sets of characters. */
187 extern uc_general_category_t
188 uc_general_category_or (uc_general_category_t category1,
189 uc_general_category_t category2);
191 /* Return the intersection of two general categories as bit masks.
192 This *does*not* correspond to the intersection of the two sets of
193 characters. */
194 extern uc_general_category_t
195 uc_general_category_and (uc_general_category_t category1,
196 uc_general_category_t category2);
198 /* Return the intersection of a general category with the complement of a
199 second general category, as bit masks.
200 This *does*not* correspond to the intersection with complement, when
201 viewing the categories as sets of characters. */
202 extern uc_general_category_t
203 uc_general_category_and_not (uc_general_category_t category1,
204 uc_general_category_t category2);
206 /* Return the name of a general category. */
207 extern const char *
208 uc_general_category_name (uc_general_category_t category)
209 _UC_ATTRIBUTE_PURE;
211 /* Return the long name of a general category. */
212 extern const char *
213 uc_general_category_long_name (uc_general_category_t category)
214 _UC_ATTRIBUTE_PURE;
216 /* Return the general category given by name, e.g. "Lu", or by long name,
217 e.g. "Uppercase Letter". */
218 extern uc_general_category_t
219 uc_general_category_byname (const char *category_name)
220 _UC_ATTRIBUTE_PURE;
222 /* Return the general category of a Unicode character. */
223 extern uc_general_category_t
224 uc_general_category (ucs4_t uc)
225 _UC_ATTRIBUTE_PURE;
227 /* Test whether a Unicode character belongs to a given category.
228 The CATEGORY argument can be the combination of several predefined
229 general categories. */
230 extern bool
231 uc_is_general_category (ucs4_t uc, uc_general_category_t category)
232 _UC_ATTRIBUTE_PURE;
233 /* Likewise. This function uses a big table comprising all categories. */
234 extern bool
235 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask)
236 _UC_ATTRIBUTE_CONST;
238 /* ========================================================================= */
240 /* Field 3 of Unicode Character Database: Canonical combining class. */
242 /* The possible results of uc_combining_class (0..255) are described in
243 UCD.html. The list here is not definitive; more values can be added
244 in future versions. */
245 enum
247 UC_CCC_NR = 0, /* Not Reordered */
248 UC_CCC_OV = 1, /* Overlay */
249 UC_CCC_NK = 7, /* Nukta */
250 UC_CCC_KV = 8, /* Kana Voicing */
251 UC_CCC_VR = 9, /* Virama */
252 UC_CCC_ATBL = 200, /* Attached Below Left */
253 UC_CCC_ATB = 202, /* Attached Below */
254 UC_CCC_ATA = 214, /* Attached Above */
255 UC_CCC_ATAR = 216, /* Attached Above Right */
256 UC_CCC_BL = 218, /* Below Left */
257 UC_CCC_B = 220, /* Below */
258 UC_CCC_BR = 222, /* Below Right */
259 UC_CCC_L = 224, /* Left */
260 UC_CCC_R = 226, /* Right */
261 UC_CCC_AL = 228, /* Above Left */
262 UC_CCC_A = 230, /* Above */
263 UC_CCC_AR = 232, /* Above Right */
264 UC_CCC_DB = 233, /* Double Below */
265 UC_CCC_DA = 234, /* Double Above */
266 UC_CCC_IS = 240 /* Iota Subscript */
269 /* Return the canonical combining class of a Unicode character. */
270 extern int
271 uc_combining_class (ucs4_t uc)
272 _UC_ATTRIBUTE_CONST;
274 /* Return the name of a canonical combining class. */
275 extern const char *
276 uc_combining_class_name (int ccc)
277 _UC_ATTRIBUTE_CONST;
279 /* Return the long name of a canonical combining class. */
280 extern const char *
281 uc_combining_class_long_name (int ccc)
282 _UC_ATTRIBUTE_CONST;
284 /* Return the canonical combining class given by name, e.g. "BL", or by long
285 name, e.g. "Below Left". */
286 extern int
287 uc_combining_class_byname (const char *ccc_name)
288 _UC_ATTRIBUTE_PURE;
290 /* ========================================================================= */
292 /* Field 4 of Unicode Character Database: Bidi class.
293 Before Unicode 4.0, this field was called "Bidirectional category". */
295 enum
297 UC_BIDI_L, /* Left-to-Right */
298 UC_BIDI_LRE, /* Left-to-Right Embedding */
299 UC_BIDI_LRO, /* Left-to-Right Override */
300 UC_BIDI_R, /* Right-to-Left */
301 UC_BIDI_AL, /* Right-to-Left Arabic */
302 UC_BIDI_RLE, /* Right-to-Left Embedding */
303 UC_BIDI_RLO, /* Right-to-Left Override */
304 UC_BIDI_PDF, /* Pop Directional Format */
305 UC_BIDI_EN, /* European Number */
306 UC_BIDI_ES, /* European Number Separator */
307 UC_BIDI_ET, /* European Number Terminator */
308 UC_BIDI_AN, /* Arabic Number */
309 UC_BIDI_CS, /* Common Number Separator */
310 UC_BIDI_NSM, /* Non-Spacing Mark */
311 UC_BIDI_BN, /* Boundary Neutral */
312 UC_BIDI_B, /* Paragraph Separator */
313 UC_BIDI_S, /* Segment Separator */
314 UC_BIDI_WS, /* Whitespace */
315 UC_BIDI_ON, /* Other Neutral */
316 UC_BIDI_LRI, /* Left-to-Right Isolate */
317 UC_BIDI_RLI, /* Right-to-Left Isolate */
318 UC_BIDI_FSI, /* First Strong Isolate */
319 UC_BIDI_PDI /* Pop Directional Isolate */
322 /* Return the name of a bidi class. */
323 extern const char *
324 uc_bidi_class_name (int bidi_class)
325 _UC_ATTRIBUTE_CONST;
326 /* Same; obsolete function name. */
327 extern const char *
328 uc_bidi_category_name (int category)
329 _UC_ATTRIBUTE_CONST;
331 /* Return the long name of a bidi class. */
332 extern const char *
333 uc_bidi_class_long_name (int bidi_class)
334 _UC_ATTRIBUTE_CONST;
336 /* Return the bidi class given by name, e.g. "LRE", or by long name, e.g.
337 "Left-to-Right Embedding". */
338 extern int
339 uc_bidi_class_byname (const char *bidi_class_name)
340 _UC_ATTRIBUTE_PURE;
341 /* Same; obsolete function name. */
342 extern int
343 uc_bidi_category_byname (const char *category_name)
344 _UC_ATTRIBUTE_PURE;
346 /* Return the bidi class of a Unicode character. */
347 extern int
348 uc_bidi_class (ucs4_t uc)
349 _UC_ATTRIBUTE_CONST;
350 /* Same; obsolete function name. */
351 extern int
352 uc_bidi_category (ucs4_t uc)
353 _UC_ATTRIBUTE_CONST;
355 /* Test whether a Unicode character belongs to a given bidi class. */
356 extern bool
357 uc_is_bidi_class (ucs4_t uc, int bidi_class)
358 _UC_ATTRIBUTE_CONST;
359 /* Same; obsolete function name. */
360 extern bool
361 uc_is_bidi_category (ucs4_t uc, int category)
362 _UC_ATTRIBUTE_CONST;
364 /* ========================================================================= */
366 /* Field 5 of Unicode Character Database: Character decomposition mapping.
367 See "uninorm.h". */
369 /* ========================================================================= */
371 /* Field 6 of Unicode Character Database: Decimal digit value. */
373 /* Return the decimal digit value of a Unicode character. */
374 extern int
375 uc_decimal_value (ucs4_t uc)
376 _UC_ATTRIBUTE_CONST;
378 /* ========================================================================= */
380 /* Field 7 of Unicode Character Database: Digit value. */
382 /* Return the digit value of a Unicode character. */
383 extern int
384 uc_digit_value (ucs4_t uc)
385 _UC_ATTRIBUTE_CONST;
387 /* ========================================================================= */
389 /* Field 8 of Unicode Character Database: Numeric value. */
391 /* Data type holding a numeric value as a fraction numerator/denominator. */
392 typedef struct
394 int numerator;
395 int denominator;
397 uc_fraction_t;
/* Return the numeric value of a Unicode character.
   NOTE(review): the value returned for characters without a numeric value
   is not visible from this declaration - confirm against the implementation. */
398 extern uc_fraction_t
399 uc_numeric_value (ucs4_t uc)
400 _UC_ATTRIBUTE_CONST;
402 /* ========================================================================= */
404 /* Field 9 of Unicode Character Database: Mirrored. */
406 /* Return the mirrored character of a Unicode character UC in *PUC.
   NOTE(review): the bool result presumably reports whether a mirrored
   counterpart exists; what is stored in *PUC otherwise is not visible from
   this declaration - confirm against the implementation. */
407 extern bool
408 uc_mirror_char (ucs4_t uc, ucs4_t *puc);
410 /* ========================================================================= */
412 /* Field 10 of Unicode Character Database: Unicode 1.0 Name.
413 Not available in this library. */
415 /* ========================================================================= */
417 /* Field 11 of Unicode Character Database: ISO 10646 comment.
418 Not available in this library. */
420 /* ========================================================================= */
422 /* Fields 12, 13, 14 of Unicode Character Database: Uppercase mapping,
423 lowercase mapping, titlecase mapping. See "unicase.h". */
425 /* ========================================================================= */
427 /* Field 2 of the file ArabicShaping.txt in the Unicode Character Database. */
429 /* Possible joining types. */
430 enum
432 UC_JOINING_TYPE_U, /* Non_Joining */
433 UC_JOINING_TYPE_T, /* Transparent */
434 UC_JOINING_TYPE_C, /* Join_Causing */
435 UC_JOINING_TYPE_L, /* Left_Joining */
436 UC_JOINING_TYPE_R, /* Right_Joining */
437 UC_JOINING_TYPE_D /* Dual_Joining */
440 /* Return the name of a joining type. */
441 extern const char *
442 uc_joining_type_name (int joining_type)
443 _UC_ATTRIBUTE_CONST;
445 /* Return the long name of a joining type. */
446 extern const char *
447 uc_joining_type_long_name (int joining_type)
448 _UC_ATTRIBUTE_CONST;
450 /* Return the joining type given by name, e.g. "D", or by long name, e.g.
451 "Dual Joining". */
452 extern int
453 uc_joining_type_byname (const char *joining_type_name)
454 _UC_ATTRIBUTE_PURE;
456 /* Return the joining type of a Unicode character. */
457 extern int
458 uc_joining_type (ucs4_t uc)
459 _UC_ATTRIBUTE_CONST;
461 /* ========================================================================= */
463 /* Field 3 of the file ArabicShaping.txt in the Unicode Character Database. */
465 /* Possible joining groups.
466 This enumeration may be extended in the future. */
467 enum
469 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
470 UC_JOINING_GROUP_AIN, /* Ain */
471 UC_JOINING_GROUP_ALAPH, /* Alaph */
472 UC_JOINING_GROUP_ALEF, /* Alef */
473 UC_JOINING_GROUP_BEH, /* Beh */
474 UC_JOINING_GROUP_BETH, /* Beth */
475 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
476 UC_JOINING_GROUP_DAL, /* Dal */
477 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
478 UC_JOINING_GROUP_E, /* E */
479 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
480 UC_JOINING_GROUP_FE, /* Fe */
481 UC_JOINING_GROUP_FEH, /* Feh */
482 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
483 UC_JOINING_GROUP_GAF, /* Gaf */
484 UC_JOINING_GROUP_GAMAL, /* Gamal */
485 UC_JOINING_GROUP_HAH, /* Hah */
486 UC_JOINING_GROUP_HE, /* He */
487 UC_JOINING_GROUP_HEH, /* Heh */
488 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
489 UC_JOINING_GROUP_HETH, /* Heth */
490 UC_JOINING_GROUP_KAF, /* Kaf */
491 UC_JOINING_GROUP_KAPH, /* Kaph */
492 UC_JOINING_GROUP_KHAPH, /* Khaph */
493 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
494 UC_JOINING_GROUP_LAM, /* Lam */
495 UC_JOINING_GROUP_LAMADH, /* Lamadh */
496 UC_JOINING_GROUP_MEEM, /* Meem */
497 UC_JOINING_GROUP_MIM, /* Mim */
498 UC_JOINING_GROUP_NOON, /* Noon */
499 UC_JOINING_GROUP_NUN, /* Nun */
500 UC_JOINING_GROUP_NYA, /* Nya */
501 UC_JOINING_GROUP_PE, /* Pe */
502 UC_JOINING_GROUP_QAF, /* Qaf */
503 UC_JOINING_GROUP_QAPH, /* Qaph */
504 UC_JOINING_GROUP_REH, /* Reh */
505 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
506 UC_JOINING_GROUP_SAD, /* Sad */
507 UC_JOINING_GROUP_SADHE, /* Sadhe */
508 UC_JOINING_GROUP_SEEN, /* Seen */
509 UC_JOINING_GROUP_SEMKATH, /* Semkath */
510 UC_JOINING_GROUP_SHIN, /* Shin */
511 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
512 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
513 UC_JOINING_GROUP_TAH, /* Tah */
514 UC_JOINING_GROUP_TAW, /* Taw */
515 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
516 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
517 UC_JOINING_GROUP_TETH, /* Teth */
518 UC_JOINING_GROUP_WAW, /* Waw */
519 UC_JOINING_GROUP_YEH, /* Yeh */
520 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
521 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
522 UC_JOINING_GROUP_YUDH, /* Yudh */
523 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
524 UC_JOINING_GROUP_ZAIN, /* Zain */
525 UC_JOINING_GROUP_ZHAIN, /* Zhain */
526 UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
527 UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
528 UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
529 UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
530 UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
531 UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
532 UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
533 UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
534 UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
535 UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
536 UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
537 UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
538 UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
539 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
540 UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
541 UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
542 UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
543 UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
544 UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
545 UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
546 UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
547 UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
548 UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
549 UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
550 UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
551 UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
552 UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
553 UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
554 UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
555 UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
556 UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
557 UC_JOINING_GROUP_AFRICAN_NOON /* African_Noon */
560 /* Return the name of a joining group. */
561 extern const char *
562 uc_joining_group_name (int joining_group)
563 _UC_ATTRIBUTE_CONST;
565 /* Return the joining group given by name, e.g. "Teh_Marbuta". */
566 extern int
567 uc_joining_group_byname (const char *joining_group_name)
568 _UC_ATTRIBUTE_PURE;
570 /* Return the joining group of a Unicode character. */
571 extern int
572 uc_joining_group (ucs4_t uc)
573 _UC_ATTRIBUTE_CONST;
575 /* ========================================================================= */
577 /* Common API for properties. */
579 /* Data type denoting a property. This is not just a number, but rather a
580 pointer to the test function, so that programs that use only a few of the
581 properties don't have a link-time dependency towards all the tables. */
582 typedef struct
584 bool (*test_fn) (ucs4_t uc); /* predicate for this property; NULL in an invalid property (see uc_property_is_valid) */
586 uc_property_t;
588 /* Predefined properties. */
589 /* General. */
590 extern const uc_property_t UC_PROPERTY_WHITE_SPACE;
591 extern const uc_property_t UC_PROPERTY_ALPHABETIC;
592 extern const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
593 extern const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
594 extern const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
595 extern const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
596 extern const uc_property_t UC_PROPERTY_DEPRECATED;
597 extern const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
598 extern const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
599 extern const uc_property_t UC_PROPERTY_PRIVATE_USE;
600 extern const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
601 /* Case. */
602 extern const uc_property_t UC_PROPERTY_UPPERCASE;
603 extern const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
604 extern const uc_property_t UC_PROPERTY_LOWERCASE;
605 extern const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
606 extern const uc_property_t UC_PROPERTY_TITLECASE;
607 extern const uc_property_t UC_PROPERTY_CASED;
608 extern const uc_property_t UC_PROPERTY_CASE_IGNORABLE;
609 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_LOWERCASED;
610 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_UPPERCASED;
611 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_TITLECASED;
612 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEFOLDED;
613 extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEMAPPED;
614 extern const uc_property_t UC_PROPERTY_SOFT_DOTTED;
615 /* Identifiers. */
616 extern const uc_property_t UC_PROPERTY_ID_START;
617 extern const uc_property_t UC_PROPERTY_OTHER_ID_START;
618 extern const uc_property_t UC_PROPERTY_ID_CONTINUE;
619 extern const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
620 extern const uc_property_t UC_PROPERTY_XID_START;
621 extern const uc_property_t UC_PROPERTY_XID_CONTINUE;
622 extern const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
623 extern const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
624 /* Shaping and rendering. */
625 extern const uc_property_t UC_PROPERTY_JOIN_CONTROL;
626 extern const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
627 extern const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
628 extern const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
629 extern const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
630 /* Bidi. */
631 extern const uc_property_t UC_PROPERTY_BIDI_CONTROL;
632 extern const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
633 extern const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
634 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
635 extern const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
636 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
637 extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
638 extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
639 extern const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
640 extern const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
641 extern const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
642 extern const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
643 extern const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
644 extern const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
645 extern const uc_property_t UC_PROPERTY_BIDI_PDF;
646 extern const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
647 extern const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
648 /* Numeric. */
649 extern const uc_property_t UC_PROPERTY_HEX_DIGIT;
650 extern const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
651 /* CJK. */
652 extern const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
653 extern const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
654 extern const uc_property_t UC_PROPERTY_RADICAL;
655 extern const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
656 extern const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
657 /* Misc. */
658 extern const uc_property_t UC_PROPERTY_ZERO_WIDTH;
659 extern const uc_property_t UC_PROPERTY_SPACE;
660 extern const uc_property_t UC_PROPERTY_NON_BREAK;
661 extern const uc_property_t UC_PROPERTY_ISO_CONTROL;
662 extern const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
663 extern const uc_property_t UC_PROPERTY_DASH;
664 extern const uc_property_t UC_PROPERTY_HYPHEN;
665 extern const uc_property_t UC_PROPERTY_PUNCTUATION;
666 extern const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
667 extern const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
668 extern const uc_property_t UC_PROPERTY_QUOTATION_MARK;
669 extern const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
670 extern const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
671 extern const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
672 extern const uc_property_t UC_PROPERTY_MATH;
673 extern const uc_property_t UC_PROPERTY_OTHER_MATH;
674 extern const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
675 extern const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
676 extern const uc_property_t UC_PROPERTY_COMBINING;
677 extern const uc_property_t UC_PROPERTY_COMPOSITE;
678 extern const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
679 extern const uc_property_t UC_PROPERTY_NUMERIC;
680 extern const uc_property_t UC_PROPERTY_DIACRITIC;
681 extern const uc_property_t UC_PROPERTY_EXTENDER;
682 extern const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
684 /* Return the property given by name, e.g. "White space".
   NOTE(review): the result for an unknown name is not shown here; presumably
   it is a property for which uc_property_is_valid yields false - confirm. */
685 extern uc_property_t
686 uc_property_byname (const char *property_name);
688 /* Test whether a property is valid, i.e. whether its test_fn member is
   non-NULL. PROPERTY is evaluated exactly once. */
689 #define uc_property_is_valid(property) ((property).test_fn != NULL)
691 /* Test whether a Unicode character has a given property. */
692 extern bool
693 uc_is_property (ucs4_t uc, uc_property_t property);
694 extern bool uc_is_property_white_space (ucs4_t uc)
695 _UC_ATTRIBUTE_CONST;
696 extern bool uc_is_property_alphabetic (ucs4_t uc)
697 _UC_ATTRIBUTE_CONST;
698 extern bool uc_is_property_other_alphabetic (ucs4_t uc)
699 _UC_ATTRIBUTE_CONST;
700 extern bool uc_is_property_not_a_character (ucs4_t uc)
701 _UC_ATTRIBUTE_CONST;
702 extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc)
703 _UC_ATTRIBUTE_CONST;
704 extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc)
705 _UC_ATTRIBUTE_CONST;
706 extern bool uc_is_property_deprecated (ucs4_t uc)
707 _UC_ATTRIBUTE_CONST;
708 extern bool uc_is_property_logical_order_exception (ucs4_t uc)
709 _UC_ATTRIBUTE_CONST;
710 extern bool uc_is_property_variation_selector (ucs4_t uc)
711 _UC_ATTRIBUTE_CONST;
712 extern bool uc_is_property_private_use (ucs4_t uc)
713 _UC_ATTRIBUTE_CONST;
714 extern bool uc_is_property_unassigned_code_value (ucs4_t uc)
715 _UC_ATTRIBUTE_CONST;
716 extern bool uc_is_property_uppercase (ucs4_t uc)
717 _UC_ATTRIBUTE_CONST;
718 extern bool uc_is_property_other_uppercase (ucs4_t uc)
719 _UC_ATTRIBUTE_CONST;
720 extern bool uc_is_property_lowercase (ucs4_t uc)
721 _UC_ATTRIBUTE_CONST;
722 extern bool uc_is_property_other_lowercase (ucs4_t uc)
723 _UC_ATTRIBUTE_CONST;
724 extern bool uc_is_property_titlecase (ucs4_t uc)
725 _UC_ATTRIBUTE_CONST;
726 extern bool uc_is_property_cased (ucs4_t uc)
727 _UC_ATTRIBUTE_CONST;
728 extern bool uc_is_property_case_ignorable (ucs4_t uc)
729 _UC_ATTRIBUTE_CONST;
730 extern bool uc_is_property_changes_when_lowercased (ucs4_t uc)
731 _UC_ATTRIBUTE_CONST;
732 extern bool uc_is_property_changes_when_uppercased (ucs4_t uc)
733 _UC_ATTRIBUTE_CONST;
/* Boolean predicates, one per Unicode property named in the identifier.
   Every declaration in this run has the same shape: it takes a single
   character (ucs4_t uc), returns bool, and carries _UC_ATTRIBUTE_CONST.
   NOTE(review): presumably each returns true exactly when UC has the
   property its name spells out, and _UC_ATTRIBUTE_CONST presumably expands
   to the compiler's "const" function attribute; neither definition is
   visible in this part of the file - confirm. */
734 extern bool uc_is_property_changes_when_titlecased (ucs4_t uc)
735 _UC_ATTRIBUTE_CONST;
736 extern bool uc_is_property_changes_when_casefolded (ucs4_t uc)
737 _UC_ATTRIBUTE_CONST;
738 extern bool uc_is_property_changes_when_casemapped (ucs4_t uc)
739 _UC_ATTRIBUTE_CONST;
740 extern bool uc_is_property_soft_dotted (ucs4_t uc)
741 _UC_ATTRIBUTE_CONST;
742 extern bool uc_is_property_id_start (ucs4_t uc)
743 _UC_ATTRIBUTE_CONST;
744 extern bool uc_is_property_other_id_start (ucs4_t uc)
745 _UC_ATTRIBUTE_CONST;
746 extern bool uc_is_property_id_continue (ucs4_t uc)
747 _UC_ATTRIBUTE_CONST;
748 extern bool uc_is_property_other_id_continue (ucs4_t uc)
749 _UC_ATTRIBUTE_CONST;
750 extern bool uc_is_property_xid_start (ucs4_t uc)
751 _UC_ATTRIBUTE_CONST;
752 extern bool uc_is_property_xid_continue (ucs4_t uc)
753 _UC_ATTRIBUTE_CONST;
754 extern bool uc_is_property_pattern_white_space (ucs4_t uc)
755 _UC_ATTRIBUTE_CONST;
756 extern bool uc_is_property_pattern_syntax (ucs4_t uc)
757 _UC_ATTRIBUTE_CONST;
758 extern bool uc_is_property_join_control (ucs4_t uc)
759 _UC_ATTRIBUTE_CONST;
760 extern bool uc_is_property_grapheme_base (ucs4_t uc)
761 _UC_ATTRIBUTE_CONST;
762 extern bool uc_is_property_grapheme_extend (ucs4_t uc)
763 _UC_ATTRIBUTE_CONST;
764 extern bool uc_is_property_other_grapheme_extend (ucs4_t uc)
765 _UC_ATTRIBUTE_CONST;
766 extern bool uc_is_property_grapheme_link (ucs4_t uc)
767 _UC_ATTRIBUTE_CONST;
768 extern bool uc_is_property_bidi_control (ucs4_t uc)
769 _UC_ATTRIBUTE_CONST;
770 extern bool uc_is_property_bidi_left_to_right (ucs4_t uc)
771 _UC_ATTRIBUTE_CONST;
772 extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc)
773 _UC_ATTRIBUTE_CONST;
774 extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc)
775 _UC_ATTRIBUTE_CONST;
776 extern bool uc_is_property_bidi_european_digit (ucs4_t uc)
777 _UC_ATTRIBUTE_CONST;
778 extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc)
779 _UC_ATTRIBUTE_CONST;
780 extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc)
781 _UC_ATTRIBUTE_CONST;
782 extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc)
783 _UC_ATTRIBUTE_CONST;
784 extern bool uc_is_property_bidi_common_separator (ucs4_t uc)
785 _UC_ATTRIBUTE_CONST;
786 extern bool uc_is_property_bidi_block_separator (ucs4_t uc)
787 _UC_ATTRIBUTE_CONST;
788 extern bool uc_is_property_bidi_segment_separator (ucs4_t uc)
789 _UC_ATTRIBUTE_CONST;
790 extern bool uc_is_property_bidi_whitespace (ucs4_t uc)
791 _UC_ATTRIBUTE_CONST;
792 extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc)
793 _UC_ATTRIBUTE_CONST;
794 extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc)
795 _UC_ATTRIBUTE_CONST;
796 extern bool uc_is_property_bidi_pdf (ucs4_t uc)
797 _UC_ATTRIBUTE_CONST;
798 extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc)
799 _UC_ATTRIBUTE_CONST;
800 extern bool uc_is_property_bidi_other_neutral (ucs4_t uc)
801 _UC_ATTRIBUTE_CONST;
802 extern bool uc_is_property_hex_digit (ucs4_t uc)
803 _UC_ATTRIBUTE_CONST;
804 extern bool uc_is_property_ascii_hex_digit (ucs4_t uc)
805 _UC_ATTRIBUTE_CONST;
806 extern bool uc_is_property_ideographic (ucs4_t uc)
807 _UC_ATTRIBUTE_CONST;
808 extern bool uc_is_property_unified_ideograph (ucs4_t uc)
809 _UC_ATTRIBUTE_CONST;
810 extern bool uc_is_property_radical (ucs4_t uc)
811 _UC_ATTRIBUTE_CONST;
812 extern bool uc_is_property_ids_binary_operator (ucs4_t uc)
813 _UC_ATTRIBUTE_CONST;
814 extern bool uc_is_property_ids_trinary_operator (ucs4_t uc)
815 _UC_ATTRIBUTE_CONST;
816 extern bool uc_is_property_zero_width (ucs4_t uc)
817 _UC_ATTRIBUTE_CONST;
818 extern bool uc_is_property_space (ucs4_t uc)
819 _UC_ATTRIBUTE_CONST;
820 extern bool uc_is_property_non_break (ucs4_t uc)
821 _UC_ATTRIBUTE_CONST;
822 extern bool uc_is_property_iso_control (ucs4_t uc)
823 _UC_ATTRIBUTE_CONST;
824 extern bool uc_is_property_format_control (ucs4_t uc)
825 _UC_ATTRIBUTE_CONST;
826 extern bool uc_is_property_dash (ucs4_t uc)
827 _UC_ATTRIBUTE_CONST;
828 extern bool uc_is_property_hyphen (ucs4_t uc)
829 _UC_ATTRIBUTE_CONST;
830 extern bool uc_is_property_punctuation (ucs4_t uc)
831 _UC_ATTRIBUTE_CONST;
832 extern bool uc_is_property_line_separator (ucs4_t uc)
833 _UC_ATTRIBUTE_CONST;
834 extern bool uc_is_property_paragraph_separator (ucs4_t uc)
835 _UC_ATTRIBUTE_CONST;
836 extern bool uc_is_property_quotation_mark (ucs4_t uc)
837 _UC_ATTRIBUTE_CONST;
838 extern bool uc_is_property_sentence_terminal (ucs4_t uc)
839 _UC_ATTRIBUTE_CONST;
840 extern bool uc_is_property_terminal_punctuation (ucs4_t uc)
841 _UC_ATTRIBUTE_CONST;
842 extern bool uc_is_property_currency_symbol (ucs4_t uc)
843 _UC_ATTRIBUTE_CONST;
844 extern bool uc_is_property_math (ucs4_t uc)
845 _UC_ATTRIBUTE_CONST;
846 extern bool uc_is_property_other_math (ucs4_t uc)
847 _UC_ATTRIBUTE_CONST;
848 extern bool uc_is_property_paired_punctuation (ucs4_t uc)
849 _UC_ATTRIBUTE_CONST;
850 extern bool uc_is_property_left_of_pair (ucs4_t uc)
851 _UC_ATTRIBUTE_CONST;
852 extern bool uc_is_property_combining (ucs4_t uc)
853 _UC_ATTRIBUTE_CONST;
854 extern bool uc_is_property_composite (ucs4_t uc)
855 _UC_ATTRIBUTE_CONST;
856 extern bool uc_is_property_decimal_digit (ucs4_t uc)
857 _UC_ATTRIBUTE_CONST;
858 extern bool uc_is_property_numeric (ucs4_t uc)
859 _UC_ATTRIBUTE_CONST;
860 extern bool uc_is_property_diacritic (ucs4_t uc)
861 _UC_ATTRIBUTE_CONST;
862 extern bool uc_is_property_extender (ucs4_t uc)
863 _UC_ATTRIBUTE_CONST;
864 extern bool uc_is_property_ignorable_control (ucs4_t uc)
865 _UC_ATTRIBUTE_CONST;
867 /* ========================================================================= */
869 /* Subdivision of the Unicode characters into scripts. */
/* One entry of a script's interval table.
   'code' is a 21-bit field, wide enough for every code point up to 0x10FFFF.
   NOTE(review): 'start' and 'end' presumably flag whether 'code' opens
   and/or closes an interval - confirm against the code that builds and
   searches these tables. */
typedef struct
{
  unsigned int code : 21;
  unsigned int start : 1;
  unsigned int end : 1;
}
uc_interval_t;

/* Descriptor of one script: 'intervals' points to a read-only table of
   'nintervals' entries, and 'name' is the script's name as a C string
   (e.g. "HAN"). */
typedef struct
{
  unsigned int nintervals;
  const uc_interval_t *intervals;
  const char *name;
}
uc_script_t;
886 /* Return the script of a Unicode character. */
/* The result points to a const uc_script_t, so callers must not modify it.
   NOTE(review): what is returned for a character that belongs to no script
   (possibly NULL) is not visible from this declaration - confirm before
   dereferencing the result. */
887 extern const uc_script_t *
888 uc_script (ucs4_t uc)
889 _UC_ATTRIBUTE_CONST;
891 /* Return the script given by name, e.g. "HAN". */
/* SCRIPT_NAME is the name to look up, passed as a C string.
   NOTE(review): the result for a name that matches no script (possibly NULL)
   is not visible from this declaration - confirm.
   Declared _UC_ATTRIBUTE_PURE rather than _UC_ATTRIBUTE_CONST, presumably
   because the function has to read memory through its pointer argument. */
892 extern const uc_script_t *
893 uc_script_byname (const char *script_name)
894 _UC_ATTRIBUTE_PURE;
896 /* Test whether a Unicode character belongs to a given script. */
/* UC is the character to test; SCRIPT is the script descriptor to test it
   against and is only read (pointer to const).
   NOTE(review): whether SCRIPT may be NULL is not visible here - confirm. */
897 extern bool
898 uc_is_script (ucs4_t uc, const uc_script_t *script)
899 _UC_ATTRIBUTE_PURE;
901 /* Get the list of all scripts. */
/* The function returns void; results are delivered through the two pointer
   parameters.
   NOTE(review): presumably *SCRIPTS receives a pointer to the first element
   of a read-only array and *COUNT the number of elements in it - confirm,
   along with who owns the array. */
902 extern void
903 uc_all_scripts (const uc_script_t **scripts, size_t *count);
905 /* ========================================================================= */
907 /* Subdivision of the Unicode character range into blocks. */
909 typedef struct
911 ucs4_t start;
912 ucs4_t end;
913 const char *name;
915 uc_block_t;
917 /* Return the block a character belongs to. */
/* The result points to a const uc_block_t, so callers must not modify it.
   NOTE(review): what is returned for a character that lies in no block
   (possibly NULL) is not visible from this declaration - confirm before
   dereferencing the result. */
918 extern const uc_block_t *
919 uc_block (ucs4_t uc)
920 _UC_ATTRIBUTE_CONST;
922 /* Test whether a Unicode character belongs to a given block. */
/* UC is the character to test; BLOCK is the block descriptor to test it
   against and is only read (pointer to const).
   NOTE(review): whether BLOCK may be NULL is not visible here - confirm. */
923 extern bool
924 uc_is_block (ucs4_t uc, const uc_block_t *block)
925 _UC_ATTRIBUTE_PURE;
927 /* Get the list of all blocks. */
/* The function returns void; results are delivered through the two pointer
   parameters.
   NOTE(review): presumably *BLOCKS receives a pointer to the first element
   of a read-only array and *COUNT the number of elements in it - confirm,
   along with who owns the array. */
928 extern void
929 uc_all_blocks (const uc_block_t **blocks, size_t *count);
931 /* ========================================================================= */
933 /* Properties taken from language standards. */
935 /* Test whether a Unicode character is considered whitespace in ISO C 99. */
/* UC is the character to test; the answer is returned as a bool. */
936 extern bool
937 uc_is_c_whitespace (ucs4_t uc)
938 _UC_ATTRIBUTE_CONST;
940 /* Test whether a Unicode character is considered whitespace in Java. */
/* UC is the character to test; the answer is returned as a bool. */
941 extern bool
942 uc_is_java_whitespace (ucs4_t uc)
943 _UC_ATTRIBUTE_CONST;
/* Categories of a character with respect to an identifier syntax.
   The enum is anonymous and has no explicit initializers, so the constants
   are plain int values 0, 1, 2, 3 in declaration order.
   NOTE(review): presumably these are the values returned by
   uc_c_ident_category and uc_java_ident_category - confirm. */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
953 /* Return the categorization of a Unicode character w.r.t. the ISO C 99
954 identifier syntax. */
/* The result is a plain int.
   NOTE(review): presumably one of the UC_IDENTIFIER_* constants declared
   just before this function; UC_IDENTIFIER_IGNORABLE is commented there as
   Java-only, so it is presumably never returned here - confirm. */
955 extern int
956 uc_c_ident_category (ucs4_t uc)
957 _UC_ATTRIBUTE_CONST;
959 /* Return the categorization of a Unicode character w.r.t. the Java
960 identifier syntax. */
/* The result is a plain int.
   NOTE(review): presumably one of the UC_IDENTIFIER_* constants declared
   before uc_c_ident_category, including UC_IDENTIFIER_IGNORABLE, which is
   commented there as Java-only - confirm. */
961 extern int
962 uc_java_ident_category (ucs4_t uc)
963 _UC_ATTRIBUTE_CONST;
965 /* ========================================================================= */
967 /* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated,
968 because this set of functions was designed with ASCII in mind and cannot
969 reflect the more diverse reality of the Unicode character set. But they
970 can be a quick-and-dirty porting aid when migrating from wchar_t APIs
971 to Unicode strings. */
/* Every predicate in this run has the same shape: it takes a single
   character (ucs4_t uc), returns bool, and carries _UC_ATTRIBUTE_CONST.
   NOTE(review): the per-function comments speak of "locale-specific" sets,
   yet no function here takes a locale argument; presumably the sets are
   fixed - confirm against the implementations. */
973 /* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true. */
974 extern bool
975 uc_is_alnum (ucs4_t uc)
976 _UC_ATTRIBUTE_CONST;
978 /* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
979 or any character that is one of a locale-specific set of characters for
980 which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space'
981 is true. */
982 extern bool
983 uc_is_alpha (ucs4_t uc)
984 _UC_ATTRIBUTE_CONST;
986 /* Test for any control character. */
987 extern bool
988 uc_is_cntrl (ucs4_t uc)
989 _UC_ATTRIBUTE_CONST;
991 /* Test for any character that corresponds to a decimal-digit character. */
992 extern bool
993 uc_is_digit (ucs4_t uc)
994 _UC_ATTRIBUTE_CONST;
996 /* Test for any character for which 'uc_is_print' is true and 'uc_is_space'
997 is false. */
998 extern bool
999 uc_is_graph (ucs4_t uc)
1000 _UC_ATTRIBUTE_CONST;
1002 /* Test for any character that corresponds to a lowercase letter or is one
1003 of a locale-specific set of characters for which none of 'uc_is_cntrl',
1004 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
1005 extern bool
1006 uc_is_lower (ucs4_t uc)
1007 _UC_ATTRIBUTE_CONST;
1009 /* Test for any printing character. */
1010 extern bool
1011 uc_is_print (ucs4_t uc)
1012 _UC_ATTRIBUTE_CONST;
1014 /* Test for any printing character that is one of a locale-specific set of
1015 characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true. */
1016 extern bool
1017 uc_is_punct (ucs4_t uc)
1018 _UC_ATTRIBUTE_CONST;
1020 /* Test for any character that corresponds to a locale-specific set of
1021 characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct'
1022 is true. */
1023 extern bool
1024 uc_is_space (ucs4_t uc)
1025 _UC_ATTRIBUTE_CONST;
1027 /* Test for any character that corresponds to an uppercase letter or is one
1028 of a locale-specific set of characters for which none of 'uc_is_cntrl',
1029 'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true. */
1030 extern bool
1031 uc_is_upper (ucs4_t uc)
1032 _UC_ATTRIBUTE_CONST;
1034 /* Test for any character that corresponds to a hexadecimal-digit
1035 character. */
1036 extern bool
1037 uc_is_xdigit (ucs4_t uc)
1038 _UC_ATTRIBUTE_CONST;
1040 /* GNU extension. */
1041 /* Test for any character that corresponds to a standard blank character or
1042 a locale-specific set of characters for which 'uc_is_alnum' is false. */
1043 extern bool
1044 uc_is_blank (ucs4_t uc)
1045 _UC_ATTRIBUTE_CONST;
1047 /* ========================================================================= */
#ifdef __cplusplus
/* Close the extern "C" block that is opened near the top of this header. */
}
#endif
1053 #endif /* _UNICTYPE_H */