1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
28 #include "attribute.h"
/* Number of elements in the array A.  A must be an expression of array
   type; applied to a pointer the result is meaningless.  The argument is
   parenthesized in the expansion so that an arbitrary array-valued
   expression is indexed as a whole.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* Table of Unicode character names, derived from UnicodeData.txt.
   This table is generated in a way that minimizes the memory footprint:
     1. its compiled size is small (less than 350 KB),
     2. it resides entirely in the text or read-only data segment of the
        executable or shared library: the table contains only immediate
        integers, no pointers, and the functions don't do heap allocation.
 */
42 static const char unicode_name_words[36303] = ...;
43 #define UNICODE_CHARNAME_NUM_WORDS 6260
44 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
45 #define UNICODE_CHARNAME_WORD_HANGUL 3902
46 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
47 #define UNICODE_CHARNAME_WORD_CJK 417
48 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
49 static const uint16_t unicode_names[68940] = ...;
50 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
51 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
52 #define UNICODE_CHARNAME_MAX_LENGTH 83
53 #define UNICODE_CHARNAME_MAX_WORDS 13
54 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
57 /* Returns the word with a given index. */
59 unicode_name_word (unsigned int index
, unsigned int *lengthp
)
64 assert (index
< UNICODE_CHARNAME_NUM_WORDS
);
/* Binary search for i with
     unicode_name_by_length[i].ind_offset <= index
   and
     index < unicode_name_by_length[i+1].ind_offset
 */
73 i2
= SIZEOF (unicode_name_by_length
) - 1;
76 unsigned int i
= (i1
+ i2
) >> 1;
77 if (unicode_name_by_length
[i
].ind_offset
<= index
)
83 assert (unicode_name_by_length
[i
].ind_offset
<= index
84 && index
< unicode_name_by_length
[i
+1].ind_offset
);
86 return &unicode_name_words
[unicode_name_by_length
[i
].extra_offset
87 + (index
-unicode_name_by_length
[i
].ind_offset
)*i
];
90 /* Looks up the index of a word. */
92 unicode_name_word_lookup (const char *word
, size_t length
)
94 if (length
> 0 && length
< SIZEOF (unicode_name_by_length
) - 1)
96 /* Binary search among the words of given length. */
97 unsigned int extra_offset
= unicode_name_by_length
[length
].extra_offset
;
98 unsigned int i0
= unicode_name_by_length
[length
].ind_offset
;
100 unsigned int i2
= unicode_name_by_length
[length
+1].ind_offset
;
103 unsigned int i
= (i1
+ i2
) >> 1;
104 const char *p
= &unicode_name_words
[extra_offset
+ (i
-i0
)*length
];
105 const char *w
= word
;
106 unsigned int n
= length
;
113 /* Note here: i1 < i < i2. */
119 /* Note here: i1 <= i < i2. */
132 #define UNINAME_INVALID_INDEX UINT16_MAX
134 /* Looks up the internal index of a Unicode character. */
136 unicode_code_to_index (ucs4_t c
)
138 /* Binary search in unicode_ranges. */
140 unsigned int i2
= SIZEOF (unicode_ranges
);
144 unsigned int i
= (i1
+ i2
) >> 1;
146 unicode_ranges
[i
].index
+ unicode_ranges
[i
].gap
;
148 start_code
+ unicode_ranges
[i
].length
- 1;
150 if (start_code
<= c
&& c
<= end_code
)
151 return c
- unicode_ranges
[i
].gap
;
157 /* Note here: i1 < i < i2. */
160 else if (c
< start_code
)
164 /* Note here: i1 <= i < i2. */
168 return UNINAME_INVALID_INDEX
;
/* Looks up the codepoint of a Unicode character, from the given
   internal index.  */
174 unicode_index_to_code (uint16_t index
)
176 /* Binary search in unicode_ranges. */
178 unsigned int i2
= SIZEOF (unicode_ranges
);
182 unsigned int i
= (i1
+ i2
) >> 1;
183 uint16_t start_index
= unicode_ranges
[i
].index
;
184 uint16_t end_index
= start_index
+ unicode_ranges
[i
].length
- 1;
186 if (start_index
<= index
&& index
<= end_index
)
187 return index
+ unicode_ranges
[i
].gap
;
189 if (end_index
< index
)
193 /* Note here: i1 < i < i2. */
196 else if (index
< start_index
)
200 /* Note here: i1 <= i < i2. */
204 return UNINAME_INVALID
;
208 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
209 sections 3.11 and 4.4. */
210 static const char jamo_initial_short_name
[19][3] =
212 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
213 "C", "K", "T", "P", "H"
215 static const char jamo_medial_short_name
[21][4] =
217 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
218 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
220 static const char jamo_final_short_name
[28][3] =
222 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
223 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
226 /* Looks up the name of a Unicode character, in uppercase ASCII.
227 Returns the filled buf, or NULL if the character does not have a name. */
229 unicode_character_name (ucs4_t c
, char *buf
)
231 if (c
>= 0xAC00 && c
<= 0xD7A3)
233 /* Special case for Hangul syllables. Keeps the tables small. */
241 /* buf needs to have at least 16 + 7 bytes here. */
242 memcpy (buf
, "HANGUL SYLLABLE ", 16);
246 index3
= tmp
% 28; tmp
= tmp
/ 28;
247 index2
= tmp
% 21; tmp
= tmp
/ 21;
250 q
= jamo_initial_short_name
[index1
];
253 q
= jamo_medial_short_name
[index2
];
256 q
= jamo_final_short_name
[index3
];
262 else if ((c
>= 0xF900 && c
<= 0xFA2D) || (c
>= 0xFA30 && c
<= 0xFA6A)
263 || (c
>= 0xFA70 && c
<= 0xFAD9) || (c
>= 0x2F800 && c
<= 0x2FA1D))
/* Special case for CJK compatibility ideographs.  Keeps the tables
   small.  */
270 /* buf needs to have at least 28 + 5 bytes here. */
271 memcpy (buf
, "CJK COMPATIBILITY IDEOGRAPH-", 28);
274 for (i
= (c
< 0x10000 ? 12 : 16); i
>= 0; i
-= 4)
276 unsigned int x
= (c
>> i
) & 0xf;
277 *ptr
++ = (x
< 10 ? '0' : 'A' - 10) + x
;
282 else if ((c
>= 0xFE00 && c
<= 0xFE0F) || (c
>= 0xE0100 && c
<= 0xE01EF))
/* Special case for variation selectors.  Keeps the tables
   small.  */
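/* buf needs to have at least 19 + 3 + 1 bytes here: the 19-character
   prefix, up to three decimal digits (the largest value is 256), and
   the terminating NUL that sprintf appends.  */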
288 sprintf (buf
, "VARIATION SELECTOR-%d",
289 c
<= 0xFE0F ? c
- 0xFE00 + 1 : c
- 0xE0100 + 17);
294 uint16_t index
= unicode_code_to_index (c
);
295 const uint16_t *words
= NULL
;
297 if (index
!= UNINAME_INVALID_INDEX
)
/* Binary search in unicode_index_to_name.  */
301 unsigned int i2
= SIZEOF (unicode_index_to_name
);
304 unsigned int i
= (i1
+ i2
) >> 1;
305 if (unicode_index_to_name
[i
].index
== index
)
307 words
= &unicode_names
[unicode_index_to_name
[i
].name
];
310 else if (unicode_index_to_name
[i
].index
< index
)
317 /* Note here: i1 < i < i2. */
320 else if (unicode_index_to_name
[i
].index
> index
)
327 /* Note here: i1 <= i < i2. */
334 /* Found it in unicode_index_to_name. Now concatenate the words. */
335 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
339 unsigned int wordlen
;
340 const char *word
= unicode_name_word (*words
>>1, &wordlen
);
343 while (--wordlen
> 0);
344 if ((*words
& 1) == 0)
356 /* Looks up the Unicode character with a given name, in upper- or lowercase
357 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
359 unicode_name_character (const char *name
)
361 size_t len
= strlen (name
);
362 if (len
> 1 && len
<= UNICODE_CHARNAME_MAX_LENGTH
)
364 /* Test for "word1 word2 ..." syntax. */
365 char buf
[UNICODE_CHARNAME_MAX_LENGTH
];
370 if (!(c
>= ' ' && c
<= '~'))
372 *ptr
++ = (c
>= 'a' && c
<= 'z' ? c
- 'a' + 'A' : c
);
/* Special case for variation selector aliases.  Keeps the
   tables small.  */
382 const char *p1
= buf
;
383 if (ptr
>= buf
+ 3 && *p1
++ == 'V')
392 if (*p1
>= '0' && *p1
<= '9')
397 if (c
>= 1 && c
<= 16)
398 return c
- 1 + 0xFE00;
399 else if (c
>= 17 && c
<= 256)
400 return c
- 17 + 0xE0100;
411 /* Convert the constituents to uint16_t words. */
412 uint16_t words
[UNICODE_CHARNAME_MAX_WORDS
];
413 uint16_t *wordptr
= words
;
415 const char *p1
= buf
;
421 while (p2
< ptr
&& *p2
!= ' ')
423 word
= unicode_name_word_lookup (p1
, p2
- p1
);
426 if (wordptr
== &words
[UNICODE_CHARNAME_MAX_WORDS
])
433 /* Special case for Hangul syllables. Keeps the tables small. */
434 if (wordptr
== &words
[2]
435 && words
[0] == UNICODE_CHARNAME_WORD_HANGUL
436 && words
[1] == UNICODE_CHARNAME_WORD_SYLLABLE
)
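/* Split the last word [p1..ptr) into three parts: the initial, medial,
   and final jamo short names (to be matched against
   jamo_initial_short_name, jamo_medial_short_name and
   jamo_final_short_name).  */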
449 && (*p2
== 'B' || *p2
== 'C' || *p2
== 'D'
450 || *p2
== 'G' || *p2
== 'H' || *p2
== 'J'
451 || *p2
== 'K' || *p2
== 'M' || *p2
== 'N'
452 || *p2
== 'P' || *p2
== 'R' || *p2
== 'S'
457 && (*p3
== 'A' || *p3
== 'E' || *p3
== 'I'
458 || *p3
== 'O' || *p3
== 'U' || *p3
== 'W'
463 && (*p4
== 'B' || *p4
== 'C' || *p4
== 'D'
464 || *p4
== 'G' || *p4
== 'H' || *p4
== 'I'
465 || *p4
== 'J' || *p4
== 'K' || *p4
== 'L'
466 || *p4
== 'M' || *p4
== 'N' || *p4
== 'P'
467 || *p4
== 'S' || *p4
== 'T'))
475 if (n1
<= 2 && (n2
>= 1 && n2
<= 3) && n3
<= 2)
479 for (index1
= 0; index1
< 19; index1
++)
480 if (memcmp (jamo_initial_short_name
[index1
], p1
, n1
) == 0
481 && jamo_initial_short_name
[index1
][n1
] == '\0')
485 for (index2
= 0; index2
< 21; index2
++)
486 if (memcmp (jamo_medial_short_name
[index2
], p2
, n2
) == 0
487 && jamo_medial_short_name
[index2
][n2
] == '\0')
491 for (index3
= 0; index3
< 28; index3
++)
492 if (memcmp (jamo_final_short_name
[index3
], p3
, n3
) == 0
493 && jamo_final_short_name
[index3
][n3
] == '\0')
495 return 0xAC00 + (index1
* 21 + index2
) * 28 + index3
;
/* Special case for CJK compatibility ideographs.  Keeps the
   tables small.  */
506 if (wordptr
== &words
[2]
507 && words
[0] == UNICODE_CHARNAME_WORD_CJK
508 && words
[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
511 && memcmp (p1
, "IDEOGRAPH-", 10) == 0)
513 const char *p2
= p1
+ 10;
521 if (*p2
>= '0' && *p2
<= '9')
523 else if (*p2
>= 'A' && *p2
<= 'F')
524 c
+= (*p2
- 'A' + 10);
530 if ((c
>= 0xF900 && c
<= 0xFA2D)
531 || (c
>= 0xFA30 && c
<= 0xFA6A)
532 || (c
>= 0xFA70 && c
<= 0xFAD9)
533 || (c
>= 0x2F800 && c
<= 0x2FA1D))
/* Special case for variation selectors.  Keeps the
   tables small.  */
544 if (wordptr
== &words
[1]
545 && words
[0] == UNICODE_CHARNAME_WORD_VARIATION
548 && memcmp (p1
, "SELECTOR-", 9) == 0)
550 const char *p2
= p1
+ 9;
558 if (*p2
>= '0' && *p2
<= '9')
563 if (c
>= 1 && c
<= 16)
564 return c
- 1 + 0xFE00;
565 else if (c
>= 17 && c
<= 256)
566 return c
- 17 + 0xE0100;
579 /* Multiply by 2, to simplify later comparisons. */
580 size_t words_length
= wordptr
- words
;
582 size_t i
= words_length
- 1;
583 words
[i
] = 2 * words
[i
];
587 words
[i
] = 2 * words
[i
] + 1;
590 /* Binary search in unicode_name_to_index. */
593 unsigned int i2
= SIZEOF (unicode_name_to_index
);
596 unsigned int i
= (i1
+ i2
) >> 1;
597 const uint16_t *w
= words
;
598 const uint16_t *p
= &unicode_names
[unicode_name_to_index
[i
].name
];
599 size_t n
= words_length
;
606 /* Note here: i1 < i < i2. */
614 /* Note here: i1 <= i < i2. */
620 return unicode_index_to_code (unicode_name_to_index
[i
].index
);
629 return UNINAME_INVALID
;