1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
28 #include "attribute.h"
30 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
33 /* Table of Unicode character names, derived from UnicodeData.txt.
34 This table is generated in a way to minimize the memory footprint:
35 1. its compiled size is small (less than 350 KB),
36 2. it resides entirely in the text or read-only data segment of the
37 executable or shared library: the table contains only immediate
38 integers, no pointers, and the functions don't do heap allocation. */
/* The generated table contains the following definitions:
42 static const char unicode_name_words[36303] = ...;
43 #define UNICODE_CHARNAME_NUM_WORDS 6260
44 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
45 #define UNICODE_CHARNAME_WORD_HANGUL 3902
46 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
47 #define UNICODE_CHARNAME_WORD_CJK 417
48 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
49 static const uint16_t unicode_names[68940] = ...;
50 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
51 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
52 #define UNICODE_CHARNAME_MAX_LENGTH 83
53 #define UNICODE_CHARNAME_MAX_WORDS 13
54 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...; */
57 /* Returns the word with a given index. */
59 unicode_name_word (unsigned int index
, unsigned int *lengthp
)
65 assert (index
< UNICODE_CHARNAME_NUM_WORDS
);
67 /* Binary search for i with
68 unicode_name_by_length[i].ind_offset <= index and
70 index < unicode_name_by_length[i+1].ind_offset */
74 i2
= SIZEOF (unicode_name_by_length
) - 1;
77 unsigned int i
= (i1
+ i2
) >> 1;
78 if (unicode_name_by_length
[i
].ind_offset
<= index
)
84 assert (unicode_name_by_length
[i
].ind_offset
<= index
85 && index
< unicode_name_by_length
[i
+1].ind_offset
);
87 return &unicode_name_words
[unicode_name_by_length
[i
].extra_offset
88 + (index
-unicode_name_by_length
[i
].ind_offset
)*i
];
91 /* Looks up the index of a word. */
93 unicode_name_word_lookup (const char *word
, unsigned int length
)
95 if (length
> 0 && length
< SIZEOF (unicode_name_by_length
) - 1)
97 /* Binary search among the words of given length. */
98 unsigned int extra_offset
= unicode_name_by_length
[length
].extra_offset
;
99 unsigned int i0
= unicode_name_by_length
[length
].ind_offset
;
100 unsigned int i1
= i0
;
101 unsigned int i2
= unicode_name_by_length
[length
+1].ind_offset
;
104 unsigned int i
= (i1
+ i2
) >> 1;
105 const char *p
= &unicode_name_words
[extra_offset
+ (i
-i0
)*length
];
106 const char *w
= word
;
107 unsigned int n
= length
;
114 /* Note here: i1 < i < i2. */
120 /* Note here: i1 <= i < i2. */
133 #define UNINAME_INVALID_INDEX UINT16_MAX
135 /* Looks up the internal index of a Unicode character. */
137 unicode_code_to_index (ucs4_t c
)
139 /* Binary search in unicode_ranges. */
141 unsigned int i2
= SIZEOF (unicode_ranges
);
145 unsigned int i
= (i1
+ i2
) >> 1;
147 unicode_ranges
[i
].index
+ unicode_ranges
[i
].gap
;
149 start_code
+ unicode_ranges
[i
].length
- 1;
151 if (start_code
<= c
&& c
<= end_code
)
152 return c
- unicode_ranges
[i
].gap
;
158 /* Note here: i1 < i < i2. */
161 else if (c
< start_code
)
165 /* Note here: i1 <= i < i2. */
169 return UNINAME_INVALID_INDEX
;
172 /* Looks up the codepoint of a Unicode character, from the given internal index. */
175 unicode_index_to_code (uint16_t index
)
177 /* Binary search in unicode_ranges. */
179 unsigned int i2
= SIZEOF (unicode_ranges
);
183 unsigned int i
= (i1
+ i2
) >> 1;
184 uint16_t start_index
= unicode_ranges
[i
].index
;
185 uint16_t end_index
= start_index
+ unicode_ranges
[i
].length
- 1;
187 if (start_index
<= index
&& index
<= end_index
)
188 return index
+ unicode_ranges
[i
].gap
;
190 if (end_index
< index
)
194 /* Note here: i1 < i < i2. */
197 else if (index
< start_index
)
201 /* Note here: i1 <= i < i2. */
205 return UNINAME_INVALID
;
209 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
210 sections 3.11 and 4.4. */
/* Short names of the 19 Hangul initial jamo, indexed by the initial index
   (index1) of a Hangul syllable.  Entry 11 is the empty string: that
   initial contributes no letters to the syllable name.  Each entry holds
   at most 2 characters plus the terminating NUL.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
/* Short names of the 21 Hangul medial jamo, indexed by the medial index
   (index2) of a Hangul syllable.  Each entry holds at most 3 characters
   plus the terminating NUL.  */
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
/* Short names of the 28 Hangul final jamo, indexed by the final index
   (index3) of a Hangul syllable.  Entry 0 is the empty string: a syllable
   without a final consonant.  Each entry holds at most 2 characters plus
   the terminating NUL.
   Entry 5 is "NJ", the Unicode short name of U+11AC (as in
   HANGUL SYLLABLE ANJ, U+C549); it was previously misspelled "NI".  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
227 /* Looks up the name of a Unicode character, in uppercase ASCII.
228 Returns the filled buf, or NULL if the character does not have a name. */
230 unicode_character_name (ucs4_t c
, char *buf
)
232 if (c
>= 0xAC00 && c
<= 0xD7A3)
234 /* Special case for Hangul syllables. Keeps the tables small. */
242 /* buf needs to have at least 16 + 7 bytes here. */
243 memcpy (buf
, "HANGUL SYLLABLE ", 16);
247 index3
= tmp
% 28; tmp
= tmp
/ 28;
248 index2
= tmp
% 21; tmp
= tmp
/ 21;
251 q
= jamo_initial_short_name
[index1
];
254 q
= jamo_medial_short_name
[index2
];
257 q
= jamo_final_short_name
[index3
];
263 else if ((c
>= 0xF900 && c
<= 0xFA2D) || (c
>= 0xFA30 && c
<= 0xFA6A)
264 || (c
>= 0xFA70 && c
<= 0xFAD9) || (c
>= 0x2F800 && c
<= 0x2FA1D))
266 /* Special case for CJK compatibility ideographs. Keeps the tables small. */
271 /* buf needs to have at least 28 + 5 bytes here. */
272 memcpy (buf
, "CJK COMPATIBILITY IDEOGRAPH-", 28);
275 for (i
= (c
< 0x10000 ? 12 : 16); i
>= 0; i
-= 4)
277 unsigned int x
= (c
>> i
) & 0xf;
278 *ptr
++ = (x
< 10 ? '0' : 'A' - 10) + x
;
283 else if ((c
>= 0xFE00 && c
<= 0xFE0F) || (c
>= 0xE0100 && c
<= 0xE01EF))
285 /* Special case for variation selectors. Keeps the tables small. */
288 /* buf needs to have at least 19 + 3 bytes here. */
289 sprintf (buf
, "VARIATION SELECTOR-%d",
290 c
<= 0xFE0F ? c
- 0xFE00 + 1 : c
- 0xE0100 + 17);
295 uint16_t index
= unicode_code_to_index (c
);
296 const uint16_t *words
= NULL
;
298 if (index
!= UNINAME_INVALID_INDEX
)
300 /* Binary search in unicode_index_to_name. */
302 unsigned int i2
= SIZEOF (unicode_index_to_name
);
305 unsigned int i
= (i1
+ i2
) >> 1;
306 if (unicode_index_to_name
[i
].index
== index
)
308 words
= &unicode_names
[unicode_index_to_name
[i
].name
];
311 else if (unicode_index_to_name
[i
].index
< index
)
318 /* Note here: i1 < i < i2. */
321 else if (unicode_index_to_name
[i
].index
> index
)
328 /* Note here: i1 <= i < i2. */
335 /* Found it in unicode_index_to_name. Now concatenate the words. */
336 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
340 unsigned int wordlen
;
341 const char *word
= unicode_name_word (*words
>>1, &wordlen
);
344 while (--wordlen
> 0);
345 if ((*words
& 1) == 0)
357 /* Looks up the Unicode character with a given name, in upper- or lowercase
358 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
360 unicode_name_character (const char *name
)
362 unsigned int len
= strlen (name
);
363 if (len
> 1 && len
<= UNICODE_CHARNAME_MAX_LENGTH
)
365 /* Test for "word1 word2 ..." syntax. */
366 char buf
[UNICODE_CHARNAME_MAX_LENGTH
];
371 if (!(c
>= ' ' && c
<= '~'))
373 *ptr
++ = (c
>= 'a' && c
<= 'z' ? c
- 'a' + 'A' : c
);
381 /* Special case for variation selector aliases. Keeps the tables small. */
383 const char *p1
= buf
;
384 if (ptr
>= buf
+ 3 && *p1
++ == 'V')
393 if (*p1
>= '0' && *p1
<= '9')
398 if (c
>= 1 && c
<= 16)
399 return c
- 1 + 0xFE00;
400 else if (c
>= 17 && c
<= 256)
401 return c
- 17 + 0xE0100;
412 /* Convert the constituents to uint16_t words. */
413 uint16_t words
[UNICODE_CHARNAME_MAX_WORDS
];
414 uint16_t *wordptr
= words
;
416 const char *p1
= buf
;
422 while (p2
< ptr
&& *p2
!= ' ')
424 word
= unicode_name_word_lookup (p1
, p2
- p1
);
427 if (wordptr
== &words
[UNICODE_CHARNAME_MAX_WORDS
])
434 /* Special case for Hangul syllables. Keeps the tables small. */
435 if (wordptr
== &words
[2]
436 && words
[0] == UNICODE_CHARNAME_WORD_HANGUL
437 && words
[1] == UNICODE_CHARNAME_WORD_SYLLABLE
)
439 /* Split the last word [p1..ptr) into three parts: the initial jamo [p1..p2), the medial jamo [p2..p3) and the final jamo [p3..p4). */
450 && (*p2
== 'B' || *p2
== 'C' || *p2
== 'D'
451 || *p2
== 'G' || *p2
== 'H' || *p2
== 'J'
452 || *p2
== 'K' || *p2
== 'M' || *p2
== 'N'
453 || *p2
== 'P' || *p2
== 'R' || *p2
== 'S'
458 && (*p3
== 'A' || *p3
== 'E' || *p3
== 'I'
459 || *p3
== 'O' || *p3
== 'U' || *p3
== 'W'
464 && (*p4
== 'B' || *p4
== 'C' || *p4
== 'D'
465 || *p4
== 'G' || *p4
== 'H' || *p4
== 'I'
466 || *p4
== 'J' || *p4
== 'K' || *p4
== 'L'
467 || *p4
== 'M' || *p4
== 'N' || *p4
== 'P'
468 || *p4
== 'S' || *p4
== 'T'))
472 unsigned int n1
= p2
- p1
;
473 unsigned int n2
= p3
- p2
;
474 unsigned int n3
= p4
- p3
;
476 if (n1
<= 2 && (n2
>= 1 && n2
<= 3) && n3
<= 2)
480 for (index1
= 0; index1
< 19; index1
++)
481 if (memcmp (jamo_initial_short_name
[index1
], p1
, n1
) == 0
482 && jamo_initial_short_name
[index1
][n1
] == '\0')
486 for (index2
= 0; index2
< 21; index2
++)
487 if (memcmp (jamo_medial_short_name
[index2
], p2
, n2
) == 0
488 && jamo_medial_short_name
[index2
][n2
] == '\0')
492 for (index3
= 0; index3
< 28; index3
++)
493 if (memcmp (jamo_final_short_name
[index3
], p3
, n3
) == 0
494 && jamo_final_short_name
[index3
][n3
] == '\0')
496 return 0xAC00 + (index1
* 21 + index2
) * 28 + index3
;
505 /* Special case for CJK compatibility ideographs. Keeps the tables small. */
507 if (wordptr
== &words
[2]
508 && words
[0] == UNICODE_CHARNAME_WORD_CJK
509 && words
[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
512 && memcmp (p1
, "IDEOGRAPH-", 10) == 0)
514 const char *p2
= p1
+ 10;
522 if (*p2
>= '0' && *p2
<= '9')
524 else if (*p2
>= 'A' && *p2
<= 'F')
525 c
+= (*p2
- 'A' + 10);
531 if ((c
>= 0xF900 && c
<= 0xFA2D)
532 || (c
>= 0xFA30 && c
<= 0xFA6A)
533 || (c
>= 0xFA70 && c
<= 0xFAD9)
534 || (c
>= 0x2F800 && c
<= 0x2FA1D))
543 /* Special case for variation selectors. Keeps the tables small. */
545 if (wordptr
== &words
[1]
546 && words
[0] == UNICODE_CHARNAME_WORD_VARIATION
549 && memcmp (p1
, "SELECTOR-", 9) == 0)
551 const char *p2
= p1
+ 9;
559 if (*p2
>= '0' && *p2
<= '9')
564 if (c
>= 1 && c
<= 16)
565 return c
- 1 + 0xFE00;
566 else if (c
>= 17 && c
<= 256)
567 return c
- 17 + 0xE0100;
580 /* Multiply by 2, to simplify later comparisons. */
581 unsigned int words_length
= wordptr
- words
;
583 int i
= words_length
- 1;
584 words
[i
] = 2 * words
[i
];
586 words
[i
] = 2 * words
[i
] + 1;
588 /* Binary search in unicode_name_to_index. */
591 unsigned int i2
= SIZEOF (unicode_name_to_index
);
594 unsigned int i
= (i1
+ i2
) >> 1;
595 const uint16_t *w
= words
;
596 const uint16_t *p
= &unicode_names
[unicode_name_to_index
[i
].name
];
597 unsigned int n
= words_length
;
604 /* Note here: i1 < i < i2. */
612 /* Note here: i1 <= i < i2. */
618 return unicode_index_to_code (unicode_name_to_index
[i
].index
);
627 return UNINAME_INVALID
;