uniname/uniname: Fix -Wshadow warning.
[gnulib.git] / lib / uniname / uniname.c
blobadefa43694e7c1f09dad26669b2fbefb10a84784
1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #include <config.h>
19 /* Specification. */
20 #include "uniname.h"
22 #include <assert.h>
23 #include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string.h>
28 #include "attribute.h"
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
/* Table of Unicode character names, derived from UnicodeData.txt.
   This table is generated in a way to minimize the memory footprint:
     1. its compiled size is small (less than 350 KB),
     2. it resides entirely in the text or read-only data segment of the
        executable or shared library: the table contains only immediate
        integers, no pointers, and the functions don't do heap allocation.
 */
40 #include "uninames.h"
/* It contains:
     static const char unicode_name_words[36303] = ...;
     #define UNICODE_CHARNAME_NUM_WORDS 6260
     static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
     #define UNICODE_CHARNAME_WORD_HANGUL 3902
     #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
     #define UNICODE_CHARNAME_WORD_CJK 417
     #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
     #define UNICODE_CHARNAME_WORD_VARIATION ...
     static const uint16_t unicode_names[68940] = ...;
     static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
     static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
     #define UNICODE_CHARNAME_MAX_LENGTH 83
     #define UNICODE_CHARNAME_MAX_WORDS 13
     static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
*/
57 /* Returns the word with a given index. */
58 static const char *
59 unicode_name_word (unsigned int index, unsigned int *lengthp)
61 unsigned int i1;
62 unsigned int i2;
64 assert (index < UNICODE_CHARNAME_NUM_WORDS);
66 /* Binary search for i with
67 unicode_name_by_length[i].ind_offset <= index
68 and
69 index < unicode_name_by_length[i+1].ind_offset
72 i1 = 0;
73 i2 = SIZEOF (unicode_name_by_length) - 1;
74 while (i2 - i1 > 1)
76 unsigned int i = (i1 + i2) >> 1;
77 if (unicode_name_by_length[i].ind_offset <= index)
78 i1 = i;
79 else
80 i2 = i;
82 unsigned int i = i1;
83 assert (unicode_name_by_length[i].ind_offset <= index
84 && index < unicode_name_by_length[i+1].ind_offset);
85 *lengthp = i;
86 return &unicode_name_words[unicode_name_by_length[i].extra_offset
87 + (index-unicode_name_by_length[i].ind_offset)*i];
90 /* Looks up the index of a word. */
91 static int
92 unicode_name_word_lookup (const char *word, size_t length)
94 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
96 /* Binary search among the words of given length. */
97 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
98 unsigned int i0 = unicode_name_by_length[length].ind_offset;
99 unsigned int i1 = i0;
100 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
101 while (i2 - i1 > 0)
103 unsigned int i = (i1 + i2) >> 1;
104 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
105 const char *w = word;
106 unsigned int n = length;
107 for (;;)
109 if (*p < *w)
111 if (i1 == i)
112 return -1;
113 /* Note here: i1 < i < i2. */
114 i1 = i;
115 break;
117 if (*p > *w)
119 /* Note here: i1 <= i < i2. */
120 i2 = i;
121 break;
123 p++; w++; n--;
124 if (n == 0)
125 return i;
129 return -1;
132 #define UNINAME_INVALID_INDEX UINT16_MAX
134 /* Looks up the internal index of a Unicode character. */
135 static uint16_t
136 unicode_code_to_index (ucs4_t c)
138 /* Binary search in unicode_ranges. */
139 unsigned int i1 = 0;
140 unsigned int i2 = SIZEOF (unicode_ranges);
142 for (;;)
144 unsigned int i = (i1 + i2) >> 1;
145 ucs4_t start_code =
146 unicode_ranges[i].index + unicode_ranges[i].gap;
147 ucs4_t end_code =
148 start_code + unicode_ranges[i].length - 1;
150 if (start_code <= c && c <= end_code)
151 return c - unicode_ranges[i].gap;
153 if (end_code < c)
155 if (i1 == i)
156 break;
157 /* Note here: i1 < i < i2. */
158 i1 = i;
160 else if (c < start_code)
162 if (i2 == i)
163 break;
164 /* Note here: i1 <= i < i2. */
165 i2 = i;
168 return UNINAME_INVALID_INDEX;
171 /* Looks up the codepoint of a Unicode character, from the given
172 internal index. */
173 static ucs4_t
174 unicode_index_to_code (uint16_t index)
176 /* Binary search in unicode_ranges. */
177 unsigned int i1 = 0;
178 unsigned int i2 = SIZEOF (unicode_ranges);
180 for (;;)
182 unsigned int i = (i1 + i2) >> 1;
183 uint16_t start_index = unicode_ranges[i].index;
184 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
186 if (start_index <= index && index <= end_index)
187 return index + unicode_ranges[i].gap;
189 if (end_index < index)
191 if (i1 == i)
192 break;
193 /* Note here: i1 < i < i2. */
194 i1 = i;
196 else if (index < start_index)
198 if (i2 == i)
199 break;
200 /* Note here: i1 <= i < i2. */
201 i2 = i;
204 return UNINAME_INVALID;
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.
   Each table maps a jamo position (initial consonant, medial vowel, final
   consonant) to its short name; the empty string stands for "no letter".
   The name of a syllable is the concatenation of the three short names.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
  "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE",
  "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
/* Entry 5 is the final NIEUN-CIEUC, whose short name is "NJ".  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
  "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
  "SS", "NG", "J", "C", "K", "T", "P", "H"
};
226 /* Looks up the name of a Unicode character, in uppercase ASCII.
227 Returns the filled buf, or NULL if the character does not have a name. */
228 char *
229 unicode_character_name (ucs4_t c, char *buf)
231 if (c >= 0xAC00 && c <= 0xD7A3)
233 /* Special case for Hangul syllables. Keeps the tables small. */
234 char *ptr;
235 unsigned int tmp;
236 unsigned int index1;
237 unsigned int index2;
238 unsigned int index3;
239 const char *q;
241 /* buf needs to have at least 16 + 7 bytes here. */
242 memcpy (buf, "HANGUL SYLLABLE ", 16);
243 ptr = buf + 16;
245 tmp = c - 0xAC00;
246 index3 = tmp % 28; tmp = tmp / 28;
247 index2 = tmp % 21; tmp = tmp / 21;
248 index1 = tmp;
250 q = jamo_initial_short_name[index1];
251 while (*q != '\0')
252 *ptr++ = *q++;
253 q = jamo_medial_short_name[index2];
254 while (*q != '\0')
255 *ptr++ = *q++;
256 q = jamo_final_short_name[index3];
257 while (*q != '\0')
258 *ptr++ = *q++;
259 *ptr = '\0';
260 return buf;
262 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
263 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
265 /* Special case for CJK compatibility ideographs. Keeps the tables
266 small. */
267 char *ptr;
268 int i;
270 /* buf needs to have at least 28 + 5 bytes here. */
271 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
272 ptr = buf + 28;
274 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
276 unsigned int x = (c >> i) & 0xf;
277 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
279 *ptr = '\0';
280 return buf;
282 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
284 /* Special case for variation selectors. Keeps the tables
285 small. */
287 /* buf needs to have at least 19 + 3 bytes here. */
288 sprintf (buf, "VARIATION SELECTOR-%d",
289 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
290 return buf;
292 else
294 uint16_t index = unicode_code_to_index (c);
295 const uint16_t *words = NULL;
297 if (index != UNINAME_INVALID_INDEX)
299 /* Binary search in unicode_code_to_name. */
300 unsigned int i1 = 0;
301 unsigned int i2 = SIZEOF (unicode_index_to_name);
302 for (;;)
304 unsigned int i = (i1 + i2) >> 1;
305 if (unicode_index_to_name[i].index == index)
307 words = &unicode_names[unicode_index_to_name[i].name];
308 break;
310 else if (unicode_index_to_name[i].index < index)
312 if (i1 == i)
314 words = NULL;
315 break;
317 /* Note here: i1 < i < i2. */
318 i1 = i;
320 else if (unicode_index_to_name[i].index > index)
322 if (i2 == i)
324 words = NULL;
325 break;
327 /* Note here: i1 <= i < i2. */
328 i2 = i;
332 if (words != NULL)
334 /* Found it in unicode_index_to_name. Now concatenate the words. */
335 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
336 char *ptr = buf;
337 for (;;)
339 unsigned int wordlen;
340 const char *word = unicode_name_word (*words>>1, &wordlen);
342 *ptr++ = *word++;
343 while (--wordlen > 0);
344 if ((*words & 1) == 0)
345 break;
346 *ptr++ = ' ';
347 words++;
349 *ptr = '\0';
350 return buf;
352 return NULL;
356 /* Looks up the Unicode character with a given name, in upper- or lowercase
357 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
358 ucs4_t
359 unicode_name_character (const char *name)
361 size_t len = strlen (name);
362 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
364 /* Test for "word1 word2 ..." syntax. */
365 char buf[UNICODE_CHARNAME_MAX_LENGTH];
366 char *ptr = buf;
367 for (;;)
369 char c = *name++;
370 if (!(c >= ' ' && c <= '~'))
371 break;
372 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
373 if (--len == 0)
374 goto filled_buf;
376 if (false)
377 filled_buf:
380 /* Special case for variation selector aliases. Keeps the
381 tables small. */
382 const char *p1 = buf;
383 if (ptr >= buf + 3 && *p1++ == 'V')
385 if (*p1++ == 'S')
387 if (*p1 != '0')
389 unsigned int c = 0;
390 for (;;)
392 if (*p1 >= '0' && *p1 <= '9')
393 c += (*p1 - '0');
394 p1++;
395 if (p1 == ptr)
397 if (c >= 1 && c <= 16)
398 return c - 1 + 0xFE00;
399 else if (c >= 17 && c <= 256)
400 return c - 17 + 0xE0100;
401 else
402 break;
404 c = c * 10;
411 /* Convert the constituents to uint16_t words. */
412 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
413 uint16_t *wordptr = words;
415 const char *p1 = buf;
416 for (;;)
419 int word;
420 const char *p2 = p1;
421 while (p2 < ptr && *p2 != ' ')
422 p2++;
423 word = unicode_name_word_lookup (p1, p2 - p1);
424 if (word < 0)
425 break;
426 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
427 break;
428 *wordptr++ = word;
429 if (p2 == ptr)
430 goto filled_words;
431 p1 = p2 + 1;
433 /* Special case for Hangul syllables. Keeps the tables small. */
434 if (wordptr == &words[2]
435 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
436 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
438 /* Split the last word [p1..ptr) into three parts:
439 1) [BCDGHJKMNPRST]
440 2) [AEIOUWY]
441 3) [BCDGHIJKLMNPST]
443 const char *p2;
444 const char *p3;
445 const char *p4;
447 p2 = p1;
448 while (p2 < ptr
449 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
450 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
451 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
452 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
453 || *p2 == 'T'))
454 p2++;
455 p3 = p2;
456 while (p3 < ptr
457 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
458 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
459 || *p3 == 'Y'))
460 p3++;
461 p4 = p3;
462 while (p4 < ptr
463 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
464 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
465 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
466 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
467 || *p4 == 'S' || *p4 == 'T'))
468 p4++;
469 if (p4 == ptr)
471 size_t n1 = p2 - p1;
472 size_t n2 = p3 - p2;
473 size_t n3 = p4 - p3;
475 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
477 unsigned int index1;
479 for (index1 = 0; index1 < 19; index1++)
480 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
481 && jamo_initial_short_name[index1][n1] == '\0')
483 unsigned int index2;
485 for (index2 = 0; index2 < 21; index2++)
486 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
487 && jamo_medial_short_name[index2][n2] == '\0')
489 unsigned int index3;
491 for (index3 = 0; index3 < 28; index3++)
492 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
493 && jamo_final_short_name[index3][n3] == '\0')
495 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
497 break;
499 break;
504 /* Special case for CJK compatibility ideographs. Keeps the
505 tables small. */
506 if (wordptr == &words[2]
507 && words[0] == UNICODE_CHARNAME_WORD_CJK
508 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
509 && p1 + 14 <= ptr
510 && p1 + 15 >= ptr
511 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
513 const char *p2 = p1 + 10;
515 if (*p2 != '0')
517 unsigned int c = 0;
519 for (;;)
521 if (*p2 >= '0' && *p2 <= '9')
522 c += (*p2 - '0');
523 else if (*p2 >= 'A' && *p2 <= 'F')
524 c += (*p2 - 'A' + 10);
525 else
526 break;
527 p2++;
528 if (p2 == ptr)
530 if ((c >= 0xF900 && c <= 0xFA2D)
531 || (c >= 0xFA30 && c <= 0xFA6A)
532 || (c >= 0xFA70 && c <= 0xFAD9)
533 || (c >= 0x2F800 && c <= 0x2FA1D))
534 return c;
535 else
536 break;
538 c = c << 4;
542 /* Special case for variation selectors. Keeps the
543 tables small. */
544 if (wordptr == &words[1]
545 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
546 && p1 + 10 <= ptr
547 && p1 + 12 >= ptr
548 && memcmp (p1, "SELECTOR-", 9) == 0)
550 const char *p2 = p1 + 9;
552 if (*p2 != '0')
554 unsigned int c = 0;
556 for (;;)
558 if (*p2 >= '0' && *p2 <= '9')
559 c += (*p2 - '0');
560 p2++;
561 if (p2 == ptr)
563 if (c >= 1 && c <= 16)
564 return c - 1 + 0xFE00;
565 else if (c >= 17 && c <= 256)
566 return c - 17 + 0xE0100;
567 else
568 break;
570 c = c * 10;
576 if (false)
577 filled_words:
579 /* Multiply by 2, to simplify later comparisons. */
580 size_t words_length = wordptr - words;
582 size_t i = words_length - 1;
583 words[i] = 2 * words[i];
584 for (; i > 0; )
586 --i;
587 words[i] = 2 * words[i] + 1;
590 /* Binary search in unicode_name_to_index. */
592 unsigned int i1 = 0;
593 unsigned int i2 = SIZEOF (unicode_name_to_index);
594 for (;;)
596 unsigned int i = (i1 + i2) >> 1;
597 const uint16_t *w = words;
598 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
599 size_t n = words_length;
600 for (;;)
602 if (*p < *w)
604 if (i1 == i)
605 goto name_not_found;
606 /* Note here: i1 < i < i2. */
607 i1 = i;
608 break;
610 else if (*p > *w)
612 if (i2 == i)
613 goto name_not_found;
614 /* Note here: i1 <= i < i2. */
615 i2 = i;
616 break;
618 p++; w++; n--;
619 if (n == 0)
620 return unicode_index_to_code (unicode_name_to_index[i].index);
624 name_not_found: ;
629 return UNINAME_INVALID;