integer_length_ll: Optimize for MSVC in 32-bit mode.
[gnulib.git] / lib / uniname / uniname.c
blob7cf1e2e1320b4fc62f2517b7a8aa7fa2ce2a86ec
1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #include <config.h>
19 /* Specification. */
20 #include "uniname.h"
22 #include <assert.h>
23 #include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string.h>
28 #include "attribute.h"
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized in both uses so that any
   array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
33 /* Table of Unicode character names, derived from UnicodeData.txt.
34 This table is generated in a way to minimize the memory footprint:
35 1. its compiled size is small (less than 350 KB),
36 2. it resides entirely in the text or read-only data segment of the
37 executable or shared library: the table contains only immediate
38 integers, no pointers, and the functions don't do heap allocation.
40 #include "uninames.h"
41 /* It contains:
42 static const char unicode_name_words[36303] = ...;
43 #define UNICODE_CHARNAME_NUM_WORDS 6260
44 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
45 #define UNICODE_CHARNAME_WORD_HANGUL 3902
46 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
47 #define UNICODE_CHARNAME_WORD_CJK 417
48 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
49 static const uint16_t unicode_names[68940] = ...;
50 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
51 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
52 #define UNICODE_CHARNAME_MAX_LENGTH 83
53 #define UNICODE_CHARNAME_MAX_WORDS 13
54 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
57 /* Returns the word with a given index. */
58 static const char *
59 unicode_name_word (unsigned int index, unsigned int *lengthp)
61 unsigned int i1;
62 unsigned int i2;
63 unsigned int i;
65 assert (index < UNICODE_CHARNAME_NUM_WORDS);
67 /* Binary search for i with
68 unicode_name_by_length[i].ind_offset <= index
69 and
70 index < unicode_name_by_length[i+1].ind_offset
73 i1 = 0;
74 i2 = SIZEOF (unicode_name_by_length) - 1;
75 while (i2 - i1 > 1)
77 unsigned int i = (i1 + i2) >> 1;
78 if (unicode_name_by_length[i].ind_offset <= index)
79 i1 = i;
80 else
81 i2 = i;
83 i = i1;
84 assert (unicode_name_by_length[i].ind_offset <= index
85 && index < unicode_name_by_length[i+1].ind_offset);
86 *lengthp = i;
87 return &unicode_name_words[unicode_name_by_length[i].extra_offset
88 + (index-unicode_name_by_length[i].ind_offset)*i];
91 /* Looks up the index of a word. */
92 static int
93 unicode_name_word_lookup (const char *word, unsigned int length)
95 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
97 /* Binary search among the words of given length. */
98 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
99 unsigned int i0 = unicode_name_by_length[length].ind_offset;
100 unsigned int i1 = i0;
101 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
102 while (i2 - i1 > 0)
104 unsigned int i = (i1 + i2) >> 1;
105 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
106 const char *w = word;
107 unsigned int n = length;
108 for (;;)
110 if (*p < *w)
112 if (i1 == i)
113 return -1;
114 /* Note here: i1 < i < i2. */
115 i1 = i;
116 break;
118 if (*p > *w)
120 /* Note here: i1 <= i < i2. */
121 i2 = i;
122 break;
124 p++; w++; n--;
125 if (n == 0)
126 return i;
130 return -1;
133 #define UNINAME_INVALID_INDEX UINT16_MAX
135 /* Looks up the internal index of a Unicode character. */
136 static uint16_t
137 unicode_code_to_index (ucs4_t c)
139 /* Binary search in unicode_ranges. */
140 unsigned int i1 = 0;
141 unsigned int i2 = SIZEOF (unicode_ranges);
143 for (;;)
145 unsigned int i = (i1 + i2) >> 1;
146 ucs4_t start_code =
147 unicode_ranges[i].index + unicode_ranges[i].gap;
148 ucs4_t end_code =
149 start_code + unicode_ranges[i].length - 1;
151 if (start_code <= c && c <= end_code)
152 return c - unicode_ranges[i].gap;
154 if (end_code < c)
156 if (i1 == i)
157 break;
158 /* Note here: i1 < i < i2. */
159 i1 = i;
161 else if (c < start_code)
163 if (i2 == i)
164 break;
165 /* Note here: i1 <= i < i2. */
166 i2 = i;
169 return UNINAME_INVALID_INDEX;
172 /* Looks up the codepoint of a Unicode character, from the given
173 internal index. */
174 static ucs4_t
175 unicode_index_to_code (uint16_t index)
177 /* Binary search in unicode_ranges. */
178 unsigned int i1 = 0;
179 unsigned int i2 = SIZEOF (unicode_ranges);
181 for (;;)
183 unsigned int i = (i1 + i2) >> 1;
184 uint16_t start_index = unicode_ranges[i].index;
185 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
187 if (start_index <= index && index <= end_index)
188 return index + unicode_ranges[i].gap;
190 if (end_index < index)
192 if (i1 == i)
193 break;
194 /* Note here: i1 < i < i2. */
195 i1 = i;
197 else if (index < start_index)
199 if (i2 == i)
200 break;
201 /* Note here: i1 <= i < i2. */
202 i2 = i;
205 return UNINAME_INVALID;
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.
   Each table is indexed by the position of the jamo within its class
   (19 initial consonants, 21 vowels, 28 final consonants including "none")
   and holds the NUL-terminated short name used in the syllable name.
   The entries follow the Jamo.txt file of the Unicode Character Database;
   in particular the final consonant U+11AC has the short name "NJ".  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
227 /* Looks up the name of a Unicode character, in uppercase ASCII.
228 Returns the filled buf, or NULL if the character does not have a name. */
229 char *
230 unicode_character_name (ucs4_t c, char *buf)
232 if (c >= 0xAC00 && c <= 0xD7A3)
234 /* Special case for Hangul syllables. Keeps the tables small. */
235 char *ptr;
236 unsigned int tmp;
237 unsigned int index1;
238 unsigned int index2;
239 unsigned int index3;
240 const char *q;
242 /* buf needs to have at least 16 + 7 bytes here. */
243 memcpy (buf, "HANGUL SYLLABLE ", 16);
244 ptr = buf + 16;
246 tmp = c - 0xAC00;
247 index3 = tmp % 28; tmp = tmp / 28;
248 index2 = tmp % 21; tmp = tmp / 21;
249 index1 = tmp;
251 q = jamo_initial_short_name[index1];
252 while (*q != '\0')
253 *ptr++ = *q++;
254 q = jamo_medial_short_name[index2];
255 while (*q != '\0')
256 *ptr++ = *q++;
257 q = jamo_final_short_name[index3];
258 while (*q != '\0')
259 *ptr++ = *q++;
260 *ptr = '\0';
261 return buf;
263 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
264 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
266 /* Special case for CJK compatibility ideographs. Keeps the tables
267 small. */
268 char *ptr;
269 int i;
271 /* buf needs to have at least 28 + 5 bytes here. */
272 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
273 ptr = buf + 28;
275 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
277 unsigned int x = (c >> i) & 0xf;
278 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
280 *ptr = '\0';
281 return buf;
283 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
285 /* Special case for variation selectors. Keeps the tables
286 small. */
288 /* buf needs to have at least 19 + 3 bytes here. */
289 sprintf (buf, "VARIATION SELECTOR-%d",
290 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
291 return buf;
293 else
295 uint16_t index = unicode_code_to_index (c);
296 const uint16_t *words = NULL;
298 if (index != UNINAME_INVALID_INDEX)
300 /* Binary search in unicode_code_to_name. */
301 unsigned int i1 = 0;
302 unsigned int i2 = SIZEOF (unicode_index_to_name);
303 for (;;)
305 unsigned int i = (i1 + i2) >> 1;
306 if (unicode_index_to_name[i].index == index)
308 words = &unicode_names[unicode_index_to_name[i].name];
309 break;
311 else if (unicode_index_to_name[i].index < index)
313 if (i1 == i)
315 words = NULL;
316 break;
318 /* Note here: i1 < i < i2. */
319 i1 = i;
321 else if (unicode_index_to_name[i].index > index)
323 if (i2 == i)
325 words = NULL;
326 break;
328 /* Note here: i1 <= i < i2. */
329 i2 = i;
333 if (words != NULL)
335 /* Found it in unicode_index_to_name. Now concatenate the words. */
336 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
337 char *ptr = buf;
338 for (;;)
340 unsigned int wordlen;
341 const char *word = unicode_name_word (*words>>1, &wordlen);
343 *ptr++ = *word++;
344 while (--wordlen > 0);
345 if ((*words & 1) == 0)
346 break;
347 *ptr++ = ' ';
348 words++;
350 *ptr = '\0';
351 return buf;
353 return NULL;
357 /* Looks up the Unicode character with a given name, in upper- or lowercase
358 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
359 ucs4_t
360 unicode_name_character (const char *name)
362 unsigned int len = strlen (name);
363 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
365 /* Test for "word1 word2 ..." syntax. */
366 char buf[UNICODE_CHARNAME_MAX_LENGTH];
367 char *ptr = buf;
368 for (;;)
370 char c = *name++;
371 if (!(c >= ' ' && c <= '~'))
372 break;
373 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
374 if (--len == 0)
375 goto filled_buf;
377 if (false)
378 filled_buf:
381 /* Special case for variation selector aliases. Keeps the
382 tables small. */
383 const char *p1 = buf;
384 if (ptr >= buf + 3 && *p1++ == 'V')
386 if (*p1++ == 'S')
388 if (*p1 != '0')
390 unsigned int c = 0;
391 for (;;)
393 if (*p1 >= '0' && *p1 <= '9')
394 c += (*p1 - '0');
395 p1++;
396 if (p1 == ptr)
398 if (c >= 1 && c <= 16)
399 return c - 1 + 0xFE00;
400 else if (c >= 17 && c <= 256)
401 return c - 17 + 0xE0100;
402 else
403 break;
405 c = c * 10;
412 /* Convert the constituents to uint16_t words. */
413 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
414 uint16_t *wordptr = words;
416 const char *p1 = buf;
417 for (;;)
420 int word;
421 const char *p2 = p1;
422 while (p2 < ptr && *p2 != ' ')
423 p2++;
424 word = unicode_name_word_lookup (p1, p2 - p1);
425 if (word < 0)
426 break;
427 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
428 break;
429 *wordptr++ = word;
430 if (p2 == ptr)
431 goto filled_words;
432 p1 = p2 + 1;
434 /* Special case for Hangul syllables. Keeps the tables small. */
435 if (wordptr == &words[2]
436 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
437 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
439 /* Split the last word [p1..ptr) into three parts:
440 1) [BCDGHJKMNPRST]
441 2) [AEIOUWY]
442 3) [BCDGHIJKLMNPST]
444 const char *p2;
445 const char *p3;
446 const char *p4;
448 p2 = p1;
449 while (p2 < ptr
450 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
451 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
452 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
453 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
454 || *p2 == 'T'))
455 p2++;
456 p3 = p2;
457 while (p3 < ptr
458 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
459 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
460 || *p3 == 'Y'))
461 p3++;
462 p4 = p3;
463 while (p4 < ptr
464 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
465 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
466 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
467 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
468 || *p4 == 'S' || *p4 == 'T'))
469 p4++;
470 if (p4 == ptr)
472 unsigned int n1 = p2 - p1;
473 unsigned int n2 = p3 - p2;
474 unsigned int n3 = p4 - p3;
476 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
478 unsigned int index1;
480 for (index1 = 0; index1 < 19; index1++)
481 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
482 && jamo_initial_short_name[index1][n1] == '\0')
484 unsigned int index2;
486 for (index2 = 0; index2 < 21; index2++)
487 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
488 && jamo_medial_short_name[index2][n2] == '\0')
490 unsigned int index3;
492 for (index3 = 0; index3 < 28; index3++)
493 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
494 && jamo_final_short_name[index3][n3] == '\0')
496 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
498 break;
500 break;
505 /* Special case for CJK compatibility ideographs. Keeps the
506 tables small. */
507 if (wordptr == &words[2]
508 && words[0] == UNICODE_CHARNAME_WORD_CJK
509 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
510 && p1 + 14 <= ptr
511 && p1 + 15 >= ptr
512 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
514 const char *p2 = p1 + 10;
516 if (*p2 != '0')
518 unsigned int c = 0;
520 for (;;)
522 if (*p2 >= '0' && *p2 <= '9')
523 c += (*p2 - '0');
524 else if (*p2 >= 'A' && *p2 <= 'F')
525 c += (*p2 - 'A' + 10);
526 else
527 break;
528 p2++;
529 if (p2 == ptr)
531 if ((c >= 0xF900 && c <= 0xFA2D)
532 || (c >= 0xFA30 && c <= 0xFA6A)
533 || (c >= 0xFA70 && c <= 0xFAD9)
534 || (c >= 0x2F800 && c <= 0x2FA1D))
535 return c;
536 else
537 break;
539 c = c << 4;
543 /* Special case for variation selectors. Keeps the
544 tables small. */
545 if (wordptr == &words[1]
546 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
547 && p1 + 10 <= ptr
548 && p1 + 12 >= ptr
549 && memcmp (p1, "SELECTOR-", 9) == 0)
551 const char *p2 = p1 + 9;
553 if (*p2 != '0')
555 unsigned int c = 0;
557 for (;;)
559 if (*p2 >= '0' && *p2 <= '9')
560 c += (*p2 - '0');
561 p2++;
562 if (p2 == ptr)
564 if (c >= 1 && c <= 16)
565 return c - 1 + 0xFE00;
566 else if (c >= 17 && c <= 256)
567 return c - 17 + 0xE0100;
568 else
569 break;
571 c = c * 10;
577 if (false)
578 filled_words:
580 /* Multiply by 2, to simplify later comparisons. */
581 unsigned int words_length = wordptr - words;
583 int i = words_length - 1;
584 words[i] = 2 * words[i];
585 for (; --i >= 0; )
586 words[i] = 2 * words[i] + 1;
588 /* Binary search in unicode_name_to_index. */
590 unsigned int i1 = 0;
591 unsigned int i2 = SIZEOF (unicode_name_to_index);
592 for (;;)
594 unsigned int i = (i1 + i2) >> 1;
595 const uint16_t *w = words;
596 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
597 unsigned int n = words_length;
598 for (;;)
600 if (*p < *w)
602 if (i1 == i)
603 goto name_not_found;
604 /* Note here: i1 < i < i2. */
605 i1 = i;
606 break;
608 else if (*p > *w)
610 if (i2 == i)
611 goto name_not_found;
612 /* Note here: i1 <= i < i2. */
613 i2 = i;
614 break;
616 p++; w++; n--;
617 if (n == 0)
618 return unicode_index_to_code (unicode_name_to_index[i].index);
622 name_not_found: ;
627 return UNINAME_INVALID;