lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU Lesser General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 /* Specification.  */
  20 #include "uniname.h"
  21
  22 #include <assert.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #include "attribute.h"
  29
  30 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  31
  32
  33 /* Table of Unicode character names, derived from UnicodeData.txt.
  34    This table is generated in a way to minimize the memory footprint:
  35      1. its compiled size is small (less than 350 KB),
  36      2. it resides entirely in the text or read-only data segment of the
  37         executable or shared library: the table contains only immediate
  38         integers, no pointers, and the functions don't do heap allocation.
  39  */
  40 #include "uninames.h"
  41 /* It contains:
  42   static const char unicode_name_words[36303] = ...;
  43   #define UNICODE_CHARNAME_NUM_WORDS 6260
  44   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  45   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  46   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  47   #define UNICODE_CHARNAME_WORD_CJK 417
  48   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  49   static const uint16_t unicode_names[68940] = ...;
  50   static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
  51   static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
  52   #define UNICODE_CHARNAME_MAX_LENGTH 83
  53   #define UNICODE_CHARNAME_MAX_WORDS 13
  54   static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
  55 */
  56
  57 /* Returns the word with a given index.  */
  58 static const char *
  59 unicode_name_word (unsigned int index, unsigned int *lengthp)
  60 {
  61   unsigned int i1;
  62   unsigned int i2;
  63   unsigned int i;
  64
  65   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  66
  67   /* Binary search for i with
  68        unicode_name_by_length[i].ind_offset <= index
  69      and
  70        index < unicode_name_by_length[i+1].ind_offset
  71    */
  72
  73   i1 = 0;
  74   i2 = SIZEOF (unicode_name_by_length) - 1;
  75   while (i2 - i1 > 1)
  76     {
  77       unsigned int i = (i1 + i2) >> 1;
  78       if (unicode_name_by_length[i].ind_offset <= index)
  79         i1 = i;
  80       else
  81         i2 = i;
  82     }
  83   i = i1;
  84   assert (unicode_name_by_length[i].ind_offset <= index
  85           && index < unicode_name_by_length[i+1].ind_offset);
  86   *lengthp = i;
  87   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  88                              + (index-unicode_name_by_length[i].ind_offset)*i];
  89 }
  90
  91 /* Looks up the index of a word.  */
  92 static int
  93 unicode_name_word_lookup (const char *word, unsigned int length)
  94 {
  95   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  96     {
  97       /* Binary search among the words of given length.  */
  98       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  99       unsigned int i0 = unicode_name_by_length[length].ind_offset;
 100       unsigned int i1 = i0;
 101       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
 102       while (i2 - i1 > 0)
 103         {
 104           unsigned int i = (i1 + i2) >> 1;
 105           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 106           const char *w = word;
 107           unsigned int n = length;
 108           for (;;)
 109             {
 110               if (*p < *w)
 111                 {
 112                   if (i1 == i)
 113                     return -1;
 114                   /* Note here: i1 < i < i2.  */
 115                   i1 = i;
 116                   break;
 117                 }
 118               if (*p > *w)
 119                 {
 120                   /* Note here: i1 <= i < i2.  */
 121                   i2 = i;
 122                   break;
 123                 }
 124               p++; w++; n--;
 125               if (n == 0)
 126                 return i;
 127             }
 128         }
 129     }
 130   return -1;
 131 }
 132
 133 #define UNINAME_INVALID_INDEX UINT16_MAX
 134
 135 /* Looks up the internal index of a Unicode character.  */
 136 static uint16_t
 137 unicode_code_to_index (ucs4_t c)
 138 {
 139   /* Binary search in unicode_ranges.  */
 140   unsigned int i1 = 0;
 141   unsigned int i2 = SIZEOF (unicode_ranges);
 142
 143   for (;;)
 144     {
 145       unsigned int i = (i1 + i2) >> 1;
 146       ucs4_t start_code =
 147         unicode_ranges[i].index + unicode_ranges[i].gap;
 148       ucs4_t end_code =
 149         start_code + unicode_ranges[i].length - 1;
 150
 151       if (start_code <= c && c <= end_code)
 152         return c - unicode_ranges[i].gap;
 153
 154       if (end_code < c)
 155         {
 156           if (i1 == i)
 157             break;
 158           /* Note here: i1 < i < i2.  */
 159           i1 = i;
 160         }
 161       else if (c < start_code)
 162         {
 163           if (i2 == i)
 164             break;
 165           /* Note here: i1 <= i < i2.  */
 166           i2 = i;
 167         }
 168     }
 169   return UNINAME_INVALID_INDEX;
 170 }
 171
 172 /* Looks up the codepoint of a Unicode character, from the given
 173    internal index.  */
 174 static ucs4_t
 175 unicode_index_to_code (uint16_t index)
 176 {
 177   /* Binary search in unicode_ranges.  */
 178   unsigned int i1 = 0;
 179   unsigned int i2 = SIZEOF (unicode_ranges);
 180
 181   for (;;)
 182     {
 183       unsigned int i = (i1 + i2) >> 1;
 184       uint16_t start_index = unicode_ranges[i].index;
 185       uint16_t end_index = start_index + unicode_ranges[i].length - 1;
 186
 187       if (start_index <= index && index <= end_index)
 188         return index + unicode_ranges[i].gap;
 189
 190       if (end_index < index)
 191         {
 192           if (i1 == i)
 193             break;
 194           /* Note here: i1 < i < i2.  */
 195           i1 = i;
 196         }
 197       else if (index < start_index)
 198         {
 199           if (i2 == i)
 200             break;
 201           /* Note here: i1 <= i < i2.  */
 202           i2 = i;
 203         }
 204     }
 205   return UNINAME_INVALID;
 206 }
 207
 208
 209 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 210    sections 3.11 and 4.4.  */
 211 static const char jamo_initial_short_name[19][3] =
 212 {
 213   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 214   "C", "K", "T", "P", "H"
 215 };
 216 static const char jamo_medial_short_name[21][4] =
 217 {
 218   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 219   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 220 };
 221 static const char jamo_final_short_name[28][3] =
 222 {
 223   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 224   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 225 };
 226
 227 /* Looks up the name of a Unicode character, in uppercase ASCII.
 228    Returns the filled buf, or NULL if the character does not have a name.  */
 229 char *
 230 unicode_character_name (ucs4_t c, char *buf)
 231 {
 232   if (c >= 0xAC00 && c <= 0xD7A3)
 233     {
 234       /* Special case for Hangul syllables. Keeps the tables small.  */
 235       char *ptr;
 236       unsigned int tmp;
 237       unsigned int index1;
 238       unsigned int index2;
 239       unsigned int index3;
 240       const char *q;
 241
 242       /* buf needs to have at least 16 + 7 bytes here.  */
 243       memcpy (buf, "HANGUL SYLLABLE ", 16);
 244       ptr = buf + 16;
 245
 246       tmp = c - 0xAC00;
 247       index3 = tmp % 28; tmp = tmp / 28;
 248       index2 = tmp % 21; tmp = tmp / 21;
 249       index1 = tmp;
 250
 251       q = jamo_initial_short_name[index1];
 252       while (*q != '\0')
 253         *ptr++ = *q++;
 254       q = jamo_medial_short_name[index2];
 255       while (*q != '\0')
 256         *ptr++ = *q++;
 257       q = jamo_final_short_name[index3];
 258       while (*q != '\0')
 259         *ptr++ = *q++;
 260       *ptr = '\0';
 261       return buf;
 262     }
 263   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 264            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 265     {
 266       /* Special case for CJK compatibility ideographs. Keeps the tables
 267          small.  */
 268       char *ptr;
 269       int i;
 270
 271       /* buf needs to have at least 28 + 5 bytes here.  */
 272       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 273       ptr = buf + 28;
 274
 275       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 276         {
 277           unsigned int x = (c >> i) & 0xf;
 278           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 279         }
 280       *ptr = '\0';
 281       return buf;
 282     }
 283   else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
 284     {
 285       /* Special case for variation selectors. Keeps the tables
 286          small.  */
 287
 288       /* buf needs to have at least 19 + 3 bytes here.  */
 289       sprintf (buf, "VARIATION SELECTOR-%d",
 290                c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
 291       return buf;
 292     }
 293   else
 294     {
 295       uint16_t index = unicode_code_to_index (c);
 296       const uint16_t *words = NULL;
 297
 298       if (index != UNINAME_INVALID_INDEX)
 299         {
 300           /* Binary search in unicode_code_to_name.  */
 301           unsigned int i1 = 0;
 302           unsigned int i2 = SIZEOF (unicode_index_to_name);
 303           for (;;)
 304             {
 305               unsigned int i = (i1 + i2) >> 1;
 306               if (unicode_index_to_name[i].index == index)
 307                 {
 308                   words = &unicode_names[unicode_index_to_name[i].name];
 309                   break;
 310                 }
 311               else if (unicode_index_to_name[i].index < index)
 312                 {
 313                   if (i1 == i)
 314                     {
 315                       words = NULL;
 316                       break;
 317                     }
 318                   /* Note here: i1 < i < i2.  */
 319                   i1 = i;
 320                 }
 321               else if (unicode_index_to_name[i].index > index)
 322                 {
 323                   if (i2 == i)
 324                     {
 325                       words = NULL;
 326                       break;
 327                     }
 328                   /* Note here: i1 <= i < i2.  */
 329                   i2 = i;
 330                 }
 331             }
 332         }
 333       if (words != NULL)
 334         {
 335           /* Found it in unicode_index_to_name. Now concatenate the words.  */
 336           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 337           char *ptr = buf;
 338           for (;;)
 339             {
 340               unsigned int wordlen;
 341               const char *word = unicode_name_word (*words>>1, &wordlen);
 342               do
 343                 *ptr++ = *word++;
 344               while (--wordlen > 0);
 345               if ((*words & 1) == 0)
 346                 break;
 347               *ptr++ = ' ';
 348               words++;
 349             }
 350           *ptr = '\0';
 351           return buf;
 352         }
 353       return NULL;
 354     }
 355 }
 356
 357 /* Looks up the Unicode character with a given name, in upper- or lowercase
 358    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 359 ucs4_t
 360 unicode_name_character (const char *name)
 361 {
 362   unsigned int len = strlen (name);
 363   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 364     {
 365       /* Test for "word1 word2 ..." syntax.  */
 366       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 367       char *ptr = buf;
 368       for (;;)
 369         {
 370           char c = *name++;
 371           if (!(c >= ' ' && c <= '~'))
 372             break;
 373           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 374           if (--len == 0)
 375             goto filled_buf;
 376         }
 377       if (false)
 378       filled_buf:
 379         {
 380           {
 381             /* Special case for variation selector aliases. Keeps the
 382                tables small.  */
 383             const char *p1 = buf;
 384             if (ptr >= buf + 3 && *p1++ == 'V')
 385               {
 386                 if (*p1++ == 'S')
 387                   {
 388                     if (*p1 != '0')
 389                       {
 390                         unsigned int c = 0;
 391                         for (;;)
 392                           {
 393                             if (*p1 >= '0' && *p1 <= '9')
 394                               c += (*p1 - '0');
 395                             p1++;
 396                             if (p1 == ptr)
 397                               {
 398                                 if (c >= 1 && c <= 16)
 399                                   return c - 1 + 0xFE00;
 400                                 else if (c >= 17 && c <= 256)
 401                                   return c - 17 + 0xE0100;
 402                                 else
 403                                   break;
 404                               }
 405                             c = c * 10;
 406                           }
 407                       }
 408                   }
 409               }
 410           }
 411           {
 412             /* Convert the constituents to uint16_t words.  */
 413             uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 414             uint16_t *wordptr = words;
 415             {
 416               const char *p1 = buf;
 417               for (;;)
 418                 {
 419                   {
 420                     int word;
 421                     const char *p2 = p1;
 422                     while (p2 < ptr && *p2 != ' ')
 423                       p2++;
 424                     word = unicode_name_word_lookup (p1, p2 - p1);
 425                     if (word < 0)
 426                       break;
 427                     if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 428                       break;
 429                     *wordptr++ = word;
 430                     if (p2 == ptr)
 431                       goto filled_words;
 432                     p1 = p2 + 1;
 433                   }
 434                   /* Special case for Hangul syllables. Keeps the tables small. */
 435                   if (wordptr == &words[2]
 436                       && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 437                       && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 438                     {
 439                       /* Split the last word [p1..ptr) into three parts:
 440                            1) [BCDGHJKMNPRST]
 441                            2) [AEIOUWY]
 442                            3) [BCDGHIJKLMNPST]
 443                        */
 444                       const char *p2;
 445                       const char *p3;
 446                       const char *p4;
 447
 448                       p2 = p1;
 449                       while (p2 < ptr
 450                              && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 451                                  || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 452                                  || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 453                                  || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 454                                  || *p2 == 'T'))
 455                         p2++;
 456                       p3 = p2;
 457                       while (p3 < ptr
 458                              && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 459                                  || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 460                                  || *p3 == 'Y'))
 461                         p3++;
 462                       p4 = p3;
 463                       while (p4 < ptr
 464                              && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 465                                  || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 466                                  || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 467                                  || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 468                                  || *p4 == 'S' || *p4 == 'T'))
 469                         p4++;
 470                       if (p4 == ptr)
 471                         {
 472                           unsigned int n1 = p2 - p1;
 473                           unsigned int n2 = p3 - p2;
 474                           unsigned int n3 = p4 - p3;
 475
 476                           if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 477                             {
 478                               unsigned int index1;
 479
 480                               for (index1 = 0; index1 < 19; index1++)
 481                                 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
 482                                     && jamo_initial_short_name[index1][n1] == '\0')
 483                                   {
 484                                     unsigned int index2;
 485
 486                                     for (index2 = 0; index2 < 21; index2++)
 487                                       if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
 488                                           && jamo_medial_short_name[index2][n2] == '\0')
 489                                         {
 490                                           unsigned int index3;
 491
 492                                           for (index3 = 0; index3 < 28; index3++)
 493                                             if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
 494                                                 && jamo_final_short_name[index3][n3] == '\0')
 495                                               {
 496                                                 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 497                                               }
 498                                           break;
 499                                         }
 500                                     break;
 501                                   }
 502                             }
 503                         }
 504                     }
 505                   /* Special case for CJK compatibility ideographs. Keeps the
 506                      tables small.  */
 507                   if (wordptr == &words[2]
 508                       && words[0] == UNICODE_CHARNAME_WORD_CJK
 509                       && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 510                       && p1 + 14 <= ptr
 511                       && p1 + 15 >= ptr
 512                       && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 513                     {
 514                       const char *p2 = p1 + 10;
 515
 516                       if (*p2 != '0')
 517                         {
 518                           unsigned int c = 0;
 519
 520                           for (;;)
 521                             {
 522                               if (*p2 >= '0' && *p2 <= '9')
 523                                 c += (*p2 - '0');
 524                               else if (*p2 >= 'A' && *p2 <= 'F')
 525                                 c += (*p2 - 'A' + 10);
 526                               else
 527                                 break;
 528                               p2++;
 529                               if (p2 == ptr)
 530                                 {
 531                                   if ((c >= 0xF900 && c <= 0xFA2D)
 532                                       || (c >= 0xFA30 && c <= 0xFA6A)
 533                                       || (c >= 0xFA70 && c <= 0xFAD9)
 534                                       || (c >= 0x2F800 && c <= 0x2FA1D))
 535                                     return c;
 536                                   else
 537                                     break;
 538                                 }
 539                               c = c << 4;
 540                             }
 541                         }
 542                     }
 543                   /* Special case for variation selectors. Keeps the
 544                      tables small.  */
 545                   if (wordptr == &words[1]
 546                       && words[0] == UNICODE_CHARNAME_WORD_VARIATION
 547                       && p1 + 10 <= ptr
 548                       && p1 + 12 >= ptr
 549                       && memcmp (p1, "SELECTOR-", 9) == 0)
 550                     {
 551                       const char *p2 = p1 + 9;
 552
 553                       if (*p2 != '0')
 554                         {
 555                           unsigned int c = 0;
 556
 557                           for (;;)
 558                             {
 559                               if (*p2 >= '0' && *p2 <= '9')
 560                                 c += (*p2 - '0');
 561                               p2++;
 562                               if (p2 == ptr)
 563                                 {
 564                                   if (c >= 1 && c <= 16)
 565                                     return c - 1 + 0xFE00;
 566                                   else if (c >= 17 && c <= 256)
 567                                     return c - 17 + 0xE0100;
 568                                   else
 569                                     break;
 570                                 }
 571                               c = c * 10;
 572                             }
 573                         }
 574                     }
 575                 }
 576             }
 577             if (false)
 578             filled_words:
 579               {
 580                 /* Multiply by 2, to simplify later comparisons.  */
 581                 unsigned int words_length = wordptr - words;
 582                 {
 583                   int i = words_length - 1;
 584                   words[i] = 2 * words[i];
 585                   for (; --i >= 0; )
 586                     words[i] = 2 * words[i] + 1;
 587                 }
 588                 /* Binary search in unicode_name_to_index.  */
 589                 {
 590                   unsigned int i1 = 0;
 591                   unsigned int i2 = SIZEOF (unicode_name_to_index);
 592                   for (;;)
 593                     {
 594                       unsigned int i = (i1 + i2) >> 1;
 595                       const uint16_t *w = words;
 596                       const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
 597                       unsigned int n = words_length;
 598                       for (;;)
 599                         {
 600                           if (*p < *w)
 601                             {
 602                               if (i1 == i)
 603                                 goto name_not_found;
 604                               /* Note here: i1 < i < i2.  */
 605                               i1 = i;
 606                               break;
 607                             }
 608                           else if (*p > *w)
 609                             {
 610                               if (i2 == i)
 611                                 goto name_not_found;
 612                               /* Note here: i1 <= i < i2.  */
 613                               i2 = i;
 614                               break;
 615                             }
 616                           p++; w++; n--;
 617                           if (n == 0)
 618                             return unicode_index_to_code (unicode_name_to_index[i].index);
 619                         }
 620                     }
 621                 }
 622               name_not_found: ;
 623               }
 624           }
 625         }
 626     }
 627   return UNINAME_INVALID;
 628 }