lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU Lesser General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 /* Specification.  */
  20 #include "uniname.h"
  21
  22 #include <assert.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #include "attribute.h"
  29
  30 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  31
  32
  33 /* Table of Unicode character names, derived from UnicodeData.txt.
  34    This table is generated in a way to minimize the memory footprint:
  35      1. its compiled size is small (less than 350 KB),
  36      2. it resides entirely in the text or read-only data segment of the
  37         executable or shared library: the table contains only immediate
  38         integers, no pointers, and the functions don't do heap allocation.
  39  */
  40 #include "uninames.h"
  41 /* It contains:
  42   static const char unicode_name_words[36303] = ...;
  43   #define UNICODE_CHARNAME_NUM_WORDS 6260
  44   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  45   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  46   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  47   #define UNICODE_CHARNAME_WORD_CJK 417
  48   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  49   static const uint16_t unicode_names[68940] = ...;
  50   static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
  51   static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
  52   #define UNICODE_CHARNAME_MAX_LENGTH 83
  53   #define UNICODE_CHARNAME_MAX_WORDS 13
  54   static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
  55 */
  56
  57 /* Returns the word with a given index.  */
  58 static const char *
  59 unicode_name_word (unsigned int index, unsigned int *lengthp)
  60 {
  61   unsigned int i1;
  62   unsigned int i2;
  63
  64   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  65
  66   /* Binary search for i with
  67        unicode_name_by_length[i].ind_offset <= index
  68      and
  69        index < unicode_name_by_length[i+1].ind_offset
  70    */
  71
  72   i1 = 0;
  73   i2 = SIZEOF (unicode_name_by_length) - 1;
  74   while (i2 - i1 > 1)
  75     {
  76       unsigned int i = (i1 + i2) >> 1;
  77       if (unicode_name_by_length[i].ind_offset <= index)
  78         i1 = i;
  79       else
  80         i2 = i;
  81     }
  82   unsigned int i = i1;
  83   assert (unicode_name_by_length[i].ind_offset <= index
  84           && index < unicode_name_by_length[i+1].ind_offset);
  85   *lengthp = i;
  86   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  87                              + (index-unicode_name_by_length[i].ind_offset)*i];
  88 }
  89
  90 /* Looks up the index of a word.  */
  91 static int
  92 unicode_name_word_lookup (const char *word, size_t length)
  93 {
  94   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  95     {
  96       /* Binary search among the words of given length.  */
  97       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  98       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  99       unsigned int i1 = i0;
 100       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
 101       while (i2 - i1 > 0)
 102         {
 103           unsigned int i = (i1 + i2) >> 1;
 104           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 105           const char *w = word;
 106           unsigned int n = length;
 107           for (;;)
 108             {
 109               if (*p < *w)
 110                 {
 111                   if (i1 == i)
 112                     return -1;
 113                   /* Note here: i1 < i < i2.  */
 114                   i1 = i;
 115                   break;
 116                 }
 117               if (*p > *w)
 118                 {
 119                   /* Note here: i1 <= i < i2.  */
 120                   i2 = i;
 121                   break;
 122                 }
 123               p++; w++; n--;
 124               if (n == 0)
 125                 return i;
 126             }
 127         }
 128     }
 129   return -1;
 130 }
 131
 132 #define UNINAME_INVALID_INDEX UINT16_MAX
 133
 134 /* Looks up the internal index of a Unicode character.  */
 135 static uint16_t
 136 unicode_code_to_index (ucs4_t c)
 137 {
 138   /* Binary search in unicode_ranges.  */
 139   unsigned int i1 = 0;
 140   unsigned int i2 = SIZEOF (unicode_ranges);
 141
 142   for (;;)
 143     {
 144       unsigned int i = (i1 + i2) >> 1;
 145       ucs4_t start_code =
 146         unicode_ranges[i].index + unicode_ranges[i].gap;
 147       ucs4_t end_code =
 148         start_code + unicode_ranges[i].length - 1;
 149
 150       if (start_code <= c && c <= end_code)
 151         return c - unicode_ranges[i].gap;
 152
 153       if (end_code < c)
 154         {
 155           if (i1 == i)
 156             break;
 157           /* Note here: i1 < i < i2.  */
 158           i1 = i;
 159         }
 160       else if (c < start_code)
 161         {
 162           if (i2 == i)
 163             break;
 164           /* Note here: i1 <= i < i2.  */
 165           i2 = i;
 166         }
 167     }
 168   return UNINAME_INVALID_INDEX;
 169 }
 170
 171 /* Looks up the codepoint of a Unicode character, from the given
 172    internal index.  */
 173 static ucs4_t
 174 unicode_index_to_code (uint16_t index)
 175 {
 176   /* Binary search in unicode_ranges.  */
 177   unsigned int i1 = 0;
 178   unsigned int i2 = SIZEOF (unicode_ranges);
 179
 180   for (;;)
 181     {
 182       unsigned int i = (i1 + i2) >> 1;
 183       uint16_t start_index = unicode_ranges[i].index;
 184       uint16_t end_index = start_index + unicode_ranges[i].length - 1;
 185
 186       if (start_index <= index && index <= end_index)
 187         return index + unicode_ranges[i].gap;
 188
 189       if (end_index < index)
 190         {
 191           if (i1 == i)
 192             break;
 193           /* Note here: i1 < i < i2.  */
 194           i1 = i;
 195         }
 196       else if (index < start_index)
 197         {
 198           if (i2 == i)
 199             break;
 200           /* Note here: i1 <= i < i2.  */
 201           i2 = i;
 202         }
 203     }
 204   return UNINAME_INVALID;
 205 }
 206
 207
 208 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 209    sections 3.11 and 4.4.  */
 210 static const char jamo_initial_short_name[19][3] =
 211 {
 212   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 213   "C", "K", "T", "P", "H"
 214 };
 215 static const char jamo_medial_short_name[21][4] =
 216 {
 217   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 218   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 219 };
 220 static const char jamo_final_short_name[28][3] =
 221 {
 222   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 223   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 224 };
 225
 226 /* Looks up the name of a Unicode character, in uppercase ASCII.
 227    Returns the filled buf, or NULL if the character does not have a name.  */
 228 char *
 229 unicode_character_name (ucs4_t c, char *buf)
 230 {
 231   if (c >= 0xAC00 && c <= 0xD7A3)
 232     {
 233       /* Special case for Hangul syllables. Keeps the tables small.  */
 234       char *ptr;
 235       unsigned int tmp;
 236       unsigned int index1;
 237       unsigned int index2;
 238       unsigned int index3;
 239       const char *q;
 240
 241       /* buf needs to have at least 16 + 7 bytes here.  */
 242       memcpy (buf, "HANGUL SYLLABLE ", 16);
 243       ptr = buf + 16;
 244
 245       tmp = c - 0xAC00;
 246       index3 = tmp % 28; tmp = tmp / 28;
 247       index2 = tmp % 21; tmp = tmp / 21;
 248       index1 = tmp;
 249
 250       q = jamo_initial_short_name[index1];
 251       while (*q != '\0')
 252         *ptr++ = *q++;
 253       q = jamo_medial_short_name[index2];
 254       while (*q != '\0')
 255         *ptr++ = *q++;
 256       q = jamo_final_short_name[index3];
 257       while (*q != '\0')
 258         *ptr++ = *q++;
 259       *ptr = '\0';
 260       return buf;
 261     }
 262   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 263            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 264     {
 265       /* Special case for CJK compatibility ideographs. Keeps the tables
 266          small.  */
 267       char *ptr;
 268       int i;
 269
 270       /* buf needs to have at least 28 + 5 bytes here.  */
 271       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 272       ptr = buf + 28;
 273
 274       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 275         {
 276           unsigned int x = (c >> i) & 0xf;
 277           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 278         }
 279       *ptr = '\0';
 280       return buf;
 281     }
 282   else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
 283     {
 284       /* Special case for variation selectors. Keeps the tables
 285          small.  */
 286
 287       /* buf needs to have at least 19 + 3 bytes here.  */
 288       sprintf (buf, "VARIATION SELECTOR-%d",
 289                c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
 290       return buf;
 291     }
 292   else
 293     {
 294       uint16_t index = unicode_code_to_index (c);
 295       const uint16_t *words = NULL;
 296
 297       if (index != UNINAME_INVALID_INDEX)
 298         {
 299           /* Binary search in unicode_code_to_name.  */
 300           unsigned int i1 = 0;
 301           unsigned int i2 = SIZEOF (unicode_index_to_name);
 302           for (;;)
 303             {
 304               unsigned int i = (i1 + i2) >> 1;
 305               if (unicode_index_to_name[i].index == index)
 306                 {
 307                   words = &unicode_names[unicode_index_to_name[i].name];
 308                   break;
 309                 }
 310               else if (unicode_index_to_name[i].index < index)
 311                 {
 312                   if (i1 == i)
 313                     {
 314                       words = NULL;
 315                       break;
 316                     }
 317                   /* Note here: i1 < i < i2.  */
 318                   i1 = i;
 319                 }
 320               else if (unicode_index_to_name[i].index > index)
 321                 {
 322                   if (i2 == i)
 323                     {
 324                       words = NULL;
 325                       break;
 326                     }
 327                   /* Note here: i1 <= i < i2.  */
 328                   i2 = i;
 329                 }
 330             }
 331         }
 332       if (words != NULL)
 333         {
 334           /* Found it in unicode_index_to_name. Now concatenate the words.  */
 335           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 336           char *ptr = buf;
 337           for (;;)
 338             {
 339               unsigned int wordlen;
 340               const char *word = unicode_name_word (*words>>1, &wordlen);
 341               do
 342                 *ptr++ = *word++;
 343               while (--wordlen > 0);
 344               if ((*words & 1) == 0)
 345                 break;
 346               *ptr++ = ' ';
 347               words++;
 348             }
 349           *ptr = '\0';
 350           return buf;
 351         }
 352       return NULL;
 353     }
 354 }
 355
 356 /* Looks up the Unicode character with a given name, in upper- or lowercase
 357    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 358 ucs4_t
 359 unicode_name_character (const char *name)
 360 {
 361   size_t len = strlen (name);
 362   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 363     {
 364       /* Test for "word1 word2 ..." syntax.  */
 365       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 366       char *ptr = buf;
 367       for (;;)
 368         {
 369           char c = *name++;
 370           if (!(c >= ' ' && c <= '~'))
 371             break;
 372           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 373           if (--len == 0)
 374             goto filled_buf;
 375         }
 376       if (false)
 377       filled_buf:
 378         {
 379           {
 380             /* Special case for variation selector aliases. Keeps the
 381                tables small.  */
 382             const char *p1 = buf;
 383             if (ptr >= buf + 3 && *p1++ == 'V')
 384               {
 385                 if (*p1++ == 'S')
 386                   {
 387                     if (*p1 != '0')
 388                       {
 389                         unsigned int c = 0;
 390                         for (;;)
 391                           {
 392                             if (*p1 >= '0' && *p1 <= '9')
 393                               c += (*p1 - '0');
 394                             p1++;
 395                             if (p1 == ptr)
 396                               {
 397                                 if (c >= 1 && c <= 16)
 398                                   return c - 1 + 0xFE00;
 399                                 else if (c >= 17 && c <= 256)
 400                                   return c - 17 + 0xE0100;
 401                                 else
 402                                   break;
 403                               }
 404                             c = c * 10;
 405                           }
 406                       }
 407                   }
 408               }
 409           }
 410           {
 411             /* Convert the constituents to uint16_t words.  */
 412             uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 413             uint16_t *wordptr = words;
 414             {
 415               const char *p1 = buf;
 416               for (;;)
 417                 {
 418                   {
 419                     int word;
 420                     const char *p2 = p1;
 421                     while (p2 < ptr && *p2 != ' ')
 422                       p2++;
 423                     word = unicode_name_word_lookup (p1, p2 - p1);
 424                     if (word < 0)
 425                       break;
 426                     if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 427                       break;
 428                     *wordptr++ = word;
 429                     if (p2 == ptr)
 430                       goto filled_words;
 431                     p1 = p2 + 1;
 432                   }
 433                   /* Special case for Hangul syllables. Keeps the tables small. */
 434                   if (wordptr == &words[2]
 435                       && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 436                       && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 437                     {
 438                       /* Split the last word [p1..ptr) into three parts:
 439                            1) [BCDGHJKMNPRST]
 440                            2) [AEIOUWY]
 441                            3) [BCDGHIJKLMNPST]
 442                        */
 443                       const char *p2;
 444                       const char *p3;
 445                       const char *p4;
 446
 447                       p2 = p1;
 448                       while (p2 < ptr
 449                              && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 450                                  || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 451                                  || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 452                                  || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 453                                  || *p2 == 'T'))
 454                         p2++;
 455                       p3 = p2;
 456                       while (p3 < ptr
 457                              && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 458                                  || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 459                                  || *p3 == 'Y'))
 460                         p3++;
 461                       p4 = p3;
 462                       while (p4 < ptr
 463                              && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 464                                  || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 465                                  || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 466                                  || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 467                                  || *p4 == 'S' || *p4 == 'T'))
 468                         p4++;
 469                       if (p4 == ptr)
 470                         {
 471                           size_t n1 = p2 - p1;
 472                           size_t n2 = p3 - p2;
 473                           size_t n3 = p4 - p3;
 474
 475                           if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 476                             {
 477                               unsigned int index1;
 478
 479                               for (index1 = 0; index1 < 19; index1++)
 480                                 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
 481                                     && jamo_initial_short_name[index1][n1] == '\0')
 482                                   {
 483                                     unsigned int index2;
 484
 485                                     for (index2 = 0; index2 < 21; index2++)
 486                                       if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
 487                                           && jamo_medial_short_name[index2][n2] == '\0')
 488                                         {
 489                                           unsigned int index3;
 490
 491                                           for (index3 = 0; index3 < 28; index3++)
 492                                             if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
 493                                                 && jamo_final_short_name[index3][n3] == '\0')
 494                                               {
 495                                                 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 496                                               }
 497                                           break;
 498                                         }
 499                                     break;
 500                                   }
 501                             }
 502                         }
 503                     }
 504                   /* Special case for CJK compatibility ideographs. Keeps the
 505                      tables small.  */
 506                   if (wordptr == &words[2]
 507                       && words[0] == UNICODE_CHARNAME_WORD_CJK
 508                       && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 509                       && p1 + 14 <= ptr
 510                       && p1 + 15 >= ptr
 511                       && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 512                     {
 513                       const char *p2 = p1 + 10;
 514
 515                       if (*p2 != '0')
 516                         {
 517                           unsigned int c = 0;
 518
 519                           for (;;)
 520                             {
 521                               if (*p2 >= '0' && *p2 <= '9')
 522                                 c += (*p2 - '0');
 523                               else if (*p2 >= 'A' && *p2 <= 'F')
 524                                 c += (*p2 - 'A' + 10);
 525                               else
 526                                 break;
 527                               p2++;
 528                               if (p2 == ptr)
 529                                 {
 530                                   if ((c >= 0xF900 && c <= 0xFA2D)
 531                                       || (c >= 0xFA30 && c <= 0xFA6A)
 532                                       || (c >= 0xFA70 && c <= 0xFAD9)
 533                                       || (c >= 0x2F800 && c <= 0x2FA1D))
 534                                     return c;
 535                                   else
 536                                     break;
 537                                 }
 538                               c = c << 4;
 539                             }
 540                         }
 541                     }
 542                   /* Special case for variation selectors. Keeps the
 543                      tables small.  */
 544                   if (wordptr == &words[1]
 545                       && words[0] == UNICODE_CHARNAME_WORD_VARIATION
 546                       && p1 + 10 <= ptr
 547                       && p1 + 12 >= ptr
 548                       && memcmp (p1, "SELECTOR-", 9) == 0)
 549                     {
 550                       const char *p2 = p1 + 9;
 551
 552                       if (*p2 != '0')
 553                         {
 554                           unsigned int c = 0;
 555
 556                           for (;;)
 557                             {
 558                               if (*p2 >= '0' && *p2 <= '9')
 559                                 c += (*p2 - '0');
 560                               p2++;
 561                               if (p2 == ptr)
 562                                 {
 563                                   if (c >= 1 && c <= 16)
 564                                     return c - 1 + 0xFE00;
 565                                   else if (c >= 17 && c <= 256)
 566                                     return c - 17 + 0xE0100;
 567                                   else
 568                                     break;
 569                                 }
 570                               c = c * 10;
 571                             }
 572                         }
 573                     }
 574                 }
 575             }
 576             if (false)
 577             filled_words:
 578               {
 579                 /* Multiply by 2, to simplify later comparisons.  */
 580                 size_t words_length = wordptr - words;
 581                 {
 582                   size_t i = words_length - 1;
 583                   words[i] = 2 * words[i];
 584                   for (; i > 0; )
 585                     {
 586                       --i;
 587                       words[i] = 2 * words[i] + 1;
 588                     }
 589                 }
 590                 /* Binary search in unicode_name_to_index.  */
 591                 {
 592                   unsigned int i1 = 0;
 593                   unsigned int i2 = SIZEOF (unicode_name_to_index);
 594                   for (;;)
 595                     {
 596                       unsigned int i = (i1 + i2) >> 1;
 597                       const uint16_t *w = words;
 598                       const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
 599                       size_t n = words_length;
 600                       for (;;)
 601                         {
 602                           if (*p < *w)
 603                             {
 604                               if (i1 == i)
 605                                 goto name_not_found;
 606                               /* Note here: i1 < i < i2.  */
 607                               i1 = i;
 608                               break;
 609                             }
 610                           else if (*p > *w)
 611                             {
 612                               if (i2 == i)
 613                                 goto name_not_found;
 614                               /* Note here: i1 <= i < i2.  */
 615                               i2 = i;
 616                               break;
 617                             }
 618                           p++; w++; n--;
 619                           if (n == 0)
 620                             return unicode_index_to_code (unicode_name_to_index[i].index);
 621                         }
 622                     }
 623                 }
 624               name_not_found: ;
 625               }
 626           }
 627         }
 628     }
 629   return UNINAME_INVALID;
 630 }