libcpp/makeuname2c.cc

   1 /* Make uname2c.h from various sources.
   2    Copyright (C) 2005-2023 Free Software Foundation, Inc.
   3    Contributed by Jakub Jelinek <jakub@redhat.com>
   4
   5 This program is free software; you can redistribute it and/or modify it
   6 under the terms of the GNU General Public License as published by the
   7 Free Software Foundation; either version 3, or (at your option) any
   8 later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; see the file COPYING3.  If not see
  17 <http://www.gnu.org/licenses/>.  */
  18
  19 /* Run this program as
  20    ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h
  21
  22    This program generates 2 big arrays and 2 small ones.
  23    The large ones are uname2c_dict, initialized by string literal
  24    representing dictionary, and uname2c_tree, which is a space optimized
  25    radix tree.
  26    The format of the radix tree is:
  27    byte 0       either 0x80 + (key[0] - ' ')    (if key_len == 1)
  28                 or key_len                      (otherwise)
  29                 either of them ored with 0x40 if it has a codepoint
  30    byte 1       LSB of offset into uname2c_dict for key (only if key_len > 1)
  31    byte 2       MSB of offset into uname2c_dict for key (only if key_len > 1)
  32                 if key_len == 1, the above 2 bytes are omitted
  33    byte 3       LSB of codepoint (only if it has a codepoint)
  34    byte 4       middle byte of codepoint (ditto)
  35    byte 5       MSB of codepoint (ditto), ored with 0x80 if node has children
  36                                    ored with 0x40 if it doesn't have siblings
  37                 if it doesn't have a codepoint, the above 3 bytes are omitted
  38                 and we assume that the node has children
  39    byte 6, 7, 8 uleb128 encoded offset to first child relative to the end
  40                 of the uleb128 (only if node has children)
  41    byte 9       0xff (only if node doesn't have a codepoint and doesn't
  42                       have siblings)
  43
  44    For prefixes of Unicode NR1 or NR2 rule generated names, on a node
  45    representing end of the prefix codepoint is 0xd800 + index into
  46    uname2c_generated array with indexes into uname2c_pairs array of
  47    code points (low, high) of the ranges terminated by single 0.
  48    0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules.
  49 */
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53 #include <string.h>
  54 #include <stdint.h>
  55 #include <ctype.h>
  56 #include <limits.h>
  57 #include <stdarg.h>
  58 #include <stdbool.h>
  59 #include <stdlib.h>
  60
  61 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
  62
  63 #define NUM_CODE_POINTS 0x110000
  64 #define MAX_CODE_POINT 0x10ffff
  65 #define NO_VALUE 0xdc00
  66 #define GENERATED 0xd800
  67
  68 struct entry { const char *name; unsigned long codepoint; };
  69 static struct entry *entries;
  70 static unsigned long num_allocated, num_entries;
  71
  72 /* Unicode 15.1 Table 4-8.  */
  73 struct generated {
  74   const char *prefix;
  75   /* max_high is a workaround for UnicodeData.txt inconsistencies
  76      on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>"
  77      entry is a few code points above the end of the range.  */
  78   unsigned long low, high, max_high;
  79   int idx, ok;
  80 };
  81 static struct generated generated_ranges[] =
  82 { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */
  83   { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */
  84   { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 },
  85   { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 },
  86   { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 },
  87   { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 },
  88   { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 },
  89   { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 },
  90   { "CJK UNIFIED IDEOGRAPH-", 0x2ebf0, 0x2ee5d, 0, 1, 0 },
  91   { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 },
  92   { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 },
  93   { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 },
  94   { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 },
  95   { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 },
  96   { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 },
  97   { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 },
  98   { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 },
  99   { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 }
 100 };
 101
 102 struct node {
 103   struct node *sibling, *child;
 104   const char *key;
 105   size_t key_len, key_idx, node_size, size_sum, child_off;
 106   unsigned long codepoint;
 107   bool in_dict;
 108 };
 109 static struct node *root, **nodes;
 110 static unsigned long num_nodes;
 111 static size_t dict_size, tree_size, max_entry_len;
 112 static char *dict;
 113 static unsigned char *tree;
 114
 115 /* Die!  */
 116
 117 static void
 118 fail (const char *s, ...)
 119 {
 120   va_list ap;
 121
 122   va_start (ap, s);
 123   vfprintf (stderr, s, ap);
 124   va_end (ap);
 125   fputc ('\n', stderr);
 126   exit (1);
 127 }
 128
 129 static void *
 130 xmalloc (size_t size)
 131 {
 132   void *ret = malloc (size);
 133
 134   if (ret == NULL)
 135     fail ("failed to allocate %ld bytes", (long) size);
 136   return ret;
 137 }
 138
 139 static void *
 140 xrealloc (void *p, size_t size)
 141 {
 142   void *ret = p ? realloc (p, size) : malloc (size);
 143
 144   if (ret == NULL)
 145     fail ("failed to allocate %ld bytes", (long) size);
 146   return ret;
 147 }
 148
 149 static int
 150 entrycmp (const void *p1, const void *p2)
 151 {
 152   const struct entry *e1 = (const struct entry *) p1;
 153   const struct entry *e2 = (const struct entry *) p2;
 154   int ret = strcmp (e1->name, e2->name);
 155
 156   if (ret != 0)
 157     return ret;
 158   if (e1->codepoint < e2->codepoint)
 159     return -1;
 160   if (e1->codepoint > e2->codepoint)
 161     return 1;
 162   return 0;
 163 }
 164
 165 static int
 166 nodecmp (const void *p1, const void *p2)
 167 {
 168   const struct node *n1 = *(const struct node *const *) p1;
 169   const struct node *n2 = *(const struct node *const *) p2;
 170   if (n1->key_len > n2->key_len)
 171     return -1;
 172   if (n1->key_len < n2->key_len)
 173     return 1;
 174   return memcmp (n1->key, n2->key, n1->key_len);
 175 }
 176
 177 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
 178    decompositions of characters for which both the character
 179    decomposed and all the code points in the decomposition are valid
 180    for some supported language version, and the 'all_decomp' table to
 181    be the decompositions of all characters without those
 182    constraints.  */
 183
 184 static void
 185 read_table (char *fname, bool aliases_p)
 186 {
 187   FILE *f = fopen (fname, "r");
 188   const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt";
 189
 190   if (!f)
 191     fail ("opening %s", sname);
 192   for (;;)
 193     {
 194       char line[256];
 195       unsigned long codepoint;
 196       const char *name, *aname;
 197       char *l;
 198       size_t i;
 199
 200       if (!fgets (line, sizeof (line), f))
 201         break;
 202       codepoint = strtoul (line, &l, 16);
 203       if (l == line && aliases_p)
 204         {
 205           /* NameAliased.txt can contain comments and empty lines.  */
 206           if (*line == '#' || *line == '\n')
 207             continue;
 208         }
 209       if (l == line || *l != ';')
 210         fail ("parsing %s, reading code point", sname);
 211       if (codepoint > MAX_CODE_POINT)
 212         fail ("parsing %s, code point too large", sname);
 213
 214       name = l + 1;
 215       do {
 216         ++l;
 217       } while (*l != ';');
 218
 219       aname = NULL;
 220       if (aliases_p)
 221         {
 222           /* Ignore figment and abbreviation aliases.  */
 223           if (strcmp (l + 1, "correction\n") != 0
 224               && strcmp (l + 1, "control\n") != 0
 225               && strcmp (l + 1, "alternate\n") != 0)
 226             continue;
 227           i = ARRAY_SIZE (generated_ranges);
 228         }
 229       else
 230         {
 231           for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 232             if (codepoint >= generated_ranges[i].low
 233                 && codepoint <= generated_ranges[i].max_high)
 234               break;
 235           if (i != ARRAY_SIZE (generated_ranges))
 236             {
 237               if (*name == '<' && l[-1] == '>')
 238                 {
 239                   if (codepoint == generated_ranges[i].low
 240                       && l - name >= 9
 241                       && memcmp (l - 8, ", First>", 8) == 0
 242                       && generated_ranges[i].ok == 0)
 243                     {
 244                       generated_ranges[i].ok = INT_MAX - 1;
 245                       aname = generated_ranges[i].prefix;
 246                       codepoint = GENERATED + generated_ranges[i].idx;
 247                     }
 248                   /* Unfortunately, UnicodeData.txt isn't consistent
 249                      with the Table 4-8 range endpoints in 3 cases,
 250                      the ranges are longer there by a few codepoints.
 251                      So use the max_high hack to avoid verification
 252                      failures.  */
 253                   else if (codepoint == generated_ranges[i].max_high
 254                            && l - name >= 8
 255                            && memcmp (l - 7, ", Last>", 7) == 0
 256                            && generated_ranges[i].ok == INT_MAX - 1)
 257                     {
 258                       generated_ranges[i].ok = INT_MAX;
 259                       continue;
 260                     }
 261                   else
 262                     fail ("unexpected generated entry %lx %.*s",
 263                           codepoint, (int) (l - name), name);
 264                 }
 265               else if (codepoint
 266                        == generated_ranges[i].low + generated_ranges[i].ok
 267                        && l - name == (strlen (generated_ranges[i].prefix)
 268                                        + (name - 1 - line))
 269                        && memcmp (name, generated_ranges[i].prefix,
 270                                   strlen (generated_ranges[i].prefix)) == 0
 271                        && memcmp (name + strlen (generated_ranges[i].prefix),
 272                                   line, name - 1 - line) == 0)
 273                 {
 274                   ++generated_ranges[i].ok;
 275                   if (codepoint != generated_ranges[i].low)
 276                     continue;
 277                   aname = generated_ranges[i].prefix;
 278                   codepoint = GENERATED + generated_ranges[i].idx;
 279                 }
 280               else
 281                 fail ("unexpected generated entry %lx %.*s",
 282                       codepoint, (int) (l - name), name);
 283               if (aname == generated_ranges[i].prefix)
 284                 {
 285                   size_t j;
 286
 287                   /* Don't add an entry for a generated range where the
 288                      same prefix has been added already.  */
 289                   for (j = 0; j < i; ++j)
 290                     if (generated_ranges[j].idx == generated_ranges[i].idx
 291                         && generated_ranges[j].ok != 0)
 292                       break;
 293                   if (j < i)
 294                     continue;
 295                 }
 296             }
 297           else if (*name == '<' && l[-1] == '>')
 298             continue;
 299         }
 300
 301       if (num_entries == num_allocated)
 302         {
 303           num_allocated = num_allocated ? 2 * num_allocated : 65536;
 304           entries = (struct entry *) xrealloc (entries, num_allocated
 305                                                         * sizeof (entries[0]));
 306         }
 307
 308       if (aname == NULL)
 309         {
 310           char *a = (char *) xmalloc (l + 1 - name);
 311           if (l - name > max_entry_len)
 312             max_entry_len = l - name;
 313           memcpy (a, name, l - name);
 314           a[l - name] = '\0';
 315           aname = a;
 316         }
 317       entries[num_entries].name = aname;
 318       entries[num_entries++].codepoint = codepoint;
 319     }
 320   if (ferror (f))
 321     fail ("reading %s", sname);
 322   fclose (f);
 323 }
 324
 325 /* Assumes nodes are added from sorted array, so we never
 326    add any node before existing one, only after it.  */
 327
 328 static void
 329 node_add (struct node **p, const char *key, size_t key_len,
 330           unsigned long codepoint)
 331 {
 332   struct node *n;
 333   size_t i;
 334
 335   do
 336     {
 337       if (*p == NULL)
 338         {
 339           *p = n = (struct node *) xmalloc (sizeof (struct node));
 340           ++num_nodes;
 341           assert (key_len);
 342           n->sibling = NULL;
 343           n->child = NULL;
 344           n->key = key;
 345           n->key_len = key_len;
 346           n->codepoint = codepoint;
 347           return;
 348         }
 349       n = *p;
 350       for (i = 0; i < n->key_len && i < key_len; ++i)
 351         if (n->key[i] != key[i])
 352           break;
 353       if (i == 0)
 354         {
 355           p = &n->sibling;
 356           continue;
 357         }
 358       if (i == n->key_len)
 359         {
 360           assert (key_len > n->key_len);
 361           p = &n->child;
 362           key += n->key_len;
 363           key_len -= n->key_len;
 364           continue;
 365         }
 366       /* Need to split the node.  */
 367       assert (i < key_len);
 368       n = (struct node *) xmalloc (sizeof (struct node));
 369       ++num_nodes;
 370       n->sibling = NULL;
 371       n->child = (*p)->child;
 372       n->key = (*p)->key + i;
 373       n->key_len = (*p)->key_len - i;
 374       n->codepoint = (*p)->codepoint;
 375       (*p)->child = n;
 376       (*p)->key_len = i;
 377       (*p)->codepoint = NO_VALUE;
 378       key += i;
 379       key_len -= i;
 380       p = &n->sibling;
 381     }
 382   while (1);
 383 }
 384
 385 static void
 386 append_nodes (struct node *n)
 387 {
 388   for (; n; n = n->sibling)
 389     {
 390       nodes[num_nodes++] = n;
 391       append_nodes (n->child);
 392     }
 393 }
 394
 395 static size_t
 396 sizeof_uleb128 (size_t val)
 397 {
 398   size_t sz = 0;
 399   do
 400     {
 401       val >>= 7;
 402       sz += 1;
 403     }
 404   while (val != 0);
 405   return sz;
 406 }
 407
 408 static void
 409 size_nodes (struct node *n)
 410 {
 411   if (n->child)
 412     size_nodes (n->child);
 413   if (n->sibling)
 414     size_nodes (n->sibling);
 415   n->node_size = 1 + (n->key_len > 1) * 2;
 416   if (n->codepoint != NO_VALUE)
 417     n->node_size += 3;
 418   else if (n->sibling == NULL)
 419     ++n->node_size;
 420   n->size_sum = 0;
 421   n->child_off = 0;
 422   if (n->sibling)
 423     n->size_sum += n->sibling->size_sum;
 424   if (n->child)
 425     {
 426       n->child_off = n->size_sum + (n->codepoint == NO_VALUE
 427                                     && n->sibling == NULL);
 428       n->node_size += sizeof_uleb128 (n->child_off);
 429     }
 430   n->size_sum += n->node_size;
 431   if (n->child)
 432     n->size_sum += n->child->size_sum;
 433   tree_size += n->node_size;
 434 }
 435
 436 static void
 437 write_uleb128 (unsigned char *p, size_t val)
 438 {
 439   unsigned char c;
 440   do
 441     {
 442       c = val & 0x7f;
 443       val >>= 7;
 444       if (val)
 445         c |= 0x80;
 446       *p++ = c;
 447     }
 448   while (val);
 449 }
 450
 451 static void
 452 write_nodes (struct node *n, size_t off)
 453 {
 454   for (; n; n = n->sibling)
 455     {
 456       assert (off < tree_size && tree[off] == 0);
 457       if (n->key_len > 1)
 458         {
 459           assert (n->key_len < 64);
 460           tree[off] = n->key_len;
 461         }
 462       else
 463         tree[off] = (n->key[0] - ' ') | 0x80;
 464       assert ((tree[off] & 0x40) == 0);
 465       if (n->codepoint != NO_VALUE)
 466         tree[off] |= 0x40;
 467       off++;
 468       if (n->key_len > 1)
 469         {
 470           tree[off++] = n->key_idx & 0xff;
 471           tree[off++] = (n->key_idx >> 8) & 0xff;
 472         }
 473       if (n->codepoint != NO_VALUE)
 474         {
 475           assert (n->codepoint < (1L << 21));
 476           tree[off++] = n->codepoint & 0xff;
 477           tree[off++] = (n->codepoint >> 8) & 0xff;
 478           tree[off] = (n->codepoint >> 16) & 0xff;
 479           if (n->child)
 480             tree[off] |= 0x80;
 481           if (!n->sibling)
 482             tree[off] |= 0x40;
 483           off++;
 484         }
 485       if (n->child)
 486         {
 487           write_uleb128 (&tree[off], n->child_off);
 488           off += sizeof_uleb128 (n->child_off);
 489           write_nodes (n->child, off + n->child_off);
 490         }
 491       if (n->codepoint == NO_VALUE
 492           && n->sibling == NULL)
 493         tree[off++] = 0xff;
 494     }
 495   assert (off <= tree_size);
 496 }
 497
 498 static void
 499 build_radix_tree (void)
 500 {
 501   size_t i, j, k, key_idx;
 502
 503   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 504     if (generated_ranges[i].ok == INT_MAX)
 505       {
 506         if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
 507           break;
 508       }
 509     else if (generated_ranges[i].ok == (generated_ranges[i].high
 510                                         - generated_ranges[i].low + 1))
 511       {
 512         if (generated_ranges[i].max_high != generated_ranges[i].high)
 513           break;
 514       }
 515     else
 516       break;
 517   if (i < ARRAY_SIZE (generated_ranges))
 518     fail ("uncovered generated range %s %lx %lx",
 519           generated_ranges[i].prefix, generated_ranges[i].low,
 520           generated_ranges[i].high);
 521   /* Sort entries alphabetically, node_add relies on that.  */
 522   qsort (entries, num_entries, sizeof (struct entry), entrycmp);
 523   for (i = 1; i < num_entries; ++i)
 524     if (i && strcmp (entries[i].name, entries[i - 1].name) == 0)
 525       fail ("multiple entries for name %s", entries[i].name);
 526
 527   for (i = 0; i < num_entries; ++i)
 528     node_add (&root, entries[i].name, strlen (entries[i].name),
 529               entries[i].codepoint);
 530
 531   nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
 532   i = num_nodes;
 533   num_nodes = 0;
 534   append_nodes (root);
 535   assert (num_nodes == i);
 536   /* Sort node pointers by decreasing string length to handle substrings
 537      right.  */
 538   qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
 539   if (nodes[0]->key_len >= 64)
 540     /* We could actually encode even 64 and 65, as key_len 0 and 1 will
 541        never appear in the multiple letter key encodings, so could subtract
 542        2.  */
 543     fail ("can't encode key length %d >= 64, so need to split some radix "
 544           "tree nodes to ensure length fits", nodes[0]->key_len);
 545
 546   /* Verify a property charset.cc UAX44-LM2 matching relies on:
 547      if - is at the end of key of some node, then all its siblings
 548      start with alphanumeric characters.
 549      Only 2 character names and 1 alias have - followed by space:
 550      U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
 551      U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
 552      U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
 553      so the KA- in there will always be followed at least by SHOG
 554      in the same node.
 555      If this changes, charset.cc needs to change.  */
 556   for (i = 0; i < num_nodes; ++i)
 557     if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
 558         && nodes[i]->child)
 559       {
 560         struct node *n;
 561
 562         for (n = nodes[i]->child; n; n = n->sibling)
 563           if (n->key[0] == ' ')
 564             fail ("node with key %.*s followed by node with key %.*s",
 565                   (int) nodes[i]->key_len, nodes[i]->key,
 566                   (int) n->key_len, n->key);
 567       }
 568
 569   /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
 570      fortunately num_nodes is < 64K and key_len < 64.  */
 571   key_idx = 0;
 572   for (i = 0; i < num_nodes; ++i)
 573     {
 574       nodes[i]->key_idx = SIZE_MAX;
 575       nodes[i]->in_dict = false;
 576       if (nodes[i]->key_len > 1)
 577         {
 578           for (j = 0; j < i; ++j)
 579             /* Can't rely on memmem unfortunately.  */
 580             if (nodes[j]->in_dict)
 581               {
 582                 for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
 583                   if (nodes[j]->key[k] == nodes[i]->key[0]
 584                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 585                                  nodes[i]->key_len - 1) == 0)
 586                     {
 587                       nodes[i]->key_idx = nodes[j]->key_idx + k;
 588                       j = i;
 589                       break;
 590                     }
 591                 if (j == i)
 592                   break;
 593                 for (; k < nodes[j]->key_len; ++k)
 594                   if (nodes[j]->key[k] == nodes[i]->key[0]
 595                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 596                                  nodes[j]->key_len - 1 - k) == 0)
 597                     {
 598                       size_t l;
 599
 600                       for (l = j + 1; l < i; ++l)
 601                         if (nodes[l]->in_dict)
 602                           break;
 603                       if (l < i
 604                           && memcmp (nodes[l]->key,
 605                                      nodes[i]->key + (nodes[j]->key_len - k),
 606                                      nodes[i]->key_len
 607                                      - (nodes[j]->key_len - k)) == 0)
 608                         {
 609                           nodes[i]->key_idx = nodes[j]->key_idx + k;
 610                           j = i;
 611                         }
 612                       else
 613                         j = l - 1;
 614                       break;
 615                     }
 616               }
 617           if (nodes[i]->key_idx == SIZE_MAX)
 618             {
 619               nodes[i]->key_idx = key_idx;
 620               nodes[i]->in_dict = true;
 621               key_idx += nodes[i]->key_len;
 622             }
 623         }
 624     }
 625   if (key_idx >= 65536)
 626     /* We only use 2 bytes for offsets into the dictionary.
 627        If it grows more, there is e.g. a possibility to replace
 628        most often seen words or substrings in the dictionary
 629        with characters other than [A-Z0-9 -] (say LETTER occurs
 630        in the dictionary almost 197 times and so by using a
 631        instead of LETTER we could save (6 - 1) * 197 bytes,
 632        with some on the side table mapping 'a' to "LETTER".  */
 633     fail ("too large dictionary %ld", (long) key_idx);
 634   dict_size = key_idx;
 635
 636   size_nodes (root);
 637
 638   dict = (char *) xmalloc (dict_size + 1);
 639   for (i = 0; i < num_nodes; ++i)
 640     if (nodes[i]->in_dict)
 641       memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
 642   dict[dict_size] = '\0';
 643
 644   tree = (unsigned char *) xmalloc (tree_size);
 645   memset (tree, 0, tree_size);
 646   write_nodes (root, 0);
 647 }
 648
 649 /* Print out the huge copyright notice.  */
 650
 651 static void
 652 write_copyright (void)
 653 {
 654   static const char copyright[] = "\
 655 /* Unicode name to codepoint.\n\
 656    Copyright (C) 2005-2023 Free Software Foundation, Inc.\n\
 657 \n\
 658    This program is free software; you can redistribute it and/or modify it\n\
 659    under the terms of the GNU General Public License as published by the\n\
 660    Free Software Foundation; either version 3, or (at your option) any\n\
 661    later version.\n\
 662 \n\
 663    This program is distributed in the hope that it will be useful,\n\
 664    but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
 665    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\
 666    GNU General Public License for more details.\n\
 667 \n\
 668    You should have received a copy of the GNU General Public License\n\
 669    along with this program; see the file COPYING3.  If not see\n\
 670    <http://www.gnu.org/licenses/>.\n\
 671 \n\
 672 \n\
 673    Copyright (C) 1991-2023 Unicode, Inc.  All rights reserved.\n\
 674    Distributed under the Terms of Use in\n\
 675    http://www.unicode.org/copyright.html.\n\
 676 \n\
 677    Permission is hereby granted, free of charge, to any person\n\
 678    obtaining a copy of the Unicode data files and any associated\n\
 679    documentation (the \"Data Files\") or Unicode software and any\n\
 680    associated documentation (the \"Software\") to deal in the Data Files\n\
 681    or Software without restriction, including without limitation the\n\
 682    rights to use, copy, modify, merge, publish, distribute, and/or\n\
 683    sell copies of the Data Files or Software, and to permit persons to\n\
 684    whom the Data Files or Software are furnished to do so, provided\n\
 685    that (a) the above copyright notice(s) and this permission notice\n\
 686    appear with all copies of the Data Files or Software, (b) both the\n\
 687    above copyright notice(s) and this permission notice appear in\n\
 688    associated documentation, and (c) there is clear notice in each\n\
 689    modified Data File or in the Software as well as in the\n\
 690    documentation associated with the Data File(s) or Software that the\n\
 691    data or software has been modified.\n\
 692 \n\
 693    THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
 694    OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
 695    WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
 696    NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
 697    COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
 698    ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
 699    DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
 700    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
 701    ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
 702    OF THE DATA FILES OR SOFTWARE.\n\
 703 \n\
 704    Except as contained in this notice, the name of a copyright holder\n\
 705    shall not be used in advertising or otherwise to promote the sale,\n\
 706    use or other dealings in these Data Files or Software without prior\n\
 707    written authorization of the copyright holder.  */\n";
 708
 709    puts (copyright);
 710 }
 711
 712 static void
 713 write_dict (void)
 714 {
 715   size_t i;
 716
 717   printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
 718   for (i = 0; i < dict_size; i += 77)
 719     printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : "");
 720   puts ("");
 721 }
 722
 723 static void
 724 write_tree (void)
 725 {
 726   size_t i, j;
 727
 728   printf ("static const unsigned char uname2c_tree[%ld] = {\n",
 729           (long) tree_size);
 730   for (i = 0, j = 0; i < tree_size; ++i)
 731     {
 732       printf ("%s0x%02x%s", j == 0 ? "  " : "", tree[i],
 733               i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
 734       if (j == 11)
 735         j = 0;
 736       else
 737         ++j;
 738     }
 739 }
 740
 741 static void
 742 write_generated (void)
 743 {
 744   size_t i, j;
 745
 746   puts ("static const cppchar_t uname2c_pairs[] = {");
 747   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 748     {
 749       if (i == 0)
 750         ;
 751       else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
 752         puts (", 0,");
 753       else
 754         puts (",");
 755       printf ("  0x%lx, 0x%lx /* %s */",
 756               generated_ranges[i].low,
 757               generated_ranges[i].high,
 758               generated_ranges[i].prefix);
 759     }
 760   puts (", 0 };\n");
 761
 762   puts ("static const unsigned char uname2c_generated[] = {");
 763   for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
 764     {
 765       if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
 766         printf ("%s  %d /* %s */", i ? ",\n" : "",
 767                 ++j, generated_ranges[i].prefix);
 768       j += 2;
 769     }
 770   puts (" };\n");
 771 }
 772
 773 /* Main program.  */
 774
 775 int
 776 main (int argc, char **argv)
 777 {
 778   size_t i;
 779
 780   if (argc != 3)
 781     fail ("too few arguments to makeradixtree");
 782   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 783     if (!generated_ranges[i].max_high)
 784       generated_ranges[i].max_high = generated_ranges[i].high;
 785   read_table (argv[1], false);
 786   read_table (argv[2], true);
 787   build_radix_tree ();
 788
 789   write_copyright ();
 790   write_dict ();
 791   write_tree ();
 792   write_generated ();
 793   printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len);
 794   return 0;
 795 }