libcpp/makeuname2c.cc

   1 /* Make uname2c.h from various sources.
   2    Copyright (C) 2005-2022 Free Software Foundation, Inc.
   3    Contributed by Jakub Jelinek <jakub@redhat.com>
   4
   5 This program is free software; you can redistribute it and/or modify it
   6 under the terms of the GNU General Public License as published by the
   7 Free Software Foundation; either version 3, or (at your option) any
   8 later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; see the file COPYING3.  If not see
  17 <http://www.gnu.org/licenses/>.  */
  18
  19 /* Run this program as
  20    ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h
  21
  22    This program generates 2 big arrays and 2 small ones.
  23    The large ones are uname2c_dict, initialized by string literal
  24    representing dictionary, and uname2c_tree, which is a space optimized
  25    radix tree.
  26    The format of the radix tree is:
  27    byte 0       either 0x80 + (key[0] - ' ')    (if key_len == 1)
  28                 or key_len                      (otherwise)
  29                 either of them ored with 0x40 if it has a codepoint
  30    byte 1       LSB of offset into uname2c_dict for key (only if key_len > 1)
  31    byte 2       MSB of offset into uname2c_dict for key (only if key_len > 1)
  32                 if key_len == 1, the above 2 bytes are omitted
  33    byte 3       LSB of codepoint (only if it has a codepoint)
  34    byte 4       middle byte of codepoint (ditto)
  35    byte 5       MSB of codepoint (ditto), ored with 0x80 if node has children
  36                                    ored with 0x40 if it doesn't have siblings
  37                 if it doesn't have a codepoint, the above 3 bytes are omitted
  38                 and we assume that the node has children
  39    byte 6, 7, 8 uleb128 encoded offset to first child relative to the end
  40                 of the uleb128 (only if node has children)
  41    byte 9       0xff (only if node doesn't have a codepoint and doesn't
  42                       have siblings)
  43
  44    For prefixes of Unicode NR1 or NR2 rule generated names, on a node
  45    representing end of the prefix codepoint is 0xd800 + index into
  46    uname2c_generated array with indexes into uname2c_pairs array of
  47    code points (low, high) of the ranges terminated by single 0.
  48    0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules.
  49 */
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53 #include <string.h>
  54 #include <stdint.h>
  55 #include <ctype.h>
  56 #include <limits.h>
  57 #include <stdarg.h>
  58 #include <stdbool.h>
  59 #include <stdlib.h>
  60
/* Number of elements in the array A.  A must be a real array, not a
   pointer.  */
#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))

/* Size of the Unicode code space and the largest valid code point.  */
#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT 0x10ffff
/* Value stored in a node's codepoint field when the node itself carries
   no code point (e.g. the front half of a split node).  */
#define NO_VALUE 0xdc00
/* Base added to a generated_ranges idx to form the pseudo code point
   recorded for the prefix of algorithmically generated names.  */
#define GENERATED 0xd800
  67
  68 struct entry { const char *name; unsigned long codepoint; };
  69 static struct entry *entries;
  70 static unsigned long num_allocated, num_entries;
  71
  72 /* Unicode 14 Table 4-8.  */
  73 struct generated {
  74   const char *prefix;
  75   /* max_high is a workaround for UnicodeData.txt inconsistencies
  76      on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>"
  77      entry is a few code points above the end of the range.  */
  78   unsigned long low, high, max_high;
  79   int idx, ok;
  80 };
  81 static struct generated generated_ranges[] =
  82 { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */
  83   { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */
  84   { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9ffc, 0x9fff, 1, 0 },
  85   { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6dd, 0x2a6df, 1, 0 },
  86   { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b734, 0x2b738, 1, 0 },
  87   { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 },
  88   { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 },
  89   { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 },
  90   { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 },
  91   { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 },
  92   { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 },
  93   { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 },
  94   { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 },
  95   { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 },
  96   { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 },
  97   { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 }
  98 };
  99
 100 struct node {
 101   struct node *sibling, *child;
 102   const char *key;
 103   size_t key_len, key_idx, node_size, size_sum, child_off;
 104   unsigned long codepoint;
 105   bool in_dict;
 106 };
 107 static struct node *root, **nodes;
 108 static unsigned long num_nodes;
 109 static size_t dict_size, tree_size, max_entry_len;
 110 static char *dict;
 111 static unsigned char *tree;
 112
 113 /* Die!  */
 114
 115 static void
 116 fail (const char *s, ...)
 117 {
 118   va_list ap;
 119
 120   va_start (ap, s);
 121   vfprintf (stderr, s, ap);
 122   va_end (ap);
 123   fputc ('\n', stderr);
 124   exit (1);
 125 }
 126
 127 static void *
 128 xmalloc (size_t size)
 129 {
 130   void *ret = malloc (size);
 131
 132   if (ret == NULL)
 133     fail ("failed to allocate %ld bytes", (long) size);
 134   return ret;
 135 }
 136
 137 static void *
 138 xrealloc (void *p, size_t size)
 139 {
 140   void *ret = p ? realloc (p, size) : malloc (size);
 141
 142   if (ret == NULL)
 143     fail ("failed to allocate %ld bytes", (long) size);
 144   return ret;
 145 }
 146
 147 static int
 148 entrycmp (const void *p1, const void *p2)
 149 {
 150   const struct entry *e1 = (const struct entry *) p1;
 151   const struct entry *e2 = (const struct entry *) p2;
 152   int ret = strcmp (e1->name, e2->name);
 153
 154   if (ret != 0)
 155     return ret;
 156   if (e1->codepoint < e2->codepoint)
 157     return -1;
 158   if (e1->codepoint > e2->codepoint)
 159     return 1;
 160   return 0;
 161 }
 162
 163 static int
 164 nodecmp (const void *p1, const void *p2)
 165 {
 166   const struct node *n1 = *(const struct node *const *) p1;
 167   const struct node *n2 = *(const struct node *const *) p2;
 168   if (n1->key_len > n2->key_len)
 169     return -1;
 170   if (n1->key_len < n2->key_len)
 171     return 1;
 172   return memcmp (n1->key, n2->key, n1->key_len);
 173 }
 174
 175 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
 176    decompositions of characters for which both the character
 177    decomposed and all the code points in the decomposition are valid
 178    for some supported language version, and the 'all_decomp' table to
 179    be the decompositions of all characters without those
 180    constraints.  */
 181
 182 static void
 183 read_table (char *fname, bool aliases_p)
 184 {
 185   FILE *f = fopen (fname, "r");
 186   const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt";
 187
 188   if (!f)
 189     fail ("opening %s", sname);
 190   for (;;)
 191     {
 192       char line[256];
 193       unsigned long codepoint;
 194       const char *name, *aname;
 195       char *l;
 196       size_t i;
 197
 198       if (!fgets (line, sizeof (line), f))
 199         break;
 200       codepoint = strtoul (line, &l, 16);
 201       if (l == line && aliases_p)
 202         {
 203           /* NameAliased.txt can contain comments and empty lines.  */
 204           if (*line == '#' || *line == '\n')
 205             continue;
 206         }
 207       if (l == line || *l != ';')
 208         fail ("parsing %s, reading code point", sname);
 209       if (codepoint > MAX_CODE_POINT)
 210         fail ("parsing %s, code point too large", sname);
 211
 212       name = l + 1;
 213       do {
 214         ++l;
 215       } while (*l != ';');
 216
 217       aname = NULL;
 218       if (aliases_p)
 219         {
 220           /* Ignore figment and abbreviation aliases.  */
 221           if (strcmp (l + 1, "correction\n") != 0
 222               && strcmp (l + 1, "control\n") != 0
 223               && strcmp (l + 1, "alternate\n") != 0)
 224             continue;
 225           i = ARRAY_SIZE (generated_ranges);
 226         }
 227       else
 228         {
 229           for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 230             if (codepoint >= generated_ranges[i].low
 231                 && codepoint <= generated_ranges[i].max_high)
 232               break;
 233           if (i != ARRAY_SIZE (generated_ranges))
 234             {
 235               if (*name == '<' && l[-1] == '>')
 236                 {
 237                   if (codepoint == generated_ranges[i].low
 238                       && l - name >= 9
 239                       && memcmp (l - 8, ", First>", 8) == 0
 240                       && generated_ranges[i].ok == 0)
 241                     {
 242                       generated_ranges[i].ok = INT_MAX - 1;
 243                       aname = generated_ranges[i].prefix;
 244                       codepoint = GENERATED + generated_ranges[i].idx;
 245                     }
 246                   /* Unfortunately, UnicodeData.txt isn't consistent
 247                      with the Table 4-8 range endpoints in 3 cases,
 248                      the ranges are longer there by a few codepoints.
 249                      So use the max_high hack to avoid verification
 250                      failures.  */
 251                   else if (codepoint == generated_ranges[i].max_high
 252                            && l - name >= 8
 253                            && memcmp (l - 7, ", Last>", 7) == 0
 254                            && generated_ranges[i].ok == INT_MAX - 1)
 255                     {
 256                       generated_ranges[i].ok = INT_MAX;
 257                       continue;
 258                     }
 259                   else
 260                     fail ("unexpected generated entry %lx %.*s",
 261                           codepoint, (int) (l - name), name);
 262                 }
 263               else if (codepoint
 264                        == generated_ranges[i].low + generated_ranges[i].ok
 265                        && l - name == (strlen (generated_ranges[i].prefix)
 266                                        + (name - 1 - line))
 267                        && memcmp (name, generated_ranges[i].prefix,
 268                                   strlen (generated_ranges[i].prefix)) == 0
 269                        && memcmp (name + strlen (generated_ranges[i].prefix),
 270                                   line, name - 1 - line) == 0)
 271                 {
 272                   ++generated_ranges[i].ok;
 273                   if (codepoint != generated_ranges[i].low)
 274                     continue;
 275                   aname = generated_ranges[i].prefix;
 276                   codepoint = GENERATED + generated_ranges[i].idx;
 277                 }
 278               else
 279                 fail ("unexpected generated entry %lx %.*s",
 280                       codepoint, (int) (l - name), name);
 281               if (aname == generated_ranges[i].prefix)
 282                 {
 283                   size_t j;
 284
 285                   /* Don't add an entry for a generated range where the
 286                      same prefix has been added already.  */
 287                   for (j = 0; j < i; ++j)
 288                     if (generated_ranges[j].idx == generated_ranges[i].idx
 289                         && generated_ranges[j].ok != 0)
 290                       break;
 291                   if (j < i)
 292                     continue;
 293                 }
 294             }
 295           else if (*name == '<' && l[-1] == '>')
 296             continue;
 297         }
 298
 299       if (num_entries == num_allocated)
 300         {
 301           num_allocated = num_allocated ? 2 * num_allocated : 65536;
 302           entries = (struct entry *) xrealloc (entries, num_allocated
 303                                                         * sizeof (entries[0]));
 304         }
 305
 306       if (aname == NULL)
 307         {
 308           char *a = (char *) xmalloc (l + 1 - name);
 309           if (l - name > max_entry_len)
 310             max_entry_len = l - name;
 311           memcpy (a, name, l - name);
 312           a[l - name] = '\0';
 313           aname = a;
 314         }
 315       entries[num_entries].name = aname;
 316       entries[num_entries++].codepoint = codepoint;
 317     }
 318   if (ferror (f))
 319     fail ("reading %s", sname);
 320   fclose (f);
 321 }
 322
 323 /* Assumes nodes are added from sorted array, so we never
 324    add any node before existing one, only after it.  */
 325
 326 static void
 327 node_add (struct node **p, const char *key, size_t key_len,
 328           unsigned long codepoint)
 329 {
 330   struct node *n;
 331   size_t i;
 332
 333   do
 334     {
 335       if (*p == NULL)
 336         {
 337           *p = n = (struct node *) xmalloc (sizeof (struct node));
 338           ++num_nodes;
 339           assert (key_len);
 340           n->sibling = NULL;
 341           n->child = NULL;
 342           n->key = key;
 343           n->key_len = key_len;
 344           n->codepoint = codepoint;
 345           return;
 346         }
 347       n = *p;
 348       for (i = 0; i < n->key_len && i < key_len; ++i)
 349         if (n->key[i] != key[i])
 350           break;
 351       if (i == 0)
 352         {
 353           p = &n->sibling;
 354           continue;
 355         }
 356       if (i == n->key_len)
 357         {
 358           assert (key_len > n->key_len);
 359           p = &n->child;
 360           key += n->key_len;
 361           key_len -= n->key_len;
 362           continue;
 363         }
 364       /* Need to split the node.  */
 365       assert (i < key_len);
 366       n = (struct node *) xmalloc (sizeof (struct node));
 367       ++num_nodes;
 368       n->sibling = NULL;
 369       n->child = (*p)->child;
 370       n->key = (*p)->key + i;
 371       n->key_len = (*p)->key_len - i;
 372       n->codepoint = (*p)->codepoint;
 373       (*p)->child = n;
 374       (*p)->key_len = i;
 375       (*p)->codepoint = NO_VALUE;
 376       key += i;
 377       key_len -= i;
 378       p = &n->sibling;
 379     }
 380   while (1);
 381 }
 382
 383 static void
 384 append_nodes (struct node *n)
 385 {
 386   for (; n; n = n->sibling)
 387     {
 388       nodes[num_nodes++] = n;
 389       append_nodes (n->child);
 390     }
 391 }
 392
 393 static size_t
 394 sizeof_uleb128 (size_t val)
 395 {
 396   size_t sz = 0;
 397   do
 398     {
 399       val >>= 7;
 400       sz += 1;
 401     }
 402   while (val != 0);
 403   return sz;
 404 }
 405
 406 static void
 407 size_nodes (struct node *n)
 408 {
 409   if (n->child)
 410     size_nodes (n->child);
 411   if (n->sibling)
 412     size_nodes (n->sibling);
 413   n->node_size = 1 + (n->key_len > 1) * 2;
 414   if (n->codepoint != NO_VALUE)
 415     n->node_size += 3;
 416   else if (n->sibling == NULL)
 417     ++n->node_size;
 418   n->size_sum = 0;
 419   n->child_off = 0;
 420   if (n->sibling)
 421     n->size_sum += n->sibling->size_sum;
 422   if (n->child)
 423     {
 424       n->child_off = n->size_sum + (n->codepoint == NO_VALUE
 425                                     && n->sibling == NULL);
 426       n->node_size += sizeof_uleb128 (n->child_off);
 427     }
 428   n->size_sum += n->node_size;
 429   if (n->child)
 430     n->size_sum += n->child->size_sum;
 431   tree_size += n->node_size;
 432 }
 433
 434 static void
 435 write_uleb128 (unsigned char *p, size_t val)
 436 {
 437   unsigned char c;
 438   do
 439     {
 440       c = val & 0x7f;
 441       val >>= 7;
 442       if (val)
 443         c |= 0x80;
 444       *p++ = c;
 445     }
 446   while (val);
 447 }
 448
 449 static void
 450 write_nodes (struct node *n, size_t off)
 451 {
 452   for (; n; n = n->sibling)
 453     {
 454       assert (off < tree_size && tree[off] == 0);
 455       if (n->key_len > 1)
 456         {
 457           assert (n->key_len < 64);
 458           tree[off] = n->key_len;
 459         }
 460       else
 461         tree[off] = (n->key[0] - ' ') | 0x80;
 462       assert ((tree[off] & 0x40) == 0);
 463       if (n->codepoint != NO_VALUE)
 464         tree[off] |= 0x40;
 465       off++;
 466       if (n->key_len > 1)
 467         {
 468           tree[off++] = n->key_idx & 0xff;
 469           tree[off++] = (n->key_idx >> 8) & 0xff;
 470         }
 471       if (n->codepoint != NO_VALUE)
 472         {
 473           assert (n->codepoint < (1L << 21));
 474           tree[off++] = n->codepoint & 0xff;
 475           tree[off++] = (n->codepoint >> 8) & 0xff;
 476           tree[off] = (n->codepoint >> 16) & 0xff;
 477           if (n->child)
 478             tree[off] |= 0x80;
 479           if (!n->sibling)
 480             tree[off] |= 0x40;
 481           off++;
 482         }
 483       if (n->child)
 484         {
 485           write_uleb128 (&tree[off], n->child_off);
 486           off += sizeof_uleb128 (n->child_off);
 487           write_nodes (n->child, off + n->child_off);
 488         }
 489       if (n->codepoint == NO_VALUE
 490           && n->sibling == NULL)
 491         tree[off++] = 0xff;
 492     }
 493   assert (off <= tree_size);
 494 }
 495
 496 static void
 497 build_radix_tree (void)
 498 {
 499   size_t i, j, k, key_idx;
 500
 501   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 502     if (generated_ranges[i].ok == INT_MAX)
 503       {
 504         if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
 505           break;
 506       }
 507     else if (generated_ranges[i].ok == (generated_ranges[i].high
 508                                         - generated_ranges[i].low + 1))
 509       {
 510         if (generated_ranges[i].max_high != generated_ranges[i].high)
 511           break;
 512       }
 513     else
 514       break;
 515   if (i < ARRAY_SIZE (generated_ranges))
 516     fail ("uncovered generated range %s %lx %lx",
 517           generated_ranges[i].prefix, generated_ranges[i].low,
 518           generated_ranges[i].high);
 519   /* Sort entries alphabetically, node_add relies on that.  */
 520   qsort (entries, num_entries, sizeof (struct entry), entrycmp);
 521   for (i = 1; i < num_entries; ++i)
 522     if (i && strcmp (entries[i].name, entries[i - 1].name) == 0)
 523       fail ("multiple entries for name %s", entries[i].name);
 524
 525   for (i = 0; i < num_entries; ++i)
 526     node_add (&root, entries[i].name, strlen (entries[i].name),
 527               entries[i].codepoint);
 528
 529   nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
 530   i = num_nodes;
 531   num_nodes = 0;
 532   append_nodes (root);
 533   assert (num_nodes == i);
 534   /* Sort node pointers by decreasing string length to handle substrings
 535      right.  */
 536   qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
 537   if (nodes[0]->key_len >= 64)
 538     /* We could actually encode even 64 and 65, as key_len 0 and 1 will
 539        never appear in the multiple letter key encodings, so could subtract
 540        2.  */
 541     fail ("can't encode key length %d >= 64, so need to split some radix "
 542           "tree nodes to ensure length fits", nodes[0]->key_len);
 543
 544   /* Verify a property charset.cc UAX44-LM2 matching relies on:
 545      if - is at the end of key of some node, then all its siblings
 546      start with alphanumeric characters.
 547      Only 2 character names and 1 alias have - followed by space:
 548      U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
 549      U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
 550      U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
 551      so the KA- in there will always be followed at least by SHOG
 552      in the same node.
 553      If this changes, charset.cc needs to change.  */
 554   for (i = 0; i < num_nodes; ++i)
 555     if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
 556         && nodes[i]->child)
 557       {
 558         struct node *n;
 559
 560         for (n = nodes[i]->child; n; n = n->sibling)
 561           if (n->key[0] == ' ')
 562             fail ("node with key %.*s followed by node with key %.*s",
 563                   (int) nodes[i]->key_len, nodes[i]->key,
 564                   (int) n->key_len, n->key);
 565       }
 566
 567   /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
 568      fortunately num_nodes is < 64K and key_len < 64.  */
 569   key_idx = 0;
 570   for (i = 0; i < num_nodes; ++i)
 571     {
 572       nodes[i]->key_idx = SIZE_MAX;
 573       nodes[i]->in_dict = false;
 574       if (nodes[i]->key_len > 1)
 575         {
 576           for (j = 0; j < i; ++j)
 577             /* Can't rely on memmem unfortunately.  */
 578             if (nodes[j]->in_dict)
 579               {
 580                 for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
 581                   if (nodes[j]->key[k] == nodes[i]->key[0]
 582                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 583                                  nodes[i]->key_len - 1) == 0)
 584                     {
 585                       nodes[i]->key_idx = nodes[j]->key_idx + k;
 586                       j = i;
 587                       break;
 588                     }
 589                 if (j == i)
 590                   break;
 591                 for (; k < nodes[j]->key_len; ++k)
 592                   if (nodes[j]->key[k] == nodes[i]->key[0]
 593                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 594                                  nodes[j]->key_len - 1 - k) == 0)
 595                     {
 596                       size_t l;
 597
 598                       for (l = j + 1; l < i; ++l)
 599                         if (nodes[l]->in_dict)
 600                           break;
 601                       if (l < i
 602                           && memcmp (nodes[l]->key,
 603                                      nodes[i]->key + (nodes[j]->key_len - k),
 604                                      nodes[i]->key_len
 605                                      - (nodes[j]->key_len - k)) == 0)
 606                         {
 607                           nodes[i]->key_idx = nodes[j]->key_idx + k;
 608                           j = i;
 609                         }
 610                       else
 611                         j = l - 1;
 612                       break;
 613                     }
 614               }
 615           if (nodes[i]->key_idx == SIZE_MAX)
 616             {
 617               nodes[i]->key_idx = key_idx;
 618               nodes[i]->in_dict = true;
 619               key_idx += nodes[i]->key_len;
 620             }
 621         }
 622     }
 623   if (key_idx >= 65536)
 624     /* We only use 2 bytes for offsets into the dictionary.
 625        If it grows more, there is e.g. a possibility to replace
 626        most often seen words or substrings in the dictionary
 627        with characters other than [A-Z0-9 -] (say LETTER occurs
 628        in the dictionary almost 197 times and so by using a
 629        instead of LETTER we could save (6 - 1) * 197 bytes,
 630        with some on the side table mapping 'a' to "LETTER".  */
 631     fail ("too large dictionary %ld", (long) key_idx);
 632   dict_size = key_idx;
 633
 634   size_nodes (root);
 635
 636   dict = (char *) xmalloc (dict_size + 1);
 637   for (i = 0; i < num_nodes; ++i)
 638     if (nodes[i]->in_dict)
 639       memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
 640   dict[dict_size] = '\0';
 641
 642   tree = (unsigned char *) xmalloc (tree_size);
 643   memset (tree, 0, tree_size);
 644   write_nodes (root, 0);
 645 }
 646
 647 /* Print out the huge copyright notice.  */
 648
 649 static void
 650 write_copyright (void)
 651 {
 652   static const char copyright[] = "\
 653 /* Unicode name to codepoint.\n\
 654    Copyright (C) 2005-2022 Free Software Foundation, Inc.\n\
 655 \n\
 656    This program is free software; you can redistribute it and/or modify it\n\
 657    under the terms of the GNU General Public License as published by the\n\
 658    Free Software Foundation; either version 3, or (at your option) any\n\
 659    later version.\n\
 660 \n\
 661    This program is distributed in the hope that it will be useful,\n\
 662    but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
 663    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\
 664    GNU General Public License for more details.\n\
 665 \n\
 666    You should have received a copy of the GNU General Public License\n\
 667    along with this program; see the file COPYING3.  If not see\n\
 668    <http://www.gnu.org/licenses/>.\n\
 669 \n\
 670 \n\
 671    Copyright (C) 1991-2021 Unicode, Inc.  All rights reserved.\n\
 672    Distributed under the Terms of Use in\n\
 673    http://www.unicode.org/copyright.html.\n\
 674 \n\
 675    Permission is hereby granted, free of charge, to any person\n\
 676    obtaining a copy of the Unicode data files and any associated\n\
 677    documentation (the \"Data Files\") or Unicode software and any\n\
 678    associated documentation (the \"Software\") to deal in the Data Files\n\
 679    or Software without restriction, including without limitation the\n\
 680    rights to use, copy, modify, merge, publish, distribute, and/or\n\
 681    sell copies of the Data Files or Software, and to permit persons to\n\
 682    whom the Data Files or Software are furnished to do so, provided\n\
 683    that (a) the above copyright notice(s) and this permission notice\n\
 684    appear with all copies of the Data Files or Software, (b) both the\n\
 685    above copyright notice(s) and this permission notice appear in\n\
 686    associated documentation, and (c) there is clear notice in each\n\
 687    modified Data File or in the Software as well as in the\n\
 688    documentation associated with the Data File(s) or Software that the\n\
 689    data or software has been modified.\n\
 690 \n\
 691    THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
 692    OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
 693    WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
 694    NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
 695    COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
 696    ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
 697    DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
 698    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
 699    ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
 700    OF THE DATA FILES OR SOFTWARE.\n\
 701 \n\
 702    Except as contained in this notice, the name of a copyright holder\n\
 703    shall not be used in advertising or otherwise to promote the sale,\n\
 704    use or other dealings in these Data Files or Software without prior\n\
 705    written authorization of the copyright holder.  */\n";
 706
 707    puts (copyright);
 708 }
 709
 710 static void
 711 write_dict (void)
 712 {
 713   size_t i;
 714
 715   printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
 716   for (i = 0; i < dict_size; i += 77)
 717     printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : "");
 718   puts ("");
 719 }
 720
 721 static void
 722 write_tree (void)
 723 {
 724   size_t i, j;
 725
 726   printf ("static const unsigned char uname2c_tree[%ld] = {\n",
 727           (long) tree_size);
 728   for (i = 0, j = 0; i < tree_size; ++i)
 729     {
 730       printf ("%s0x%02x%s", j == 0 ? "  " : "", tree[i],
 731               i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
 732       if (j == 11)
 733         j = 0;
 734       else
 735         ++j;
 736     }
 737 }
 738
 739 static void
 740 write_generated (void)
 741 {
 742   size_t i, j;
 743
 744   puts ("static const cppchar_t uname2c_pairs[] = {");
 745   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 746     {
 747       if (i == 0)
 748         ;
 749       else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
 750         puts (", 0,");
 751       else
 752         puts (",");
 753       printf ("  0x%lx, 0x%lx /* %s */",
 754               generated_ranges[i].low,
 755               generated_ranges[i].high,
 756               generated_ranges[i].prefix);
 757     }
 758   puts (", 0 };\n");
 759
 760   puts ("static const unsigned char uname2c_generated[] = {");
 761   for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
 762     {
 763       if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
 764         printf ("%s  %d /* %s */", i ? ",\n" : "",
 765                 ++j, generated_ranges[i].prefix);
 766       j += 2;
 767     }
 768   puts (" };\n");
 769 }
 770
 771 /* Main program.  */
 772
 773 int
 774 main (int argc, char **argv)
 775 {
 776   size_t i;
 777
 778   if (argc != 3)
 779     fail ("too few arguments to makeradixtree");
 780   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 781     if (!generated_ranges[i].max_high)
 782       generated_ranges[i].max_high = generated_ranges[i].high;
 783   read_table (argv[1], false);
 784   read_table (argv[2], true);
 785   build_radix_tree ();
 786
 787   write_copyright ();
 788   write_dict ();
 789   write_tree ();
 790   write_generated ();
 791   printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len);
 792   return 0;
 793 }