localedata/gen-unicode-ctype.c

   1 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
   2    Copyright (C) 2000 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Library General Public License as
   8    published by the Free Software Foundation; either version 2 of the
   9    License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Library General Public License for more details.
  15
  16    You should have received a copy of the GNU Library General Public
  17    License along with the GNU UTF-8 Library; see the file COPYING.LIB.  If not,
  18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19    Boston, MA 02111-1307, USA.  */
  20
  21 /* Usage example:
  22      $ gen-unicode /usr/local/share/Unidata/UnicodeData.txt \
  23                    /usr/local/share/Unidata/PropList.txt \
  24                    3.0
  25  */
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <stdbool.h>
  30 #include <string.h>
  31 #include <time.h>
  32
  33 /* This structure represents one line in the UnicodeData.txt file.  */
  34 struct unicode_attribute
  35 {
  36   const char *name;           /* Character name */
  37   const char *category;       /* General category */
  38   const char *combining;      /* Canonical combining classes */
  39   const char *bidi;           /* Bidirectional category */
  40   const char *decomposition;  /* Character decomposition mapping */
  41   const char *decdigit;       /* Decimal digit value */
  42   const char *digit;          /* Digit value */
  43   const char *numeric;        /* Numeric value */
  44   int mirrored;               /* mirrored */
  45   const char *oldname;        /* Old Unicode 1.0 name */
  46   const char *comment;        /* Comment */
  47   unsigned int upper;         /* Uppercase mapping */
  48   unsigned int lower;         /* Lowercase mapping */
  49   unsigned int title;         /* Titlecase mapping */
  50 };
  51
  52 /* Missing fields are represented with "" for strings, and NONE for
  53    characters.  */
  54 #define NONE (~(unsigned int)0)
  55
  56 /* The entire contents of the UnicodeData.txt file.  */
  57 struct unicode_attribute unicode_attributes [0x10000];
  58
  59 /* Stores in unicode_attributes[i] the values from the given fields.  */
  60 static void
  61 fill_attribute (unsigned int i,
  62                 const char *field1, const char *field2,
  63                 const char *field3, const char *field4,
  64                 const char *field5, const char *field6,
  65                 const char *field7, const char *field8,
  66                 const char *field9, const char *field10,
  67                 const char *field11, const char *field12,
  68                 const char *field13, const char *field14)
  69 {
  70   struct unicode_attribute * uni;
  71
  72   if (i >= 0x10000)
  73     {
  74       fprintf (stderr, "index too large\n");
  75       exit (1);
  76     }
  77   uni = &unicode_attributes[i];
  78   /* Copy the strings.  */
  79   uni->name          = strdup (field1);
  80   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
  81   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
  82   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
  83   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  84   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
  85   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
  86   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
  87   uni->mirrored      = (field9[0] == 'Y');
  88   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
  89   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
  90   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  91   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  92   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
  93 }
  94
  95 /* Maximum length of a field in the UnicodeData.txt file.  */
  96 #define FIELDLEN 120
  97
  98 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  99    Reads up to (but excluding) DELIM.
 100    Returns 1 when a field was successfully read, otherwise 0.  */
 101 static int
 102 getfield (FILE *stream, char *buffer, int delim)
 103 {
 104   int count = 0;
 105   int c;
 106
 107   for (; (c = getc (stream)), (c != EOF && c != delim); )
 108     {
 109       /* The original unicode.org UnicodeData.txt file happens to have
 110          CR/LF line terminators.  Silently convert to LF.  */
 111       if (c == '\r')
 112         continue;
 113
 114       /* Put c into the buffer.  */
 115       if (++count >= FIELDLEN - 1)
 116         {
 117           fprintf (stderr, "field too long\n");
 118           exit (1);
 119         }
 120       *buffer++ = c;
 121     }
 122
 123   if (c == EOF)
 124     return 0;
 125
 126   *buffer = '\0';
 127   return 1;
 128 }
 129
 130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 131    file.  */
 132 static void
 133 fill_attributes (const char *unicodedata_filename)
 134 {
 135   unsigned int i, j;
 136   FILE *stream;
 137   char field0[FIELDLEN];
 138   char field1[FIELDLEN];
 139   char field2[FIELDLEN];
 140   char field3[FIELDLEN];
 141   char field4[FIELDLEN];
 142   char field5[FIELDLEN];
 143   char field6[FIELDLEN];
 144   char field7[FIELDLEN];
 145   char field8[FIELDLEN];
 146   char field9[FIELDLEN];
 147   char field10[FIELDLEN];
 148   char field11[FIELDLEN];
 149   char field12[FIELDLEN];
 150   char field13[FIELDLEN];
 151   char field14[FIELDLEN];
 152   int lineno = 0;
 153
 154   for (i = 0; i < 0x10000; i++)
 155     unicode_attributes[i].name = NULL;
 156
 157   stream = fopen (unicodedata_filename, "r");
 158   if (stream == NULL)
 159     {
 160       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 161       exit (1);
 162     }
 163
 164   for (;;)
 165     {
 166       int n;
 167
 168       lineno++;
 169       n = getfield (stream, field0, ';');
 170       n += getfield (stream, field1, ';');
 171       n += getfield (stream, field2, ';');
 172       n += getfield (stream, field3, ';');
 173       n += getfield (stream, field4, ';');
 174       n += getfield (stream, field5, ';');
 175       n += getfield (stream, field6, ';');
 176       n += getfield (stream, field7, ';');
 177       n += getfield (stream, field8, ';');
 178       n += getfield (stream, field9, ';');
 179       n += getfield (stream, field10, ';');
 180       n += getfield (stream, field11, ';');
 181       n += getfield (stream, field12, ';');
 182       n += getfield (stream, field13, ';');
 183       n += getfield (stream, field14, '\n');
 184       if (n == 0)
 185         break;
 186       if (n != 15)
 187         {
 188           fprintf (stderr, "short line in'%s':%d\n",
 189                    unicodedata_filename, lineno);
 190           exit (1);
 191         }
 192       i = strtoul (field0, NULL, 16);
 193       if (field1[0] == '<'
 194           && strlen (field1) >= 9
 195           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
 196         {
 197           /* Deal with a range. */
 198           lineno++;
 199           n = getfield (stream, field0, ';');
 200           n += getfield (stream, field1, ';');
 201           n += getfield (stream, field2, ';');
 202           n += getfield (stream, field3, ';');
 203           n += getfield (stream, field4, ';');
 204           n += getfield (stream, field5, ';');
 205           n += getfield (stream, field6, ';');
 206           n += getfield (stream, field7, ';');
 207           n += getfield (stream, field8, ';');
 208           n += getfield (stream, field9, ';');
 209           n += getfield (stream, field10, ';');
 210           n += getfield (stream, field11, ';');
 211           n += getfield (stream, field12, ';');
 212           n += getfield (stream, field13, ';');
 213           n += getfield (stream, field14, '\n');
 214           if (n != 15)
 215             {
 216               fprintf (stderr, "missing end range in '%s':%d\n",
 217                        unicodedata_filename, lineno);
 218               exit (1);
 219             }
 220           if (!(field1[0] == '<'
 221                 && strlen (field1) >= 8
 222                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
 223             {
 224               fprintf (stderr, "missing end range in '%s':%d\n",
 225                        unicodedata_filename, lineno);
 226               exit (1);
 227             }
 228           field1[strlen (field1) - 7] = '\0';
 229           j = strtoul (field0, NULL, 16);
 230           for (; i <= j; i++)
 231             fill_attribute (i, field1+1, field2, field3, field4, field5,
 232                                field6, field7, field8, field9, field10,
 233                                field11, field12, field13, field14);
 234         }
 235       else
 236         {
 237           /* Single character line */
 238           fill_attribute (i, field1, field2, field3, field4, field5,
 239                              field6, field7, field8, field9, field10,
 240                              field11, field12, field13, field14);
 241         }
 242     }
 243   if (ferror (stream) || fclose (stream))
 244     {
 245       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 246       exit (1);
 247     }
 248 }
 249
 250 /* The combining property from the PropList.txt file.  */
 251 char unicode_combining[0x10000];
 252
 253 /* Stores in unicode_combining[] the Combining property from the
 254    PropList.txt file.  */
 255 static void
 256 fill_combining (const char *proplist_filename)
 257 {
 258   unsigned int i;
 259   FILE *stream;
 260   char buf[100+1];
 261
 262   for (i = 0; i < 0x10000; i++)
 263     unicode_combining[i] = 0;
 264
 265   stream = fopen (proplist_filename, "r");
 266   if (stream == NULL)
 267     {
 268       fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
 269       exit (1);
 270     }
 271
 272   /* Search for the "Property dump for: 0x20000004 (Combining)" line.  */
 273   do
 274     {
 275       if (fscanf (stream, "%100[^\n]\n", buf) < 1)
 276         {
 277           fprintf (stderr, "no combining property found in '%s'\n",
 278                    proplist_filename);
 279           exit (1);
 280         }
 281     }
 282   while (strstr (buf, "(Combining)") == NULL);
 283
 284   for (;;)
 285     {
 286       unsigned int i1, i2;
 287
 288       if (fscanf (stream, "%100[^\n]\n", buf) < 1)
 289         {
 290           fprintf (stderr, "premature end of combining property in '%s'\n",
 291                    proplist_filename);
 292           exit (1);
 293         }
 294       if (buf[0] == '*')
 295         break;
 296       if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
 297         {
 298           if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
 299             {
 300               fprintf (stderr, "parse error in combining property in '%s'\n",
 301                        proplist_filename);
 302               exit (1);
 303             }
 304         }
 305       else if (strlen (buf) >= 4)
 306         {
 307           if (sscanf (buf, "%4X", &i1) < 1)
 308             {
 309               fprintf (stderr, "parse error in combining property in '%s'\n",
 310                        proplist_filename);
 311               exit (1);
 312             }
 313           i2 = i1;
 314         }
 315       else
 316         {
 317           fprintf (stderr, "parse error in combining property in '%s'\n",
 318                    proplist_filename);
 319           exit (1);
 320         }
 321       for (i = i1; i <= i2; i++)
 322         unicode_combining[i] = 1;
 323     }
 324   if (ferror (stream) || fclose (stream))
 325     {
 326       fprintf (stderr, "error reading from '%s'\n", proplist_filename);
 327       exit (1);
 328     }
 329 }
 330
 331 /* Character mappings.  */
 332
 333 static unsigned int
 334 to_upper (unsigned int ch)
 335 {
 336   if (unicode_attributes[ch].name != NULL
 337       && unicode_attributes[ch].upper != NONE)
 338     return unicode_attributes[ch].upper;
 339   else
 340     return ch;
 341 }
 342
 343 static unsigned int
 344 to_lower (unsigned int ch)
 345 {
 346   if (unicode_attributes[ch].name != NULL
 347       && unicode_attributes[ch].lower != NONE)
 348     return unicode_attributes[ch].lower;
 349   else
 350     return ch;
 351 }
 352
 353 static unsigned int
 354 to_title (unsigned int ch)
 355 {
 356   if (unicode_attributes[ch].name != NULL
 357       && unicode_attributes[ch].title != NONE)
 358     return unicode_attributes[ch].title;
 359   else
 360     return ch;
 361 }
 362
 363 /* Character class properties.  */
 364
 365 static bool
 366 is_upper (unsigned int ch)
 367 {
 368   return (to_lower (ch) != ch);
 369 }
 370
 371 static bool
 372 is_lower (unsigned int ch)
 373 {
 374   return (to_upper (ch) != ch)
 375          /* <U00DF> is lowercase, but without simple to_upper mapping.  */
 376          || (ch == 0x00DF);
 377 }
 378
 379 static bool
 380 is_alpha (unsigned int ch)
 381 {
 382   return (unicode_attributes[ch].name != NULL
 383           && (unicode_attributes[ch].category[0] == 'L'
 384               /* Avoid warning for <U0345>.  */
 385               || (ch == 0x0345)
 386               /* Avoid warnings for <U2160>..<U217F>.  */
 387               || (unicode_attributes[ch].category[0] == 'N'
 388                   && unicode_attributes[ch].category[1] == 'l')
 389               /* Avoid warnings for <U24B6>..<U24E9>.  */
 390               || (unicode_attributes[ch].category[0] == 'S'
 391                   && unicode_attributes[ch].category[1] == 'o'
 392                   && strstr (unicode_attributes[ch].name, " LETTER ")
 393                      != NULL)
 394               /* Consider all the non-ASCII digits as alphabetic.
 395                  ISO C 99 forbids us to have them in category "digit",
 396                  but we want iswalnum to return true on them.  */
 397               || (unicode_attributes[ch].category[0] == 'N'
 398                   && unicode_attributes[ch].category[1] == 'd'
 399                   && !(ch >= 0x0030 && ch <= 0x0039))));
 400 }
 401
 402 static bool
 403 is_digit (unsigned int ch)
 404 {
 405 #if 0
 406   return (unicode_attributes[ch].name != NULL
 407           && unicode_attributes[ch].category[0] == 'N'
 408           && unicode_attributes[ch].category[1] == 'd');
 409   /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
 410      a zero.  Must add <0> in front of them by hand.  */
 411 #else
 412   /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
 413      takes it away:
 414      7.25.2.1.5:
 415         The iswdigit function tests for any wide character that corresponds
 416         to a decimal-digit character (as defined in 5.2.1).
 417      5.2.1:
 418         the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
 419    */
 420   return (ch >= 0x0030 && ch <= 0x0039);
 421 #endif
 422 }
 423
 424 static bool
 425 is_outdigit (unsigned int ch)
 426 {
 427   return (ch >= 0x0030 && ch <= 0x0039);
 428 }
 429
 430 static bool
 431 is_blank (unsigned int ch)
 432 {
 433   return (ch == 0x0009 /* '\t' */
 434           /* Category Zs without mention of "<noBreak>" */
 435           || (unicode_attributes[ch].name != NULL
 436               && unicode_attributes[ch].category[0] == 'Z'
 437               && unicode_attributes[ch].category[1] == 's'
 438               && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
 439 }
 440
 441 static bool
 442 is_space (unsigned int ch)
 443 {
 444   /* Don't make U+00A0 a space. Non-breaking space means that all programs
 445      should treat it like a punctuation character, not like a space. */
 446   return (ch == 0x0020 /* ' ' */
 447           || ch == 0x000C /* '\f' */
 448           || ch == 0x000A /* '\n' */
 449           || ch == 0x000D /* '\r' */
 450           || ch == 0x0009 /* '\t' */
 451           || ch == 0x000B /* '\v' */
 452           /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
 453           || (unicode_attributes[ch].name != NULL
 454               && unicode_attributes[ch].category[0] == 'Z'
 455               && (unicode_attributes[ch].category[1] == 'l'
 456                   || unicode_attributes[ch].category[1] == 'p'
 457                   || (unicode_attributes[ch].category[1] == 's'
 458                       && !strstr (unicode_attributes[ch].decomposition,
 459                                   "<noBreak>")))));
 460 }
 461
 462 static bool
 463 is_cntrl (unsigned int ch)
 464 {
 465   return (unicode_attributes[ch].name != NULL
 466           && (!strcmp (unicode_attributes[ch].name, "<control>")
 467               /* Categories Zl and Zp */
 468               || (unicode_attributes[ch].category[0] == 'Z'
 469                   && (unicode_attributes[ch].category[1] == 'l'
 470                       || unicode_attributes[ch].category[1] == 'p'))));
 471 }
 472
 473 static bool
 474 is_xdigit (unsigned int ch)
 475 {
 476 #if 0
 477   return is_digit (ch)
 478          || (ch >= 0x0041 && ch <= 0x0046)
 479          || (ch >= 0x0061 && ch <= 0x0066);
 480 #else
 481   /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
 482      takes it away:
 483      7.25.2.1.12:
 484         The iswxdigit function tests for any wide character that corresponds
 485         to a hexadecimal-digit character (as defined in 6.4.4.1).
 486      6.4.4.1:
 487         hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
 488    */
 489   return (ch >= 0x0030 && ch <= 0x0039)
 490          || (ch >= 0x0041 && ch <= 0x0046)
 491          || (ch >= 0x0061 && ch <= 0x0066);
 492 #endif
 493 }
 494
 495 static bool
 496 is_graph (unsigned int ch)
 497 {
 498   return (unicode_attributes[ch].name != NULL
 499           && strcmp (unicode_attributes[ch].name, "<control>")
 500           && !is_space (ch));
 501 }
 502
 503 static bool
 504 is_print (unsigned int ch)
 505 {
 506   return (unicode_attributes[ch].name != NULL
 507           && strcmp (unicode_attributes[ch].name, "<control>")
 508           /* Categories Zl and Zp */
 509           && !(unicode_attributes[ch].name != NULL
 510                && unicode_attributes[ch].category[0] == 'Z'
 511                && (unicode_attributes[ch].category[1] == 'l'
 512                    || unicode_attributes[ch].category[1] == 'p')));
 513 }
 514
 515 static bool
 516 is_punct (unsigned int ch)
 517 {
 518 #if 0
 519   return (unicode_attributes[ch].name != NULL
 520           && unicode_attributes[ch].category[0] == 'P');
 521 #else
 522   /* The traditional POSIX definition of punctuation is every graphic,
 523      non-alphanumeric character.  */
 524   return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
 525 #endif
 526 }
 527
 528 static bool
 529 is_combining (unsigned int ch)
 530 {
 531   return (unicode_attributes[ch].name != NULL
 532           && unicode_combining[ch] != 0);
 533 }
 534
 535 static bool
 536 is_combining_level3 (unsigned int ch)
 537 {
 538   return is_combining (ch)
 539          && !(unicode_attributes[ch].combining[0] != '\0'
 540               && unicode_attributes[ch].combining[0] != '0'
 541               && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
 542 }
 543
 544 /* Output a character class (= property) table.  */
 545
 546 static void
 547 output_charclass (FILE *stream, const char *classname,
 548                   bool (*func) (unsigned int))
 549 {
 550   char table[0x10000];
 551   unsigned int i;
 552   bool need_semicolon;
 553   const int max_column = 75;
 554   int column;
 555
 556   for (i = 0; i < 0x10000; i++)
 557     table[i] = (int) func (i);
 558
 559   fprintf (stream, "%s ", classname);
 560   need_semicolon = false;
 561   column = 1000;
 562   for (i = 0; i < 0x10000; )
 563     {
 564       if (!table[i])
 565         i++;
 566       else
 567         {
 568           unsigned int low, high;
 569           char buf[17];
 570
 571           low = i;
 572           do
 573             i++;
 574           while (i < 0x10000 && table[i]);
 575           high = i - 1;
 576
 577           if (low == high)
 578             sprintf (buf, "<U%04X>", low);
 579           else
 580             sprintf (buf, "<U%04X>..<U%04X>", low, high);
 581
 582           if (need_semicolon)
 583             {
 584               fprintf (stream, ";");
 585               column++;
 586             }
 587
 588           if (column + strlen (buf) > max_column)
 589             {
 590               fprintf (stream, "/\n   ");
 591               column = 3;
 592             }
 593
 594           fprintf (stream, "%s", buf);
 595           column += strlen (buf);
 596           need_semicolon = true;
 597         }
 598     }
 599   fprintf (stream, "\n");
 600 }
 601
 602 /* Output a character mapping table.  */
 603
 604 static void
 605 output_charmap (FILE *stream, const char *mapname,
 606                 unsigned int (*func) (unsigned int))
 607 {
 608   char table[0x10000];
 609   unsigned int i;
 610   bool need_semicolon;
 611   const int max_column = 75;
 612   int column;
 613
 614   for (i = 0; i < 0x10000; i++)
 615     table[i] = (func (i) != i);
 616
 617   fprintf (stream, "%s ", mapname);
 618   need_semicolon = false;
 619   column = 1000;
 620   for (i = 0; i < 0x10000; i++)
 621     if (table[i])
 622       {
 623         char buf[18];
 624
 625         sprintf (buf, "(<U%04X>,<U%04X>)", i, func (i));
 626
 627         if (need_semicolon)
 628           {
 629             fprintf (stream, ";");
 630             column++;
 631           }
 632
 633         if (column + strlen (buf) > max_column)
 634           {
 635             fprintf (stream, "/\n   ");
 636             column = 3;
 637           }
 638
 639         fprintf (stream, "%s", buf);
 640         column += strlen (buf);
 641         need_semicolon = true;
 642       }
 643   fprintf (stream, "\n");
 644 }
 645
 646 /* Output the width table.  */
 647
 648 static void
 649 output_widthmap (FILE *stream)
 650 {
 651 }
 652
 653 /* Output the tables to the given file.  */
 654
 655 static void
 656 output_tables (const char *filename, const char *version)
 657 {
 658   FILE *stream;
 659   unsigned int ch;
 660
 661   stream = fopen (filename, "w");
 662   if (stream == NULL)
 663     {
 664       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 665       exit (1);
 666     }
 667
 668   fprintf (stream, "escape_char /\n");
 669   fprintf (stream, "comment_char %%\n");
 670   fprintf (stream, "\n");
 671   fprintf (stream, "%% Generated automatically by gen-unicode for Unicode %s.\n",
 672            version);
 673   fprintf (stream, "\n");
 674
 675   fprintf (stream, "LC_IDENTIFICATION\n");
 676   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
 677   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
 678   fprintf (stream, "address   \"\"\n");
 679   fprintf (stream, "contact   \"\"\n");
 680   fprintf (stream, "email     \"bug-glibc@gnu.org\"\n");
 681   fprintf (stream, "tel       \"\"\n");
 682   fprintf (stream, "fax       \"\"\n");
 683   fprintf (stream, "language  \"\"\n");
 684   fprintf (stream, "territory \"Earth\"\n");
 685   fprintf (stream, "revision  \"%s\"\n", version);
 686   {
 687     time_t now;
 688     char date[11];
 689     now = time (NULL);
 690     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
 691     fprintf (stream, "date      \"%s\"\n", date);
 692   }
 693   fprintf (stream, "category  \"unicode:2000\";LC_CTYPE\n");
 694   fprintf (stream, "END LC_IDENTIFICATION\n");
 695   fprintf (stream, "\n");
 696
 697   /* Verifications. */
 698   for (ch = 0; ch < 0x10000; ch++)
 699     {
 700       /* toupper restriction: "Only characters specified for the keywords
 701          lower and upper shall be specified.  */
 702       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 703         fprintf (stderr,
 704                  "<U%04X> is not upper|lower but toupper(0x%04X) = 0x%04X\n",
 705                  ch, ch, to_upper (ch));
 706
 707       /* tolower restriction: "Only characters specified for the keywords
 708          lower and upper shall be specified.  */
 709       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 710         fprintf (stderr,
 711                  "<U%04X> is not upper|lower but tolower(0x%04X) = 0x%04X\n",
 712                  ch, ch, to_lower (ch));
 713
 714       /* alpha restriction: "Characters classified as either upper or lower
 715          shall automatically belong to this class.  */
 716       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
 717         fprintf (stderr, "<U%04X> is upper|lower but not alpha\n", ch);
 718
 719       /* alpha restriction: "No character specified for the keywords cntrl,
 720          digit, punct or space shall be specified."  */
 721       if (is_alpha (ch) && is_cntrl (ch))
 722         fprintf (stderr, "<U%04X> is alpha and cntrl\n", ch);
 723       if (is_alpha (ch) && is_digit (ch))
 724         fprintf (stderr, "<U%04X> is alpha and digit\n", ch);
 725       if (is_alpha (ch) && is_punct (ch))
 726         fprintf (stderr, "<U%04X> is alpha and punct\n", ch);
 727       if (is_alpha (ch) && is_space (ch))
 728         fprintf (stderr, "<U%04X> is alpha and space\n", ch);
 729
 730       /* space restriction: "No character specified for the keywords upper,
 731          lower, alpha, digit, graph or xdigit shall be specified."
 732          upper, lower, alpha already checked above.  */
 733       if (is_space (ch) && is_digit (ch))
 734         fprintf (stderr, "<U%04X> is space and digit\n", ch);
 735       if (is_space (ch) && is_graph (ch))
 736         fprintf (stderr, "<U%04X> is space and graph\n", ch);
 737       if (is_space (ch) && is_xdigit (ch))
 738         fprintf (stderr, "<U%04X> is space and xdigit\n", ch);
 739
 740       /* cntrl restriction: "No character specified for the keywords upper,
 741          lower, alpha, digit, punct, graph, print or xdigit shall be
 742          specified."  upper, lower, alpha already checked above.  */
 743       if (is_cntrl (ch) && is_digit (ch))
 744         fprintf (stderr, "<U%04X> is cntrl and digit\n", ch);
 745       if (is_cntrl (ch) && is_punct (ch))
 746         fprintf (stderr, "<U%04X> is cntrl and punct\n", ch);
 747       if (is_cntrl (ch) && is_graph (ch))
 748         fprintf (stderr, "<U%04X> is cntrl and graph\n", ch);
 749       if (is_cntrl (ch) && is_print (ch))
 750         fprintf (stderr, "<U%04X> is cntrl and print\n", ch);
 751       if (is_cntrl (ch) && is_xdigit (ch))
 752         fprintf (stderr, "<U%04X> is cntrl and xdigit\n", ch);
 753
 754       /* punct restriction: "No character specified for the keywords upper,
 755          lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 756          be specified."  upper, lower, alpha, cntrl already checked above.  */
 757       if (is_punct (ch) && is_digit (ch))
 758         fprintf (stderr, "<U%04X> is punct and digit\n", ch);
 759       if (is_punct (ch) && is_xdigit (ch))
 760         fprintf (stderr, "<U%04X> is punct and xdigit\n", ch);
 761       if (is_punct (ch) && (ch == 0x0020))
 762         fprintf (stderr, "<U%04X> is punct\n", ch);
 763
 764       /* graph restriction: "No character specified for the keyword cntrl
 765          shall be specified."  Already checked above.  */
 766
 767       /* print restriction: "No character specified for the keyword cntrl
 768          shall be specified."  Already checked above.  */
 769
 770       /* graph - print relation: differ only in the <space> character.
 771          How is this possible if there are more than one space character?!
 772          I think susv2/xbd/locale.html should speak of "space characters",
 773          not "space character".  */
 774       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
 775         fprintf (stderr, "<U%04X> is print but not graph|<space>\n", ch);
 776       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
 777         fprintf (stderr, "<U%04X> is graph|<space> but not print\n", ch);
 778     }
 779
 780   fprintf (stream, "LC_CTYPE\n");
 781   output_charclass (stream, "upper", is_upper);
 782   output_charclass (stream, "lower", is_lower);
 783   output_charclass (stream, "alpha", is_alpha);
 784   output_charclass (stream, "digit", is_digit);
 785   output_charclass (stream, "outdigit", is_outdigit);
 786   output_charclass (stream, "blank", is_blank);
 787   output_charclass (stream, "space", is_space);
 788   output_charclass (stream, "cntrl", is_cntrl);
 789   output_charclass (stream, "punct", is_punct);
 790   output_charclass (stream, "xdigit", is_xdigit);
 791   output_charclass (stream, "graph", is_graph);
 792   output_charclass (stream, "print", is_print);
 793   output_charclass (stream, "class \"combining\";", is_combining);
 794   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
 795   output_charmap (stream, "toupper", to_upper);
 796   output_charmap (stream, "tolower", to_lower);
 797   output_charmap (stream, "map \"totitle\";", to_title);
 798   output_widthmap (stream);
 799   fprintf (stream, "END LC_CTYPE\n");
 800
 801   if (ferror (stream) || fclose (stream))
 802     {
 803       fprintf (stderr, "error writing to '%s'\n", filename);
 804       exit (1);
 805     }
 806 }
 807
 808 int
 809 main (int argc, char * argv[])
 810 {
 811   if (argc != 4)
 812     {
 813       fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt version\n",
 814                argv[0]);
 815       exit (1);
 816     }
 817
 818   fill_attributes (argv[1]);
 819   fill_combining (argv[2]);
 820
 821   output_tables ("unicode", argv[3]);
 822
 823   return 0;
 824 }