localedata/gen-unicode-ctype.c

   1 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
/* Usage example:
     $ gen-unicode-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1
   The tables are written to a file named "unicode" in the current
   directory.
 */
  23
  24 #include <stdio.h>
  25 #include <stdlib.h>
  26 #include <stdbool.h>
  27 #include <string.h>
  28 #include <time.h>
  29
  30 /* This structure represents one line in the UnicodeData.txt file.  */
  31 struct unicode_attribute
  32 {
  33   const char *name;           /* Character name */
  34   const char *category;       /* General category */
  35   const char *combining;      /* Canonical combining classes */
  36   const char *bidi;           /* Bidirectional category */
  37   const char *decomposition;  /* Character decomposition mapping */
  38   const char *decdigit;       /* Decimal digit value */
  39   const char *digit;          /* Digit value */
  40   const char *numeric;        /* Numeric value */
  41   int mirrored;               /* mirrored */
  42   const char *oldname;        /* Old Unicode 1.0 name */
  43   const char *comment;        /* Comment */
  44   unsigned int upper;         /* Uppercase mapping */
  45   unsigned int lower;         /* Lowercase mapping */
  46   unsigned int title;         /* Titlecase mapping */
  47 };
  48
  49 /* Missing fields are represented with "" for strings, and NONE for
  50    characters.  */
  51 #define NONE (~(unsigned int)0)
  52
  53 /* The entire contents of the UnicodeData.txt file.  */
  54 struct unicode_attribute unicode_attributes [0x110000];
  55
  56 /* Stores in unicode_attributes[i] the values from the given fields.  */
  57 static void
  58 fill_attribute (unsigned int i,
  59                 const char *field1, const char *field2,
  60                 const char *field3, const char *field4,
  61                 const char *field5, const char *field6,
  62                 const char *field7, const char *field8,
  63                 const char *field9, const char *field10,
  64                 const char *field11, const char *field12,
  65                 const char *field13, const char *field14)
  66 {
  67   struct unicode_attribute * uni;
  68
  69   if (i >= 0x110000)
  70     {
  71       fprintf (stderr, "index too large\n");
  72       exit (1);
  73     }
  74   if (strcmp (field2, "Cs") == 0)
  75     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
  76     return;
  77   uni = &unicode_attributes[i];
  78   /* Copy the strings.  */
  79   uni->name          = strdup (field1);
  80   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
  81   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
  82   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
  83   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  84   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
  85   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
  86   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
  87   uni->mirrored      = (field9[0] == 'Y');
  88   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
  89   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
  90   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  91   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  92   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
  93 }
  94
  95 /* Maximum length of a field in the UnicodeData.txt file.  */
  96 #define FIELDLEN 120
  97
  98 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  99    Reads up to (but excluding) DELIM.
 100    Returns 1 when a field was successfully read, otherwise 0.  */
 101 static int
 102 getfield (FILE *stream, char *buffer, int delim)
 103 {
 104   int count = 0;
 105   int c;
 106
 107   for (; (c = getc (stream)), (c != EOF && c != delim); )
 108     {
 109       /* The original unicode.org UnicodeData.txt file happens to have
 110          CR/LF line terminators.  Silently convert to LF.  */
 111       if (c == '\r')
 112         continue;
 113
 114       /* Put c into the buffer.  */
 115       if (++count >= FIELDLEN - 1)
 116         {
 117           fprintf (stderr, "field too long\n");
 118           exit (1);
 119         }
 120       *buffer++ = c;
 121     }
 122
 123   if (c == EOF)
 124     return 0;
 125
 126   *buffer = '\0';
 127   return 1;
 128 }
 129
 130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 131    file.  */
 132 static void
 133 fill_attributes (const char *unicodedata_filename)
 134 {
 135   unsigned int i, j;
 136   FILE *stream;
 137   char field0[FIELDLEN];
 138   char field1[FIELDLEN];
 139   char field2[FIELDLEN];
 140   char field3[FIELDLEN];
 141   char field4[FIELDLEN];
 142   char field5[FIELDLEN];
 143   char field6[FIELDLEN];
 144   char field7[FIELDLEN];
 145   char field8[FIELDLEN];
 146   char field9[FIELDLEN];
 147   char field10[FIELDLEN];
 148   char field11[FIELDLEN];
 149   char field12[FIELDLEN];
 150   char field13[FIELDLEN];
 151   char field14[FIELDLEN];
 152   int lineno = 0;
 153
 154   for (i = 0; i < 0x110000; i++)
 155     unicode_attributes[i].name = NULL;
 156
 157   stream = fopen (unicodedata_filename, "r");
 158   if (stream == NULL)
 159     {
 160       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 161       exit (1);
 162     }
 163
 164   for (;;)
 165     {
 166       int n;
 167
 168       lineno++;
 169       n = getfield (stream, field0, ';');
 170       n += getfield (stream, field1, ';');
 171       n += getfield (stream, field2, ';');
 172       n += getfield (stream, field3, ';');
 173       n += getfield (stream, field4, ';');
 174       n += getfield (stream, field5, ';');
 175       n += getfield (stream, field6, ';');
 176       n += getfield (stream, field7, ';');
 177       n += getfield (stream, field8, ';');
 178       n += getfield (stream, field9, ';');
 179       n += getfield (stream, field10, ';');
 180       n += getfield (stream, field11, ';');
 181       n += getfield (stream, field12, ';');
 182       n += getfield (stream, field13, ';');
 183       n += getfield (stream, field14, '\n');
 184       if (n == 0)
 185         break;
 186       if (n != 15)
 187         {
 188           fprintf (stderr, "short line in'%s':%d\n",
 189                    unicodedata_filename, lineno);
 190           exit (1);
 191         }
 192       i = strtoul (field0, NULL, 16);
 193       if (field1[0] == '<'
 194           && strlen (field1) >= 9
 195           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
 196         {
 197           /* Deal with a range. */
 198           lineno++;
 199           n = getfield (stream, field0, ';');
 200           n += getfield (stream, field1, ';');
 201           n += getfield (stream, field2, ';');
 202           n += getfield (stream, field3, ';');
 203           n += getfield (stream, field4, ';');
 204           n += getfield (stream, field5, ';');
 205           n += getfield (stream, field6, ';');
 206           n += getfield (stream, field7, ';');
 207           n += getfield (stream, field8, ';');
 208           n += getfield (stream, field9, ';');
 209           n += getfield (stream, field10, ';');
 210           n += getfield (stream, field11, ';');
 211           n += getfield (stream, field12, ';');
 212           n += getfield (stream, field13, ';');
 213           n += getfield (stream, field14, '\n');
 214           if (n != 15)
 215             {
 216               fprintf (stderr, "missing end range in '%s':%d\n",
 217                        unicodedata_filename, lineno);
 218               exit (1);
 219             }
 220           if (!(field1[0] == '<'
 221                 && strlen (field1) >= 8
 222                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
 223             {
 224               fprintf (stderr, "missing end range in '%s':%d\n",
 225                        unicodedata_filename, lineno);
 226               exit (1);
 227             }
 228           field1[strlen (field1) - 7] = '\0';
 229           j = strtoul (field0, NULL, 16);
 230           for (; i <= j; i++)
 231             fill_attribute (i, field1+1, field2, field3, field4, field5,
 232                                field6, field7, field8, field9, field10,
 233                                field11, field12, field13, field14);
 234         }
 235       else
 236         {
 237           /* Single character line */
 238           fill_attribute (i, field1, field2, field3, field4, field5,
 239                              field6, field7, field8, field9, field10,
 240                              field11, field12, field13, field14);
 241         }
 242     }
 243   if (ferror (stream) || fclose (stream))
 244     {
 245       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 246       exit (1);
 247     }
 248 }
 249
 250 /* Character mappings.  */
 251
 252 static unsigned int
 253 to_upper (unsigned int ch)
 254 {
 255   if (unicode_attributes[ch].name != NULL
 256       && unicode_attributes[ch].upper != NONE)
 257     return unicode_attributes[ch].upper;
 258   else
 259     return ch;
 260 }
 261
 262 static unsigned int
 263 to_lower (unsigned int ch)
 264 {
 265   if (unicode_attributes[ch].name != NULL
 266       && unicode_attributes[ch].lower != NONE)
 267     return unicode_attributes[ch].lower;
 268   else
 269     return ch;
 270 }
 271
 272 static unsigned int
 273 to_title (unsigned int ch)
 274 {
 275   if (unicode_attributes[ch].name != NULL
 276       && unicode_attributes[ch].title != NONE)
 277     return unicode_attributes[ch].title;
 278   else
 279     return ch;
 280 }
 281
 282 /* Character class properties.  */
 283
 284 static bool
 285 is_upper (unsigned int ch)
 286 {
 287   return (to_lower (ch) != ch);
 288 }
 289
 290 static bool
 291 is_lower (unsigned int ch)
 292 {
 293   return (to_upper (ch) != ch)
 294          /* <U00DF> is lowercase, but without simple to_upper mapping.  */
 295          || (ch == 0x00DF);
 296 }
 297
 298 static bool
 299 is_alpha (unsigned int ch)
 300 {
 301   return (unicode_attributes[ch].name != NULL
 302           && ((unicode_attributes[ch].category[0] == 'L'
 303                /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
 304                   <U0E2F>, <U0E46> should belong to is_punct.  */
 305                && (ch != 0x0E2F) && (ch != 0x0E46))
 306               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
 307                  <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
 308               || (ch == 0x0E31)
 309               || (ch >= 0x0E34 && ch <= 0x0E3A)
 310               || (ch >= 0x0E47 && ch <= 0x0E4E)
 311               /* Avoid warning for <U0345>.  */
 312               || (ch == 0x0345)
 313               /* Avoid warnings for <U2160>..<U217F>.  */
 314               || (unicode_attributes[ch].category[0] == 'N'
 315                   && unicode_attributes[ch].category[1] == 'l')
 316               /* Avoid warnings for <U24B6>..<U24E9>.  */
 317               || (unicode_attributes[ch].category[0] == 'S'
 318                   && unicode_attributes[ch].category[1] == 'o'
 319                   && strstr (unicode_attributes[ch].name, " LETTER ")
 320                      != NULL)
 321               /* Consider all the non-ASCII digits as alphabetic.
 322                  ISO C 99 forbids us to have them in category "digit",
 323                  but we want iswalnum to return true on them.  */
 324               || (unicode_attributes[ch].category[0] == 'N'
 325                   && unicode_attributes[ch].category[1] == 'd'
 326                   && !(ch >= 0x0030 && ch <= 0x0039))));
 327 }
 328
 329 static bool
 330 is_digit (unsigned int ch)
 331 {
 332 #if 0
 333   return (unicode_attributes[ch].name != NULL
 334           && unicode_attributes[ch].category[0] == 'N'
 335           && unicode_attributes[ch].category[1] == 'd');
 336   /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
 337      a zero.  Must add <0> in front of them by hand.  */
 338 #else
 339   /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
 340      takes it away:
 341      7.25.2.1.5:
 342         The iswdigit function tests for any wide character that corresponds
 343         to a decimal-digit character (as defined in 5.2.1).
 344      5.2.1:
 345         the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
 346    */
 347   return (ch >= 0x0030 && ch <= 0x0039);
 348 #endif
 349 }
 350
 351 static bool
 352 is_outdigit (unsigned int ch)
 353 {
 354   return (ch >= 0x0030 && ch <= 0x0039);
 355 }
 356
 357 static bool
 358 is_blank (unsigned int ch)
 359 {
 360   return (ch == 0x0009 /* '\t' */
 361           /* Category Zs without mention of "<noBreak>" */
 362           || (unicode_attributes[ch].name != NULL
 363               && unicode_attributes[ch].category[0] == 'Z'
 364               && unicode_attributes[ch].category[1] == 's'
 365               && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
 366 }
 367
 368 static bool
 369 is_space (unsigned int ch)
 370 {
 371   /* Don't make U+00A0 a space. Non-breaking space means that all programs
 372      should treat it like a punctuation character, not like a space. */
 373   return (ch == 0x0020 /* ' ' */
 374           || ch == 0x000C /* '\f' */
 375           || ch == 0x000A /* '\n' */
 376           || ch == 0x000D /* '\r' */
 377           || ch == 0x0009 /* '\t' */
 378           || ch == 0x000B /* '\v' */
 379           /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
 380           || (unicode_attributes[ch].name != NULL
 381               && unicode_attributes[ch].category[0] == 'Z'
 382               && (unicode_attributes[ch].category[1] == 'l'
 383                   || unicode_attributes[ch].category[1] == 'p'
 384                   || (unicode_attributes[ch].category[1] == 's'
 385                       && !strstr (unicode_attributes[ch].decomposition,
 386                                   "<noBreak>")))));
 387 }
 388
 389 static bool
 390 is_cntrl (unsigned int ch)
 391 {
 392   return (unicode_attributes[ch].name != NULL
 393           && (!strcmp (unicode_attributes[ch].name, "<control>")
 394               /* Categories Zl and Zp */
 395               || (unicode_attributes[ch].category[0] == 'Z'
 396                   && (unicode_attributes[ch].category[1] == 'l'
 397                       || unicode_attributes[ch].category[1] == 'p'))));
 398 }
 399
 400 static bool
 401 is_xdigit (unsigned int ch)
 402 {
 403 #if 0
 404   return is_digit (ch)
 405          || (ch >= 0x0041 && ch <= 0x0046)
 406          || (ch >= 0x0061 && ch <= 0x0066);
 407 #else
 408   /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
 409      takes it away:
 410      7.25.2.1.12:
 411         The iswxdigit function tests for any wide character that corresponds
 412         to a hexadecimal-digit character (as defined in 6.4.4.1).
 413      6.4.4.1:
 414         hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
 415    */
 416   return (ch >= 0x0030 && ch <= 0x0039)
 417          || (ch >= 0x0041 && ch <= 0x0046)
 418          || (ch >= 0x0061 && ch <= 0x0066);
 419 #endif
 420 }
 421
 422 static bool
 423 is_graph (unsigned int ch)
 424 {
 425   return (unicode_attributes[ch].name != NULL
 426           && strcmp (unicode_attributes[ch].name, "<control>")
 427           && !is_space (ch));
 428 }
 429
 430 static bool
 431 is_print (unsigned int ch)
 432 {
 433   return (unicode_attributes[ch].name != NULL
 434           && strcmp (unicode_attributes[ch].name, "<control>")
 435           /* Categories Zl and Zp */
 436           && !(unicode_attributes[ch].name != NULL
 437                && unicode_attributes[ch].category[0] == 'Z'
 438                && (unicode_attributes[ch].category[1] == 'l'
 439                    || unicode_attributes[ch].category[1] == 'p')));
 440 }
 441
 442 static bool
 443 is_punct (unsigned int ch)
 444 {
 445 #if 0
 446   return (unicode_attributes[ch].name != NULL
 447           && unicode_attributes[ch].category[0] == 'P');
 448 #else
 449   /* The traditional POSIX definition of punctuation is every graphic,
 450      non-alphanumeric character.  */
 451   return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
 452 #endif
 453 }
 454
 455 static bool
 456 is_combining (unsigned int ch)
 457 {
 458   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
 459      file. In 3.0.1 it was identical to the union of the general categories
 460      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
 461      PropList.txt file, so we take the latter definition.  */
 462   return (unicode_attributes[ch].name != NULL
 463           && unicode_attributes[ch].category[0] == 'M'
 464           && (unicode_attributes[ch].category[1] == 'n'
 465               || unicode_attributes[ch].category[1] == 'c'
 466               || unicode_attributes[ch].category[1] == 'e'));
 467 }
 468
 469 static bool
 470 is_combining_level3 (unsigned int ch)
 471 {
 472   return is_combining (ch)
 473          && !(unicode_attributes[ch].combining[0] != '\0'
 474               && unicode_attributes[ch].combining[0] != '0'
 475               && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
 476 }
 477
 478 /* Return the UCS symbol string for a Unicode character.  */
 479 static const char *
 480 ucs_symbol (unsigned int i)
 481 {
 482   static char buf[11+1];
 483
 484   sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
 485   return buf;
 486 }
 487
 488 /* Return the UCS symbol range string for a Unicode characters interval.  */
 489 static const char *
 490 ucs_symbol_range (unsigned int low, unsigned int high)
 491 {
 492   static char buf[24+1];
 493
 494   strcpy (buf, ucs_symbol (low));
 495   strcat (buf, "..");
 496   strcat (buf, ucs_symbol (high));
 497   return buf;
 498 }
 499
 500 /* Output a character class (= property) table.  */
 501
 502 static void
 503 output_charclass (FILE *stream, const char *classname,
 504                   bool (*func) (unsigned int))
 505 {
 506   char table[0x110000];
 507   unsigned int i;
 508   bool need_semicolon;
 509   const int max_column = 75;
 510   int column;
 511
 512   for (i = 0; i < 0x110000; i++)
 513     table[i] = (int) func (i);
 514
 515   fprintf (stream, "%s ", classname);
 516   need_semicolon = false;
 517   column = 1000;
 518   for (i = 0; i < 0x110000; )
 519     {
 520       if (!table[i])
 521         i++;
 522       else
 523         {
 524           unsigned int low, high;
 525           char buf[25];
 526
 527           low = i;
 528           do
 529             i++;
 530           while (i < 0x110000 && table[i]);
 531           high = i - 1;
 532
 533           if (low == high)
 534             strcpy (buf, ucs_symbol (low));
 535           else
 536             strcpy (buf, ucs_symbol_range (low, high));
 537
 538           if (need_semicolon)
 539             {
 540               fprintf (stream, ";");
 541               column++;
 542             }
 543
 544           if (column + strlen (buf) > max_column)
 545             {
 546               fprintf (stream, "/\n   ");
 547               column = 3;
 548             }
 549
 550           fprintf (stream, "%s", buf);
 551           column += strlen (buf);
 552           need_semicolon = true;
 553         }
 554     }
 555   fprintf (stream, "\n");
 556 }
 557
 558 /* Output a character mapping table.  */
 559
 560 static void
 561 output_charmap (FILE *stream, const char *mapname,
 562                 unsigned int (*func) (unsigned int))
 563 {
 564   char table[0x110000];
 565   unsigned int i;
 566   bool need_semicolon;
 567   const int max_column = 75;
 568   int column;
 569
 570   for (i = 0; i < 0x110000; i++)
 571     table[i] = (func (i) != i);
 572
 573   fprintf (stream, "%s ", mapname);
 574   need_semicolon = false;
 575   column = 1000;
 576   for (i = 0; i < 0x110000; i++)
 577     if (table[i])
 578       {
 579         char buf[25+1];
 580
 581         strcpy (buf, "(");
 582         strcat (buf, ucs_symbol (i));
 583         strcat (buf, ",");
 584         strcat (buf, ucs_symbol (func (i)));
 585         strcat (buf, ")");
 586
 587         if (need_semicolon)
 588           {
 589             fprintf (stream, ";");
 590             column++;
 591           }
 592
 593         if (column + strlen (buf) > max_column)
 594           {
 595             fprintf (stream, "/\n   ");
 596             column = 3;
 597           }
 598
 599         fprintf (stream, "%s", buf);
 600         column += strlen (buf);
 601         need_semicolon = true;
 602       }
 603   fprintf (stream, "\n");
 604 }
 605
 606 /* Output the width table.  */
 607
 608 static void
 609 output_widthmap (FILE *stream)
 610 {
 611 }
 612
 613 /* Output the tables to the given file.  */
 614
 615 static void
 616 output_tables (const char *filename, const char *version)
 617 {
 618   FILE *stream;
 619   unsigned int ch;
 620
 621   stream = fopen (filename, "w");
 622   if (stream == NULL)
 623     {
 624       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 625       exit (1);
 626     }
 627
 628   fprintf (stream, "escape_char /\n");
 629   fprintf (stream, "comment_char %%\n");
 630   fprintf (stream, "\n");
 631   fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
 632            version);
 633   fprintf (stream, "\n");
 634
 635   fprintf (stream, "LC_IDENTIFICATION\n");
 636   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
 637   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
 638   fprintf (stream, "address   \"\"\n");
 639   fprintf (stream, "contact   \"\"\n");
 640   fprintf (stream, "email     \"bug-glibc-locales@gnu.org\"\n");
 641   fprintf (stream, "tel       \"\"\n");
 642   fprintf (stream, "fax       \"\"\n");
 643   fprintf (stream, "language  \"\"\n");
 644   fprintf (stream, "territory \"Earth\"\n");
 645   fprintf (stream, "revision  \"%s\"\n", version);
 646   {
 647     time_t now;
 648     char date[11];
 649     now = time (NULL);
 650     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
 651     fprintf (stream, "date      \"%s\"\n", date);
 652   }
 653   fprintf (stream, "category  \"unicode:2001\";LC_CTYPE\n");
 654   fprintf (stream, "END LC_IDENTIFICATION\n");
 655   fprintf (stream, "\n");
 656
 657   /* Verifications. */
 658   for (ch = 0; ch < 0x110000; ch++)
 659     {
 660       /* toupper restriction: "Only characters specified for the keywords
 661          lower and upper shall be specified.  */
 662       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 663         fprintf (stderr,
 664                  "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
 665                  ucs_symbol (ch), ch, to_upper (ch));
 666
 667       /* tolower restriction: "Only characters specified for the keywords
 668          lower and upper shall be specified.  */
 669       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 670         fprintf (stderr,
 671                  "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
 672                  ucs_symbol (ch), ch, to_lower (ch));
 673
 674       /* alpha restriction: "Characters classified as either upper or lower
 675          shall automatically belong to this class.  */
 676       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
 677         fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
 678
 679       /* alpha restriction: "No character specified for the keywords cntrl,
 680          digit, punct or space shall be specified."  */
 681       if (is_alpha (ch) && is_cntrl (ch))
 682         fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
 683       if (is_alpha (ch) && is_digit (ch))
 684         fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
 685       if (is_alpha (ch) && is_punct (ch))
 686         fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
 687       if (is_alpha (ch) && is_space (ch))
 688         fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
 689
 690       /* space restriction: "No character specified for the keywords upper,
 691          lower, alpha, digit, graph or xdigit shall be specified."
 692          upper, lower, alpha already checked above.  */
 693       if (is_space (ch) && is_digit (ch))
 694         fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
 695       if (is_space (ch) && is_graph (ch))
 696         fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
 697       if (is_space (ch) && is_xdigit (ch))
 698         fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
 699
 700       /* cntrl restriction: "No character specified for the keywords upper,
 701          lower, alpha, digit, punct, graph, print or xdigit shall be
 702          specified."  upper, lower, alpha already checked above.  */
 703       if (is_cntrl (ch) && is_digit (ch))
 704         fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
 705       if (is_cntrl (ch) && is_punct (ch))
 706         fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
 707       if (is_cntrl (ch) && is_graph (ch))
 708         fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
 709       if (is_cntrl (ch) && is_print (ch))
 710         fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
 711       if (is_cntrl (ch) && is_xdigit (ch))
 712         fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
 713
 714       /* punct restriction: "No character specified for the keywords upper,
 715          lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 716          be specified."  upper, lower, alpha, cntrl already checked above.  */
 717       if (is_punct (ch) && is_digit (ch))
 718         fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
 719       if (is_punct (ch) && is_xdigit (ch))
 720         fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
 721       if (is_punct (ch) && (ch == 0x0020))
 722         fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
 723
 724       /* graph restriction: "No character specified for the keyword cntrl
 725          shall be specified."  Already checked above.  */
 726
 727       /* print restriction: "No character specified for the keyword cntrl
 728          shall be specified."  Already checked above.  */
 729
 730       /* graph - print relation: differ only in the <space> character.
 731          How is this possible if there are more than one space character?!
 732          I think susv2/xbd/locale.html should speak of "space characters",
 733          not "space character".  */
 734       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
 735         fprintf (stderr,
 736                  "%s is print but not graph|<space>\n", ucs_symbol (ch));
 737       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
 738         fprintf (stderr,
 739                  "%s is graph|<space> but not print\n", ucs_symbol (ch));
 740     }
 741
 742   fprintf (stream, "LC_CTYPE\n");
 743   output_charclass (stream, "upper", is_upper);
 744   output_charclass (stream, "lower", is_lower);
 745   output_charclass (stream, "alpha", is_alpha);
 746   output_charclass (stream, "digit", is_digit);
 747   output_charclass (stream, "outdigit", is_outdigit);
 748   output_charclass (stream, "blank", is_blank);
 749   output_charclass (stream, "space", is_space);
 750   output_charclass (stream, "cntrl", is_cntrl);
 751   output_charclass (stream, "punct", is_punct);
 752   output_charclass (stream, "xdigit", is_xdigit);
 753   output_charclass (stream, "graph", is_graph);
 754   output_charclass (stream, "print", is_print);
 755   output_charclass (stream, "class \"combining\";", is_combining);
 756   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
 757   output_charmap (stream, "toupper", to_upper);
 758   output_charmap (stream, "tolower", to_lower);
 759   output_charmap (stream, "map \"totitle\";", to_title);
 760   output_widthmap (stream);
 761   fprintf (stream, "END LC_CTYPE\n");
 762
 763   if (ferror (stream) || fclose (stream))
 764     {
 765       fprintf (stderr, "error writing to '%s'\n", filename);
 766       exit (1);
 767     }
 768 }
 769
 770 int
 771 main (int argc, char * argv[])
 772 {
 773   if (argc != 3)
 774     {
 775       fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
 776       exit (1);
 777     }
 778
 779   fill_attributes (argv[1]);
 780
 781   output_tables ("unicode", argv[2]);
 782
 783   return 0;
 784 }