localedata/gen-unicode-ctype.c

   1 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
   2    Copyright (C) 2000-2001 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
/* Usage example:
     $ gen-unicode-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1
 */
  24
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <stdbool.h>
  28 #include <string.h>
  29 #include <time.h>
  30
  31 /* This structure represents one line in the UnicodeData.txt file.  */
  32 struct unicode_attribute
  33 {
  34   const char *name;           /* Character name */
  35   const char *category;       /* General category */
  36   const char *combining;      /* Canonical combining classes */
  37   const char *bidi;           /* Bidirectional category */
  38   const char *decomposition;  /* Character decomposition mapping */
  39   const char *decdigit;       /* Decimal digit value */
  40   const char *digit;          /* Digit value */
  41   const char *numeric;        /* Numeric value */
  42   int mirrored;               /* mirrored */
  43   const char *oldname;        /* Old Unicode 1.0 name */
  44   const char *comment;        /* Comment */
  45   unsigned int upper;         /* Uppercase mapping */
  46   unsigned int lower;         /* Lowercase mapping */
  47   unsigned int title;         /* Titlecase mapping */
  48 };
  49
  50 /* Missing fields are represented with "" for strings, and NONE for
  51    characters.  */
  52 #define NONE (~(unsigned int)0)
  53
  54 /* The entire contents of the UnicodeData.txt file.  */
  55 struct unicode_attribute unicode_attributes [0x110000];
  56
  57 /* Stores in unicode_attributes[i] the values from the given fields.  */
  58 static void
  59 fill_attribute (unsigned int i,
  60                 const char *field1, const char *field2,
  61                 const char *field3, const char *field4,
  62                 const char *field5, const char *field6,
  63                 const char *field7, const char *field8,
  64                 const char *field9, const char *field10,
  65                 const char *field11, const char *field12,
  66                 const char *field13, const char *field14)
  67 {
  68   struct unicode_attribute * uni;
  69
  70   if (i >= 0x110000)
  71     {
  72       fprintf (stderr, "index too large\n");
  73       exit (1);
  74     }
  75   if (strcmp (field2, "Cs") == 0)
  76     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
  77     return;
  78   uni = &unicode_attributes[i];
  79   /* Copy the strings.  */
  80   uni->name          = strdup (field1);
  81   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
  82   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
  83   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
  84   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  85   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
  86   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
  87   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
  88   uni->mirrored      = (field9[0] == 'Y');
  89   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
  90   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
  91   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  92   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  93   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
  94 }
  95
  96 /* Maximum length of a field in the UnicodeData.txt file.  */
  97 #define FIELDLEN 120
  98
  99 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
 100    Reads up to (but excluding) DELIM.
 101    Returns 1 when a field was successfully read, otherwise 0.  */
 102 static int
 103 getfield (FILE *stream, char *buffer, int delim)
 104 {
 105   int count = 0;
 106   int c;
 107
 108   for (; (c = getc (stream)), (c != EOF && c != delim); )
 109     {
 110       /* The original unicode.org UnicodeData.txt file happens to have
 111          CR/LF line terminators.  Silently convert to LF.  */
 112       if (c == '\r')
 113         continue;
 114
 115       /* Put c into the buffer.  */
 116       if (++count >= FIELDLEN - 1)
 117         {
 118           fprintf (stderr, "field too long\n");
 119           exit (1);
 120         }
 121       *buffer++ = c;
 122     }
 123
 124   if (c == EOF)
 125     return 0;
 126
 127   *buffer = '\0';
 128   return 1;
 129 }
 130
 131 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 132    file.  */
 133 static void
 134 fill_attributes (const char *unicodedata_filename)
 135 {
 136   unsigned int i, j;
 137   FILE *stream;
 138   char field0[FIELDLEN];
 139   char field1[FIELDLEN];
 140   char field2[FIELDLEN];
 141   char field3[FIELDLEN];
 142   char field4[FIELDLEN];
 143   char field5[FIELDLEN];
 144   char field6[FIELDLEN];
 145   char field7[FIELDLEN];
 146   char field8[FIELDLEN];
 147   char field9[FIELDLEN];
 148   char field10[FIELDLEN];
 149   char field11[FIELDLEN];
 150   char field12[FIELDLEN];
 151   char field13[FIELDLEN];
 152   char field14[FIELDLEN];
 153   int lineno = 0;
 154
 155   for (i = 0; i < 0x110000; i++)
 156     unicode_attributes[i].name = NULL;
 157
 158   stream = fopen (unicodedata_filename, "r");
 159   if (stream == NULL)
 160     {
 161       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 162       exit (1);
 163     }
 164
 165   for (;;)
 166     {
 167       int n;
 168
 169       lineno++;
 170       n = getfield (stream, field0, ';');
 171       n += getfield (stream, field1, ';');
 172       n += getfield (stream, field2, ';');
 173       n += getfield (stream, field3, ';');
 174       n += getfield (stream, field4, ';');
 175       n += getfield (stream, field5, ';');
 176       n += getfield (stream, field6, ';');
 177       n += getfield (stream, field7, ';');
 178       n += getfield (stream, field8, ';');
 179       n += getfield (stream, field9, ';');
 180       n += getfield (stream, field10, ';');
 181       n += getfield (stream, field11, ';');
 182       n += getfield (stream, field12, ';');
 183       n += getfield (stream, field13, ';');
 184       n += getfield (stream, field14, '\n');
 185       if (n == 0)
 186         break;
 187       if (n != 15)
 188         {
 189           fprintf (stderr, "short line in'%s':%d\n",
 190                    unicodedata_filename, lineno);
 191           exit (1);
 192         }
 193       i = strtoul (field0, NULL, 16);
 194       if (field1[0] == '<'
 195           && strlen (field1) >= 9
 196           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
 197         {
 198           /* Deal with a range. */
 199           lineno++;
 200           n = getfield (stream, field0, ';');
 201           n += getfield (stream, field1, ';');
 202           n += getfield (stream, field2, ';');
 203           n += getfield (stream, field3, ';');
 204           n += getfield (stream, field4, ';');
 205           n += getfield (stream, field5, ';');
 206           n += getfield (stream, field6, ';');
 207           n += getfield (stream, field7, ';');
 208           n += getfield (stream, field8, ';');
 209           n += getfield (stream, field9, ';');
 210           n += getfield (stream, field10, ';');
 211           n += getfield (stream, field11, ';');
 212           n += getfield (stream, field12, ';');
 213           n += getfield (stream, field13, ';');
 214           n += getfield (stream, field14, '\n');
 215           if (n != 15)
 216             {
 217               fprintf (stderr, "missing end range in '%s':%d\n",
 218                        unicodedata_filename, lineno);
 219               exit (1);
 220             }
 221           if (!(field1[0] == '<'
 222                 && strlen (field1) >= 8
 223                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
 224             {
 225               fprintf (stderr, "missing end range in '%s':%d\n",
 226                        unicodedata_filename, lineno);
 227               exit (1);
 228             }
 229           field1[strlen (field1) - 7] = '\0';
 230           j = strtoul (field0, NULL, 16);
 231           for (; i <= j; i++)
 232             fill_attribute (i, field1+1, field2, field3, field4, field5,
 233                                field6, field7, field8, field9, field10,
 234                                field11, field12, field13, field14);
 235         }
 236       else
 237         {
 238           /* Single character line */
 239           fill_attribute (i, field1, field2, field3, field4, field5,
 240                              field6, field7, field8, field9, field10,
 241                              field11, field12, field13, field14);
 242         }
 243     }
 244   if (ferror (stream) || fclose (stream))
 245     {
 246       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 247       exit (1);
 248     }
 249 }
 250
 251 /* Character mappings.  */
 252
 253 static unsigned int
 254 to_upper (unsigned int ch)
 255 {
 256   if (unicode_attributes[ch].name != NULL
 257       && unicode_attributes[ch].upper != NONE)
 258     return unicode_attributes[ch].upper;
 259   else
 260     return ch;
 261 }
 262
 263 static unsigned int
 264 to_lower (unsigned int ch)
 265 {
 266   if (unicode_attributes[ch].name != NULL
 267       && unicode_attributes[ch].lower != NONE)
 268     return unicode_attributes[ch].lower;
 269   else
 270     return ch;
 271 }
 272
 273 static unsigned int
 274 to_title (unsigned int ch)
 275 {
 276   if (unicode_attributes[ch].name != NULL
 277       && unicode_attributes[ch].title != NONE)
 278     return unicode_attributes[ch].title;
 279   else
 280     return ch;
 281 }
 282
 283 /* Character class properties.  */
 284
 285 static bool
 286 is_upper (unsigned int ch)
 287 {
 288   return (to_lower (ch) != ch);
 289 }
 290
 291 static bool
 292 is_lower (unsigned int ch)
 293 {
 294   return (to_upper (ch) != ch)
 295          /* <U00DF> is lowercase, but without simple to_upper mapping.  */
 296          || (ch == 0x00DF);
 297 }
 298
 299 static bool
 300 is_alpha (unsigned int ch)
 301 {
 302   return (unicode_attributes[ch].name != NULL
 303           && ((unicode_attributes[ch].category[0] == 'L'
 304                /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
 305                   <U0E2F>, <U0E46> should belong to is_punct.  */
 306                && (ch != 0x0E2F) && (ch != 0x0E46))
 307               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
 308                  <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
 309               || (ch == 0x0E31)
 310               || (ch >= 0x0E34 && ch <= 0x0E3A)
 311               || (ch >= 0x0E47 && ch <= 0x0E4E)
 312               /* Avoid warning for <U0345>.  */
 313               || (ch == 0x0345)
 314               /* Avoid warnings for <U2160>..<U217F>.  */
 315               || (unicode_attributes[ch].category[0] == 'N'
 316                   && unicode_attributes[ch].category[1] == 'l')
 317               /* Avoid warnings for <U24B6>..<U24E9>.  */
 318               || (unicode_attributes[ch].category[0] == 'S'
 319                   && unicode_attributes[ch].category[1] == 'o'
 320                   && strstr (unicode_attributes[ch].name, " LETTER ")
 321                      != NULL)
 322               /* Consider all the non-ASCII digits as alphabetic.
 323                  ISO C 99 forbids us to have them in category "digit",
 324                  but we want iswalnum to return true on them.  */
 325               || (unicode_attributes[ch].category[0] == 'N'
 326                   && unicode_attributes[ch].category[1] == 'd'
 327                   && !(ch >= 0x0030 && ch <= 0x0039))));
 328 }
 329
 330 static bool
 331 is_digit (unsigned int ch)
 332 {
 333 #if 0
 334   return (unicode_attributes[ch].name != NULL
 335           && unicode_attributes[ch].category[0] == 'N'
 336           && unicode_attributes[ch].category[1] == 'd');
 337   /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
 338      a zero.  Must add <0> in front of them by hand.  */
 339 #else
 340   /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
 341      takes it away:
 342      7.25.2.1.5:
 343         The iswdigit function tests for any wide character that corresponds
 344         to a decimal-digit character (as defined in 5.2.1).
 345      5.2.1:
 346         the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
 347    */
 348   return (ch >= 0x0030 && ch <= 0x0039);
 349 #endif
 350 }
 351
 352 static bool
 353 is_outdigit (unsigned int ch)
 354 {
 355   return (ch >= 0x0030 && ch <= 0x0039);
 356 }
 357
 358 static bool
 359 is_blank (unsigned int ch)
 360 {
 361   return (ch == 0x0009 /* '\t' */
 362           /* Category Zs without mention of "<noBreak>" */
 363           || (unicode_attributes[ch].name != NULL
 364               && unicode_attributes[ch].category[0] == 'Z'
 365               && unicode_attributes[ch].category[1] == 's'
 366               && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
 367 }
 368
 369 static bool
 370 is_space (unsigned int ch)
 371 {
 372   /* Don't make U+00A0 a space. Non-breaking space means that all programs
 373      should treat it like a punctuation character, not like a space. */
 374   return (ch == 0x0020 /* ' ' */
 375           || ch == 0x000C /* '\f' */
 376           || ch == 0x000A /* '\n' */
 377           || ch == 0x000D /* '\r' */
 378           || ch == 0x0009 /* '\t' */
 379           || ch == 0x000B /* '\v' */
 380           /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
 381           || (unicode_attributes[ch].name != NULL
 382               && unicode_attributes[ch].category[0] == 'Z'
 383               && (unicode_attributes[ch].category[1] == 'l'
 384                   || unicode_attributes[ch].category[1] == 'p'
 385                   || (unicode_attributes[ch].category[1] == 's'
 386                       && !strstr (unicode_attributes[ch].decomposition,
 387                                   "<noBreak>")))));
 388 }
 389
 390 static bool
 391 is_cntrl (unsigned int ch)
 392 {
 393   return (unicode_attributes[ch].name != NULL
 394           && (!strcmp (unicode_attributes[ch].name, "<control>")
 395               /* Categories Zl and Zp */
 396               || (unicode_attributes[ch].category[0] == 'Z'
 397                   && (unicode_attributes[ch].category[1] == 'l'
 398                       || unicode_attributes[ch].category[1] == 'p'))));
 399 }
 400
 401 static bool
 402 is_xdigit (unsigned int ch)
 403 {
 404 #if 0
 405   return is_digit (ch)
 406          || (ch >= 0x0041 && ch <= 0x0046)
 407          || (ch >= 0x0061 && ch <= 0x0066);
 408 #else
 409   /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
 410      takes it away:
 411      7.25.2.1.12:
 412         The iswxdigit function tests for any wide character that corresponds
 413         to a hexadecimal-digit character (as defined in 6.4.4.1).
 414      6.4.4.1:
 415         hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
 416    */
 417   return (ch >= 0x0030 && ch <= 0x0039)
 418          || (ch >= 0x0041 && ch <= 0x0046)
 419          || (ch >= 0x0061 && ch <= 0x0066);
 420 #endif
 421 }
 422
 423 static bool
 424 is_graph (unsigned int ch)
 425 {
 426   return (unicode_attributes[ch].name != NULL
 427           && strcmp (unicode_attributes[ch].name, "<control>")
 428           && !is_space (ch));
 429 }
 430
 431 static bool
 432 is_print (unsigned int ch)
 433 {
 434   return (unicode_attributes[ch].name != NULL
 435           && strcmp (unicode_attributes[ch].name, "<control>")
 436           /* Categories Zl and Zp */
 437           && !(unicode_attributes[ch].name != NULL
 438                && unicode_attributes[ch].category[0] == 'Z'
 439                && (unicode_attributes[ch].category[1] == 'l'
 440                    || unicode_attributes[ch].category[1] == 'p')));
 441 }
 442
 443 static bool
 444 is_punct (unsigned int ch)
 445 {
 446 #if 0
 447   return (unicode_attributes[ch].name != NULL
 448           && unicode_attributes[ch].category[0] == 'P');
 449 #else
 450   /* The traditional POSIX definition of punctuation is every graphic,
 451      non-alphanumeric character.  */
 452   return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
 453 #endif
 454 }
 455
 456 static bool
 457 is_combining (unsigned int ch)
 458 {
 459   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
 460      file. In 3.0.1 it was identical to the union of the general categories
 461      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
 462      PropList.txt file, so we take the latter definition.  */
 463   return (unicode_attributes[ch].name != NULL
 464           && unicode_attributes[ch].category[0] == 'M'
 465           && (unicode_attributes[ch].category[1] == 'n'
 466               || unicode_attributes[ch].category[1] == 'c'
 467               || unicode_attributes[ch].category[1] == 'e'));
 468 }
 469
 470 static bool
 471 is_combining_level3 (unsigned int ch)
 472 {
 473   return is_combining (ch)
 474          && !(unicode_attributes[ch].combining[0] != '\0'
 475               && unicode_attributes[ch].combining[0] != '0'
 476               && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
 477 }
 478
 479 /* Return the UCS symbol string for a Unicode character.  */
 480 static const char *
 481 ucs_symbol (unsigned int i)
 482 {
 483   static char buf[11+1];
 484
 485   sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
 486   return buf;
 487 }
 488
 489 /* Return the UCS symbol range string for a Unicode characters interval.  */
 490 static const char *
 491 ucs_symbol_range (unsigned int low, unsigned int high)
 492 {
 493   static char buf[24+1];
 494
 495   strcpy (buf, ucs_symbol (low));
 496   strcat (buf, "..");
 497   strcat (buf, ucs_symbol (high));
 498   return buf;
 499 }
 500
 501 /* Output a character class (= property) table.  */
 502
 503 static void
 504 output_charclass (FILE *stream, const char *classname,
 505                   bool (*func) (unsigned int))
 506 {
 507   char table[0x110000];
 508   unsigned int i;
 509   bool need_semicolon;
 510   const int max_column = 75;
 511   int column;
 512
 513   for (i = 0; i < 0x110000; i++)
 514     table[i] = (int) func (i);
 515
 516   fprintf (stream, "%s ", classname);
 517   need_semicolon = false;
 518   column = 1000;
 519   for (i = 0; i < 0x110000; )
 520     {
 521       if (!table[i])
 522         i++;
 523       else
 524         {
 525           unsigned int low, high;
 526           char buf[25];
 527
 528           low = i;
 529           do
 530             i++;
 531           while (i < 0x110000 && table[i]);
 532           high = i - 1;
 533
 534           if (low == high)
 535             strcpy (buf, ucs_symbol (low));
 536           else
 537             strcpy (buf, ucs_symbol_range (low, high));
 538
 539           if (need_semicolon)
 540             {
 541               fprintf (stream, ";");
 542               column++;
 543             }
 544
 545           if (column + strlen (buf) > max_column)
 546             {
 547               fprintf (stream, "/\n   ");
 548               column = 3;
 549             }
 550
 551           fprintf (stream, "%s", buf);
 552           column += strlen (buf);
 553           need_semicolon = true;
 554         }
 555     }
 556   fprintf (stream, "\n");
 557 }
 558
 559 /* Output a character mapping table.  */
 560
 561 static void
 562 output_charmap (FILE *stream, const char *mapname,
 563                 unsigned int (*func) (unsigned int))
 564 {
 565   char table[0x110000];
 566   unsigned int i;
 567   bool need_semicolon;
 568   const int max_column = 75;
 569   int column;
 570
 571   for (i = 0; i < 0x110000; i++)
 572     table[i] = (func (i) != i);
 573
 574   fprintf (stream, "%s ", mapname);
 575   need_semicolon = false;
 576   column = 1000;
 577   for (i = 0; i < 0x110000; i++)
 578     if (table[i])
 579       {
 580         char buf[25+1];
 581
 582         strcpy (buf, "(");
 583         strcat (buf, ucs_symbol (i));
 584         strcat (buf, ",");
 585         strcat (buf, ucs_symbol (func (i)));
 586         strcat (buf, ")");
 587
 588         if (need_semicolon)
 589           {
 590             fprintf (stream, ";");
 591             column++;
 592           }
 593
 594         if (column + strlen (buf) > max_column)
 595           {
 596             fprintf (stream, "/\n   ");
 597             column = 3;
 598           }
 599
 600         fprintf (stream, "%s", buf);
 601         column += strlen (buf);
 602         need_semicolon = true;
 603       }
 604   fprintf (stream, "\n");
 605 }
 606
 607 /* Output the width table.  */
 608
 609 static void
 610 output_widthmap (FILE *stream)
 611 {
 612 }
 613
 614 /* Output the tables to the given file.  */
 615
 616 static void
 617 output_tables (const char *filename, const char *version)
 618 {
 619   FILE *stream;
 620   unsigned int ch;
 621
 622   stream = fopen (filename, "w");
 623   if (stream == NULL)
 624     {
 625       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 626       exit (1);
 627     }
 628
 629   fprintf (stream, "escape_char /\n");
 630   fprintf (stream, "comment_char %%\n");
 631   fprintf (stream, "\n");
 632   fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
 633            version);
 634   fprintf (stream, "\n");
 635
 636   fprintf (stream, "LC_IDENTIFICATION\n");
 637   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
 638   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
 639   fprintf (stream, "address   \"\"\n");
 640   fprintf (stream, "contact   \"\"\n");
 641   fprintf (stream, "email     \"bug-glibc@gnu.org\"\n");
 642   fprintf (stream, "tel       \"\"\n");
 643   fprintf (stream, "fax       \"\"\n");
 644   fprintf (stream, "language  \"\"\n");
 645   fprintf (stream, "territory \"Earth\"\n");
 646   fprintf (stream, "revision  \"%s\"\n", version);
 647   {
 648     time_t now;
 649     char date[11];
 650     now = time (NULL);
 651     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
 652     fprintf (stream, "date      \"%s\"\n", date);
 653   }
 654   fprintf (stream, "category  \"unicode:2001\";LC_CTYPE\n");
 655   fprintf (stream, "END LC_IDENTIFICATION\n");
 656   fprintf (stream, "\n");
 657
 658   /* Verifications. */
 659   for (ch = 0; ch < 0x110000; ch++)
 660     {
 661       /* toupper restriction: "Only characters specified for the keywords
 662          lower and upper shall be specified.  */
 663       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 664         fprintf (stderr,
 665                  "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
 666                  ucs_symbol (ch), ch, to_upper (ch));
 667
 668       /* tolower restriction: "Only characters specified for the keywords
 669          lower and upper shall be specified.  */
 670       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
 671         fprintf (stderr,
 672                  "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
 673                  ucs_symbol (ch), ch, to_lower (ch));
 674
 675       /* alpha restriction: "Characters classified as either upper or lower
 676          shall automatically belong to this class.  */
 677       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
 678         fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
 679
 680       /* alpha restriction: "No character specified for the keywords cntrl,
 681          digit, punct or space shall be specified."  */
 682       if (is_alpha (ch) && is_cntrl (ch))
 683         fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
 684       if (is_alpha (ch) && is_digit (ch))
 685         fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
 686       if (is_alpha (ch) && is_punct (ch))
 687         fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
 688       if (is_alpha (ch) && is_space (ch))
 689         fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
 690
 691       /* space restriction: "No character specified for the keywords upper,
 692          lower, alpha, digit, graph or xdigit shall be specified."
 693          upper, lower, alpha already checked above.  */
 694       if (is_space (ch) && is_digit (ch))
 695         fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
 696       if (is_space (ch) && is_graph (ch))
 697         fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
 698       if (is_space (ch) && is_xdigit (ch))
 699         fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
 700
 701       /* cntrl restriction: "No character specified for the keywords upper,
 702          lower, alpha, digit, punct, graph, print or xdigit shall be
 703          specified."  upper, lower, alpha already checked above.  */
 704       if (is_cntrl (ch) && is_digit (ch))
 705         fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
 706       if (is_cntrl (ch) && is_punct (ch))
 707         fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
 708       if (is_cntrl (ch) && is_graph (ch))
 709         fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
 710       if (is_cntrl (ch) && is_print (ch))
 711         fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
 712       if (is_cntrl (ch) && is_xdigit (ch))
 713         fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
 714
 715       /* punct restriction: "No character specified for the keywords upper,
 716          lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 717          be specified."  upper, lower, alpha, cntrl already checked above.  */
 718       if (is_punct (ch) && is_digit (ch))
 719         fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
 720       if (is_punct (ch) && is_xdigit (ch))
 721         fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
 722       if (is_punct (ch) && (ch == 0x0020))
 723         fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
 724
 725       /* graph restriction: "No character specified for the keyword cntrl
 726          shall be specified."  Already checked above.  */
 727
 728       /* print restriction: "No character specified for the keyword cntrl
 729          shall be specified."  Already checked above.  */
 730
 731       /* graph - print relation: differ only in the <space> character.
 732          How is this possible if there are more than one space character?!
 733          I think susv2/xbd/locale.html should speak of "space characters",
 734          not "space character".  */
 735       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
 736         fprintf (stderr,
 737                  "%s is print but not graph|<space>\n", ucs_symbol (ch));
 738       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
 739         fprintf (stderr,
 740                  "%s is graph|<space> but not print\n", ucs_symbol (ch));
 741     }
 742
 743   fprintf (stream, "LC_CTYPE\n");
 744   output_charclass (stream, "upper", is_upper);
 745   output_charclass (stream, "lower", is_lower);
 746   output_charclass (stream, "alpha", is_alpha);
 747   output_charclass (stream, "digit", is_digit);
 748   output_charclass (stream, "outdigit", is_outdigit);
 749   output_charclass (stream, "blank", is_blank);
 750   output_charclass (stream, "space", is_space);
 751   output_charclass (stream, "cntrl", is_cntrl);
 752   output_charclass (stream, "punct", is_punct);
 753   output_charclass (stream, "xdigit", is_xdigit);
 754   output_charclass (stream, "graph", is_graph);
 755   output_charclass (stream, "print", is_print);
 756   output_charclass (stream, "class \"combining\";", is_combining);
 757   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
 758   output_charmap (stream, "toupper", to_upper);
 759   output_charmap (stream, "tolower", to_lower);
 760   output_charmap (stream, "map \"totitle\";", to_title);
 761   output_widthmap (stream);
 762   fprintf (stream, "END LC_CTYPE\n");
 763
 764   if (ferror (stream) || fclose (stream))
 765     {
 766       fprintf (stderr, "error writing to '%s'\n", filename);
 767       exit (1);
 768     }
 769 }
 770
 771 int
 772 main (int argc, char * argv[])
 773 {
 774   if (argc != 3)
 775     {
 776       fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
 777       exit (1);
 778     }
 779
 780   fill_attributes (argv[1]);
 781
 782   output_tables ("unicode", argv[2]);
 783
 784   return 0;
 785 }