locale/programs/charmap.c

   1 /* Copyright (C) 1996,1998,1999,2000,2001 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <limits.h>
  28 #include <obstack.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "error.h"
  34 #include "linereader.h"
  35 #include "charmap.h"
  36 #include "charmap-dir.h"
  37 #include "repertoire.h"
  38
  39 #include <assert.h>
  40
  41
  42 /* Define the lookup function.  */
  43 #include "charmap-kw.h"
  44
  45
  46 extern void *xmalloc (size_t __n);
  47
  48 /* Prototypes for local functions.  */
  49 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  50                                         int verbose, int be_quiet);
  51 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  52                        const char *from, const char *to,
  53                        unsigned long int width);
  54 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  55                               int nbytes, char *bytes, const char *from,
  56                               const char *to, int decimal_ellipsis, int step);
  57
  58
  59 #ifdef NEED_NULL_POINTER
  60 static const char *null_pointer;
  61 #endif
  62
  63 static struct linereader *
  64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  65 {
  66   FILE *fp;
  67
  68   fp = charmap_open (directory, name);
  69   if (fp == NULL)
  70     return NULL;
  71   else
  72     {
  73       size_t dlen = strlen (directory);
  74       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  75       size_t nlen = strlen (name);
  76       char *pathname;
  77       char *p;
  78
  79       pathname = alloca (dlen + add_slash + nlen + 1);
  80       p = stpcpy (pathname, directory);
  81       if (add_slash)
  82         *p++ = '/';
  83       stpcpy (p, name);
  84
  85       return lr_create (fp, pathname, hf);
  86     }
  87 }
  88
  89 struct charmap_t *
  90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   char path[strlen (i18npath) + sizeof ("/charmaps")];
 110                   char *next;
 111                   i18npath = strdupa (i18npath);
 112
 113                   while (cmfile == NULL
 114                          && (next = strsep (&i18npath, ":")) != NULL)
 115                     {
 116                       stpcpy (stpcpy (path, next), "/charmaps");
 117                       cmfile = cmlr_open (path, filename, charmap_hash);
 118
 119                       if (cmfile == NULL)
 120                         /* Try without the "/charmaps" part.  */
 121                         cmfile = cmlr_open (next, filename, charmap_hash);
 122                     }
 123                 }
 124
 125               if (cmfile == NULL)
 126                 /* Try the default directory.  */
 127                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 128             }
 129         }
 130
 131       if (cmfile != NULL)
 132         {
 133           result = parse_charmap (cmfile, verbose, be_quiet);
 134
 135           if (result == NULL && !be_quiet)
 136             error (0, errno, _("character map file `%s' not found"), filename);
 137         }
 138     }
 139
 140   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 141     {
 142       /* OK, one more try.  We also accept the names given to the
 143          character sets in the files.  Sometimes they differ from the
 144          file name.  */
 145       CHARMAP_DIR *dir;
 146
 147       dir = charmap_opendir (CHARMAP_PATH);
 148       if (dir != NULL)
 149         {
 150           const char *dirent;
 151
 152           while ((dirent = charmap_readdir (dir)) != NULL)
 153             {
 154               char **aliases;
 155               char **p;
 156               int found;
 157
 158               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 159               found = 0;
 160               for (p = aliases; *p; p++)
 161                 if (strcasecmp (*p, filename) == 0)
 162                   {
 163                     found = 1;
 164                     break;
 165                   }
 166               charmap_free_aliases (aliases);
 167
 168               if (found)
 169                 {
 170                   struct linereader *cmfile;
 171
 172                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 173                   if (cmfile != NULL)
 174                     result = parse_charmap (cmfile, verbose, be_quiet);
 175
 176                   break;
 177                 }
 178             }
 179
 180           charmap_closedir (dir);
 181         }
 182     }
 183
 184   if (result == NULL && DEFAULT_CHARMAP != NULL)
 185     {
 186       struct linereader *cmfile;
 187
 188       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 189       if (cmfile != NULL)
 190         result = parse_charmap (cmfile, verbose, be_quiet);
 191
 192       if (result == NULL)
 193         error (4, errno, _("default character map file `%s' not found"),
 194                DEFAULT_CHARMAP);
 195     }
 196
 197   /* Test of ASCII compatibility of locale encoding.
 198
 199      Verify that the encoding to be used in a locale is ASCII compatible,
 200      at least for the graphic characters, excluding the control characters,
 201      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 202
 203      ISO C 99 section 7.17.(2) (about wchar_t):
 204        the null character shall have the code value zero and each member of
 205        the basic character set shall have a code value equal to its value
 206        when used as the lone character in an integer character constant.
 207      ISO C 99 section 5.2.1.(3):
 208        Both the basic source and basic execution character sets shall have
 209        the following members: the 26 uppercase letters of the Latin alphabet
 210             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 211        the 26 lowercase letters of the Latin alphabet
 212             a b c d e f g h i j k l m n o p q r s t u v w x y z
 213        the 10 decimal digits
 214             0 1 2 3 4 5 6 7 8 9
 215        the following 29 graphic characters
 216             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 217        the space character, and control characters representing horizontal
 218        tab, vertical tab, and form feed.
 219
 220      Therefore, for all members of the "basic character set", the 'char' code
 221      must have the same value as the 'wchar_t' code, which in glibc is the
 222      same as the Unicode code, which for all of the enumerated characters
 223      is identical to the ASCII code. */
 224   if (result != NULL && use_default)
 225     {
 226       static const char basic_charset[] =
 227         {
 228           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 229           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 230           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 231           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 232           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 233           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 234           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 235           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 236         };
 237       int failed = 0;
 238       const char *p = basic_charset;
 239
 240       do
 241         {
 242           struct charseq * seq = charmap_find_symbol (result, p, 1);
 243
 244           if (seq == NULL || seq->ucs4 != *p)
 245             failed = 1;
 246         }
 247       while (*p++ != '\0');
 248
 249       if (failed)
 250         fprintf (stderr, _("\
 251 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 252                  result->code_set_name);
 253     }
 254
 255   return result;
 256 }
 257
 258
 259 static struct charmap_t *
 260 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 261 {
 262   struct charmap_t *result;
 263   int state;
 264   enum token_t expected_tok = tok_error;
 265   const char *expected_str = NULL;
 266   char *from_name = NULL;
 267   char *to_name = NULL;
 268   enum token_t ellipsis = 0;
 269   int step = 1;
 270
 271   /* We don't want symbolic names in string to be translated.  */
 272   cmfile->translate_strings = 0;
 273
 274   /* Allocate room for result.  */
 275   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 276   memset (result, '\0', sizeof (struct charmap_t));
 277   /* The default DEFAULT_WIDTH is 1.  */
 278   result->width_default = 1;
 279
 280 #define obstack_chunk_alloc malloc
 281 #define obstack_chunk_free free
 282   obstack_init (&result->mem_pool);
 283
 284   if (init_hash (&result->char_table, 256)
 285       || init_hash (&result->byte_table, 256))
 286     {
 287       free (result);
 288       return NULL;
 289     }
 290
 291   /* We use a state machine to describe the charmap description file
 292      format.  */
 293   state = 1;
 294   while (1)
 295     {
 296       /* What's on?  */
 297       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 298       enum token_t nowtok = now->tok;
 299       struct token *arg;
 300
 301       if (nowtok == tok_eof)
 302         break;
 303
 304       switch (state)
 305         {
 306         case 1:
 307           /* The beginning.  We expect the special declarations, EOL or
 308              `CHARMAP'.  */
 309           if (nowtok == tok_eol)
 310             /* Ignore empty lines.  */
 311             continue;
 312
 313           if (nowtok == tok_charmap)
 314             {
 315               from_name = NULL;
 316               to_name = NULL;
 317
 318               /* We have to set up the real work.  Fill in some
 319                  default values.  */
 320               if (result->mb_cur_max == 0)
 321                 result->mb_cur_max = 1;
 322               if (result->mb_cur_min == 0)
 323                 result->mb_cur_min = result->mb_cur_max;
 324               if (result->mb_cur_min > result->mb_cur_max)
 325                 {
 326                   if (!be_quiet)
 327                     error (0, 0, _("\
 328 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 329                            cmfile->fname);
 330
 331                   result->mb_cur_min = result->mb_cur_max;
 332                 }
 333
 334               lr_ignore_rest (cmfile, 1);
 335
 336               state = 2;
 337               continue;
 338             }
 339
 340           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 341               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 342               && nowtok != tok_comment_char && nowtok != tok_g0esc
 343               && nowtok != tok_g1esc && nowtok != tok_g2esc
 344               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 345               && nowtok != tok_include)
 346             {
 347               lr_error (cmfile, _("syntax error in prolog: %s"),
 348                         _("invalid definition"));
 349
 350               lr_ignore_rest (cmfile, 0);
 351               continue;
 352             }
 353
 354           /* We know that we need an argument.  */
 355           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 356
 357           switch (nowtok)
 358             {
 359             case tok_code_set_name:
 360             case tok_repertoiremap:
 361               if (arg->tok != tok_ident && arg->tok != tok_string)
 362                 {
 363                 badarg:
 364                   lr_error (cmfile, _("syntax error in prolog: %s"),
 365                             _("bad argument"));
 366
 367                   lr_ignore_rest (cmfile, 0);
 368                   continue;
 369                 }
 370
 371               if (nowtok == tok_code_set_name)
 372                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 373                                                        arg->val.str.startmb,
 374                                                        arg->val.str.lenmb);
 375               else
 376                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 377                                                        arg->val.str.startmb,
 378                                                        arg->val.str.lenmb);
 379
 380               lr_ignore_rest (cmfile, 1);
 381               continue;
 382
 383             case tok_mb_cur_max:
 384             case tok_mb_cur_min:
 385               if (arg->tok != tok_number)
 386                 goto badarg;
 387
 388               if (verbose
 389                   && ((nowtok == tok_mb_cur_max
 390                        && result->mb_cur_max != 0)
 391                       || (nowtok == tok_mb_cur_max
 392                           && result->mb_cur_max != 0)))
 393                 lr_error (cmfile, _("duplicate definition of <%s>"),
 394                           nowtok == tok_mb_cur_min
 395                           ? "mb_cur_min" : "mb_cur_max");
 396
 397               if (arg->val.num < 1)
 398                 {
 399                   lr_error (cmfile,
 400                             _("value for <%s> must be 1 or greater"),
 401                             nowtok == tok_mb_cur_min
 402                             ? "mb_cur_min" : "mb_cur_max");
 403
 404                   lr_ignore_rest (cmfile, 0);
 405                   continue;
 406                 }
 407               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 408                    && (int) arg->val.num < result->mb_cur_min)
 409                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 410                       && (int) arg->val.num > result->mb_cur_max))
 411                 {
 412                   lr_error (cmfile, _("\
 413 value of <%s> must be greater or equal than the value of <%s>"),
 414                             "mb_cur_max", "mb_cur_min");
 415
 416                   lr_ignore_rest (cmfile, 0);
 417                   continue;
 418                 }
 419
 420               if (nowtok == tok_mb_cur_max)
 421                 result->mb_cur_max = arg->val.num;
 422               else
 423                 result->mb_cur_min = arg->val.num;
 424
 425               lr_ignore_rest (cmfile, 1);
 426               continue;
 427
 428             case tok_escape_char:
 429             case tok_comment_char:
 430               if (arg->tok != tok_ident)
 431                 goto badarg;
 432
 433               if (arg->val.str.lenmb != 1)
 434                 {
 435                   lr_error (cmfile, _("\
 436 argument to <%s> must be a single character"),
 437                             nowtok == tok_escape_char ? "escape_char"
 438                                                       : "comment_char");
 439
 440                   lr_ignore_rest (cmfile, 0);
 441                   continue;
 442                 }
 443
 444               if (nowtok == tok_escape_char)
 445                 cmfile->escape_char = *arg->val.str.startmb;
 446               else
 447                 cmfile->comment_char = *arg->val.str.startmb;
 448
 449               lr_ignore_rest (cmfile, 1);
 450               continue;
 451
 452             case tok_g0esc:
 453             case tok_g1esc:
 454             case tok_g2esc:
 455             case tok_g3esc:
 456             case tok_escseq:
 457               lr_ignore_rest (cmfile, 0); /* XXX */
 458               continue;
 459
 460             case tok_include:
 461               lr_error (cmfile, _("\
 462 character sets with locking states are not supported"));
 463               exit (4);
 464
 465             default:
 466               /* Cannot happen.  */
 467               assert (! "Should not happen");
 468             }
 469           break;
 470
 471         case 2:
 472           /* We have seen `CHARMAP' and now are in the body.  Each line
 473              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 474           if (nowtok == tok_eol)
 475             /* Ignore empty lines.  */
 476             continue;
 477
 478           if (nowtok == tok_end)
 479             {
 480               expected_tok = tok_charmap;
 481               expected_str = "CHARMAP";
 482               state = 90;
 483               continue;
 484             }
 485
 486           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 487             {
 488               lr_error (cmfile, _("syntax error in %s definition: %s"),
 489                         "CHARMAP", _("no symbolic name given"));
 490
 491               lr_ignore_rest (cmfile, 0);
 492               continue;
 493             }
 494
 495           /* If the previous line was not completely correct free the
 496              used memory.  */
 497           if (from_name != NULL)
 498             obstack_free (&result->mem_pool, from_name);
 499
 500           if (nowtok == tok_bsymbol)
 501             from_name = (char *) obstack_copy0 (&result->mem_pool,
 502                                                 now->val.str.startmb,
 503                                                 now->val.str.lenmb);
 504           else
 505             {
 506               obstack_printf (&result->mem_pool, "U%08X",
 507                               cmfile->token.val.ucs4);
 508               obstack_1grow (&result->mem_pool, '\0');
 509               from_name = (char *) obstack_finish (&result->mem_pool);
 510             }
 511           to_name = NULL;
 512
 513           state = 3;
 514           continue;
 515
 516         case 3:
 517           /* We have two possibilities: We can see an ellipsis or an
 518              encoding value.  */
 519           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 520               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 521               || nowtok == tok_ellipsis2_2)
 522             {
 523               ellipsis = nowtok;
 524               if (nowtok == tok_ellipsis4_2)
 525                 {
 526                   step = 2;
 527                   nowtok = tok_ellipsis4;
 528                 }
 529               else if (nowtok == tok_ellipsis2_2)
 530                 {
 531                   step = 2;
 532                   nowtok = tok_ellipsis2;
 533                 }
 534               state = 4;
 535               continue;
 536             }
 537           /* FALLTHROUGH */
 538
 539         case 5:
 540           if (nowtok != tok_charcode)
 541             {
 542               lr_error (cmfile, _("syntax error in %s definition: %s"),
 543                         "CHARMAP", _("invalid encoding given"));
 544
 545               lr_ignore_rest (cmfile, 0);
 546
 547               state = 2;
 548               continue;
 549             }
 550
 551           if (now->val.charcode.nbytes < result->mb_cur_min)
 552             lr_error (cmfile, _("too few bytes in character encoding"));
 553           else if (now->val.charcode.nbytes > result->mb_cur_max)
 554             lr_error (cmfile, _("too many bytes in character encoding"));
 555           else
 556             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 557                               now->val.charcode.bytes, from_name, to_name,
 558                               ellipsis != tok_ellipsis2, step);
 559
 560           /* Ignore trailing comment silently.  */
 561           lr_ignore_rest (cmfile, 0);
 562
 563           from_name = NULL;
 564           to_name = NULL;
 565           ellipsis = tok_none;
 566           step = 1;
 567
 568           state = 2;
 569           continue;
 570
 571         case 4:
 572           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 573             {
 574               lr_error (cmfile, _("syntax error in %s definition: %s"),
 575                         "CHARMAP",
 576                         _("no symbolic name given for end of range"));
 577
 578               lr_ignore_rest (cmfile, 0);
 579               continue;
 580             }
 581
 582           /* Copy the to-name in a safe place.  */
 583           if (nowtok == tok_bsymbol)
 584             to_name = (char *) obstack_copy0 (&result->mem_pool,
 585                                               cmfile->token.val.str.startmb,
 586                                               cmfile->token.val.str.lenmb);
 587           else
 588             {
 589               obstack_printf (&result->mem_pool, "U%08X",
 590                               cmfile->token.val.ucs4);
 591               obstack_1grow (&result->mem_pool, '\0');
 592               to_name = (char *) obstack_finish (&result->mem_pool);
 593             }
 594
 595           state = 5;
 596           continue;
 597
 598         case 90:
 599           if (nowtok != expected_tok)
 600             lr_error (cmfile, _("\
 601 `%1$s' definition does not end with `END %1$s'"), expected_str);
 602
 603           lr_ignore_rest (cmfile, nowtok == expected_tok);
 604           state = 91;
 605           continue;
 606
 607         case 91:
 608           /* Waiting for WIDTH... */
 609           if (nowtok == tok_eol)
 610             /* Ignore empty lines.  */
 611             continue;
 612
 613           if (nowtok == tok_width_default)
 614             {
 615               state = 92;
 616               continue;
 617             }
 618
 619           if (nowtok == tok_width)
 620             {
 621               lr_ignore_rest (cmfile, 1);
 622               state = 93;
 623               continue;
 624             }
 625
 626           if (nowtok == tok_width_variable)
 627             {
 628               lr_ignore_rest (cmfile, 1);
 629               state = 98;
 630               continue;
 631             }
 632
 633           lr_error (cmfile, _("\
 634 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 635
 636           lr_ignore_rest (cmfile, 0);
 637           continue;
 638
 639         case 92:
 640           if (nowtok != tok_number)
 641             lr_error (cmfile, _("value for %s must be an integer"),
 642                       "WIDTH_DEFAULT");
 643           else
 644             result->width_default = now->val.num;
 645
 646           lr_ignore_rest (cmfile, nowtok == tok_number);
 647
 648           state = 91;
 649           continue;
 650
 651         case 93:
 652           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 653              "%s...%s %d\n".  */
 654           if (nowtok == tok_eol)
 655             /* ignore empty lines.  */
 656             continue;
 657
 658           if (nowtok == tok_end)
 659             {
 660               expected_tok = tok_width;
 661               expected_str = "WIDTH";
 662               state = 90;
 663               continue;
 664             }
 665
 666           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 667             {
 668               lr_error (cmfile, _("syntax error in %s definition: %s"),
 669                         "WIDTH", _("no symbolic name given"));
 670
 671               lr_ignore_rest (cmfile, 0);
 672               continue;
 673             }
 674
 675           if (from_name != NULL)
 676             obstack_free (&result->mem_pool, from_name);
 677
 678           if (nowtok == tok_bsymbol)
 679             from_name = (char *) obstack_copy0 (&result->mem_pool,
 680                                                 now->val.str.startmb,
 681                                                 now->val.str.lenmb);
 682           else
 683             {
 684               obstack_printf (&result->mem_pool, "U%08X",
 685                               cmfile->token.val.ucs4);
 686               obstack_1grow (&result->mem_pool, '\0');
 687               from_name = (char *) obstack_finish (&result->mem_pool);
 688             }
 689
 690           to_name = NULL;
 691
 692           state = 94;
 693           continue;
 694
 695         case 94:
 696           if (nowtok == tok_ellipsis3)
 697             {
 698               state = 95;
 699               continue;
 700             }
 701
 702         case 96:
 703           if (nowtok != tok_number)
 704             lr_error (cmfile, _("value for %s must be an integer"),
 705                       "WIDTH");
 706           else
 707             {
 708               /* Store width for chars.  */
 709               new_width (cmfile, result, from_name, to_name, now->val.num);
 710
 711               from_name = NULL;
 712               to_name = NULL;
 713             }
 714
 715           lr_ignore_rest (cmfile, nowtok == tok_number);
 716
 717           state = 93;
 718           continue;
 719
 720         case 95:
 721           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 722             {
 723               lr_error (cmfile, _("syntax error in %s definition: %s"),
 724                         "WIDTH", _("no symbolic name given for end of range"));
 725
 726               lr_ignore_rest (cmfile, 0);
 727
 728               state = 93;
 729               continue;
 730             }
 731
 732           if (nowtok == tok_bsymbol)
 733             to_name = (char *) obstack_copy0 (&result->mem_pool,
 734                                               now->val.str.startmb,
 735                                               now->val.str.lenmb);
 736           else
 737             {
 738               obstack_printf (&result->mem_pool, "U%08X",
 739                               cmfile->token.val.ucs4);
 740               obstack_1grow (&result->mem_pool, '\0');
 741               to_name = (char *) obstack_finish (&result->mem_pool);
 742             }
 743
 744           state = 96;
 745           continue;
 746
 747         case 98:
 748           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 749              "%s\n" or "%s...%s\n".  */
 750           if (nowtok == tok_eol)
 751             /* ignore empty lines.  */
 752             continue;
 753
 754           if (nowtok == tok_end)
 755             {
 756               expected_tok = tok_width_variable;
 757               expected_str = "WIDTH_VARIABLE";
 758               state = 90;
 759               continue;
 760             }
 761
 762           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 763             {
 764               lr_error (cmfile, _("syntax error in %s definition: %s"),
 765                         "WIDTH_VARIABLE", _("no symbolic name given"));
 766
 767               lr_ignore_rest (cmfile, 0);
 768
 769               continue;
 770             }
 771
 772           if (from_name != NULL)
 773             obstack_free (&result->mem_pool, from_name);
 774
 775           if (nowtok == tok_bsymbol)
 776             from_name = (char *) obstack_copy0 (&result->mem_pool,
 777                                                 now->val.str.startmb,
 778                                                 now->val.str.lenmb);
 779           else
 780             {
 781               obstack_printf (&result->mem_pool, "U%08X",
 782                               cmfile->token.val.ucs4);
 783               obstack_1grow (&result->mem_pool, '\0');
 784               from_name = (char *) obstack_finish (&result->mem_pool);
 785             }
 786           to_name = NULL;
 787
 788           state = 99;
 789           continue;
 790
 791         case 99:
 792           if (nowtok == tok_ellipsis3)
 793             state = 100;
 794
 795           /* Store info.  */
 796           from_name = NULL;
 797
 798           /* Warn */
 799           state = 98;
 800           continue;
 801
 802         case 100:
 803           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 804             {
 805               lr_error (cmfile, _("syntax error in %s definition: %s"),
 806                         "WIDTH_VARIABLE",
 807                         _("no symbolic name given for end of range"));
 808               lr_ignore_rest (cmfile, 0);
 809               continue;
 810             }
 811
 812           if (nowtok == tok_bsymbol)
 813             to_name = (char *) obstack_copy0 (&result->mem_pool,
 814                                               now->val.str.startmb,
 815                                               now->val.str.lenmb);
 816           else
 817             {
 818               obstack_printf (&result->mem_pool, "U%08X",
 819                               cmfile->token.val.ucs4);
 820               obstack_1grow (&result->mem_pool, '\0');
 821               to_name = (char *) obstack_finish (&result->mem_pool);
 822             }
 823
 824           /* XXX Enter value into table.  */
 825
 826           lr_ignore_rest (cmfile, 1);
 827
 828           state = 98;
 829           continue;
 830
 831         default:
 832           error (5, 0, _("%s: error in state machine"), __FILE__);
 833           /* NOTREACHED */
 834         }
 835       break;
 836     }
 837
 838   if (state != 91 && !be_quiet)
 839     error (0, 0, _("%s: premature end of file"), cmfile->fname);
 840
 841   lr_close (cmfile);
 842
 843   return result;
 844 }
 845
 846
 847 static void
 848 new_width (struct linereader *cmfile, struct charmap_t *result,
 849            const char *from, const char *to, unsigned long int width)
 850 {
 851   struct charseq *from_val;
 852   struct charseq *to_val;
 853
 854   from_val = charmap_find_value (result, from, strlen (from));
 855   if (from_val == NULL)
 856     {
 857       lr_error (cmfile, _("unknown character `%s'"), from);
 858       return;
 859     }
 860
 861   if (to == NULL)
 862     to_val = from_val;
 863   else
 864     {
 865       to_val = charmap_find_value (result, to, strlen (to));
 866       if (to_val == NULL)
 867         {
 868           lr_error (cmfile, _("unknown character `%s'"), to);
 869           return;
 870         }
 871     }
 872
 873   if (result->nwidth_rules >= result->nwidth_rules_max)
 874     {
 875       size_t new_size = result->nwidth_rules + 32;
 876       struct width_rule *new_rules =
 877         (struct width_rule *) obstack_alloc (&result->mem_pool,
 878                                              (new_size
 879                                               * sizeof (struct width_rule)));
 880
 881       memcpy (new_rules, result->width_rules,
 882               result->nwidth_rules_max * sizeof (struct width_rule));
 883
 884       result->width_rules = new_rules;
 885       result->nwidth_rules_max = new_size;
 886     }
 887
 888   result->width_rules[result->nwidth_rules].from = from_val;
 889   result->width_rules[result->nwidth_rules].to = to_val;
 890   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 891   ++result->nwidth_rules;
 892 }
 893
 894
 895 struct charseq *
 896 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 897 {
 898   void *result;
 899
 900   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 901           < 0 ? NULL : (struct charseq *) result);
 902 }
 903
 904
 905 static void
 906 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 907                   int nbytes, char *bytes, const char *from, const char *to,
 908                   int decimal_ellipsis, int step)
 909 {
 910   hash_table *ht = &cm->char_table;
 911   hash_table *bt = &cm->byte_table;
 912   struct obstack *ob = &cm->mem_pool;
 913   char *from_end;
 914   char *to_end;
 915   const char *cp;
 916   int prefix_len, len1, len2;
 917   unsigned int from_nr, to_nr, cnt;
 918   struct charseq *newp;
 919
 920   len1 = strlen (from);
 921
 922   if (to == NULL)
 923     {
 924       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 925       newp->nbytes = nbytes;
 926       memcpy (newp->bytes, bytes, nbytes);
 927       newp->name = from;
 928
 929       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 930       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 931         {
 932           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 933              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 934              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 935              this character and we don't have to consult the repertoire
 936              map.
 937
 938              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 939              and xxxxxxxx also give the code point in UCS4 but this must
 940              be in the private, i.e., unassigned, area.  This should be
 941              used for characters which do not (yet) have an equivalent
 942              in ISO 10646 and Unicode.  */
 943           char *endp;
 944
 945           errno = 0;
 946           newp->ucs4 = strtoul (from + 1, &endp, 16);
 947           if (endp - from != len1
 948               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
 949               || newp->ucs4 >= 0x80000000)
 950             /* This wasn't successful.  Signal this name cannot be a
 951                correct UCS value.  */
 952             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 953         }
 954
 955       insert_entry (ht, from, len1, newp);
 956       insert_entry (bt, newp->bytes, nbytes, newp);
 957       /* Please note that it isn't a bug if a symbol is defined more
 958          than once.  All later definitions are simply discarded.  */
 959       return;
 960     }
 961
 962   /* We have a range: the names must have names with equal prefixes
 963      and an equal number of digits, where the second number is greater
 964      or equal than the first.  */
 965   len2 = strlen (to);
 966
 967   if (len1 != len2)
 968     {
 969     illegal_range:
 970       lr_error (lr, _("invalid names for character range"));
 971       return;
 972     }
 973
 974   cp = &from[len1 - 1];
 975   if (decimal_ellipsis)
 976     while (isdigit (*cp) && cp >= from)
 977       --cp;
 978   else
 979     while (isxdigit (*cp) && cp >= from)
 980       {
 981         if (!isdigit (*cp) && !isupper (*cp))
 982           lr_error (lr, _("\
 983 hexadecimal range format should use only capital characters"));
 984         --cp;
 985       }
 986
 987   prefix_len = (cp - from) + 1;
 988
 989   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
 990     goto illegal_range;
 991
 992   errno = 0;
 993   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
 994   if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
 995       || ((to_nr = strtoul (&to[prefix_len], &to_end,
 996                             decimal_ellipsis ? 10 : 16)) == ULONG_MAX
 997           && errno == ERANGE)
 998       || *to_end != '\0')
 999     {
1000       lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1001       return;
1002     }
1003
1004   if (from_nr > to_nr)
1005     {
1006       lr_error (lr, _("upper limit in range is not higher then lower limit"));
1007       return;
1008     }
1009
1010   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1011     {
1012       char *name_end;
1013       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1014                       prefix_len, from, len1 - prefix_len, cnt);
1015       obstack_1grow (ob, '\0');
1016       name_end = obstack_finish (ob);
1017
1018       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1019       newp->nbytes = nbytes;
1020       memcpy (newp->bytes, bytes, nbytes);
1021       newp->name = name_end;
1022
1023       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1024       if ((name_end[0] == 'U' || name_end[0] == 'P')
1025           && (len1 == 5 || len1 == 9))
1026         {
1027           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1028              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1029              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1030              this character and we don't have to consult the repertoire
1031              map.
1032
1033              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1034              and xxxxxxxx also give the code point in UCS4 but this must
1035              be in the private, i.e., unassigned, area.  This should be
1036              used for characters which do not (yet) have an equivalent
1037              in ISO 10646 and Unicode.  */
1038           char *endp;
1039
1040           errno = 0;
1041           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1042           if (endp - name_end != len1
1043               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
1044               || newp->ucs4 >= 0x80000000)
1045             /* This wasn't successful.  Signal this name cannot be a
1046                correct UCS value.  */
1047             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1048         }
1049
1050       insert_entry (ht, name_end, len1, newp);
1051       insert_entry (bt, newp->bytes, nbytes, newp);
1052       /* Please note we don't examine the return value since it is no error
1053          if we have two definitions for a symbol.  */
1054
1055       /* Increment the value in the byte sequence.  */
1056       if (++bytes[nbytes - 1] == '\0')
1057         {
1058           int b = nbytes - 2;
1059
1060           do
1061             if (b < 0)
1062               {
1063                 lr_error (lr,
1064                           _("resulting bytes for range not representable."));
1065                 return;
1066               }
1067           while (++bytes[b--] == 0);
1068         }
1069     }
1070 }
1071
1072
1073 struct charseq *
1074 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1075                      size_t nbytes)
1076 {
1077   void *result;
1078
1079   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1080           < 0 ? NULL : (struct charseq *) result);
1081 }