locale/programs/charmap.c

   1 /* Copyright (C) 1996,1998,1999,2000,2001 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <limits.h>
  28 #include <obstack.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "error.h"
  34 #include "linereader.h"
  35 #include "charmap.h"
  36 #include "charmap-dir.h"
  37 #include "repertoire.h"
  38
  39 #include <assert.h>
  40
  41
  42 /* Define the lookup function.  */
  43 #include "charmap-kw.h"
  44
  45
  46 extern void *xmalloc (size_t __n);
  47
  48 /* Prototypes for local functions.  */
  49 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  50                                         int verbose, int be_quiet);
  51 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  52                        const char *from, const char *to,
  53                        unsigned long int width);
  54 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  55                               int nbytes, char *bytes, const char *from,
  56                               const char *to, int decimal_ellipsis, int step);
  57
  58
  59 #ifdef NEED_NULL_POINTER
  60 static const char *null_pointer;
  61 #endif
  62
  63 static struct linereader *
  64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  65 {
  66   FILE *fp;
  67
  68   fp = charmap_open (directory, name);
  69   if (fp == NULL)
  70     return NULL;
  71   else
  72     {
  73       size_t dlen = strlen (directory);
  74       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  75       size_t nlen = strlen (name);
  76       char *pathname;
  77       char *p;
  78
  79       pathname = alloca (dlen + add_slash + nlen + 1);
  80       p = stpcpy (pathname, directory);
  81       if (add_slash)
  82         *p++ = '/';
  83       stpcpy (p, name);
  84
  85       return lr_create (fp, pathname, hf);
  86     }
  87 }
  88
  89 struct charmap_t *
  90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   char path[strlen (i18npath) + sizeof ("/charmaps")];
 110                   char *next;
 111                   i18npath = strdupa (i18npath);
 112
 113                   while (cmfile == NULL
 114                          && (next = strsep (&i18npath, ":")) != NULL)
 115                     {
 116                       stpcpy (stpcpy (path, next), "/charmaps");
 117                       cmfile = cmlr_open (path, filename, charmap_hash);
 118
 119                       if (cmfile == NULL)
 120                         {
 121                           /* Try without the "/charmaps" part.  */
 122                           cmfile = cmlr_open (next, filename, charmap_hash);
 123                         }
 124                     }
 125                 }
 126
 127               if (cmfile == NULL)
 128                 {
 129                   /* Try the default directory.  */
 130                   cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 131                 }
 132             }
 133         }
 134
 135       if (cmfile != NULL)
 136         {
 137           result = parse_charmap (cmfile, verbose, be_quiet);
 138
 139           if (result == NULL && !be_quiet)
 140             error (0, errno, _("character map file `%s' not found"), filename);
 141         }
 142     }
 143
 144   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 145     {
 146       /* OK, one more try.  We also accept the names given to the
 147          character sets in the files.  Sometimes they differ from the
 148          file name.  */
 149       CHARMAP_DIR *dir;
 150
 151       dir = charmap_opendir (CHARMAP_PATH);
 152       if (dir != NULL)
 153         {
 154           const char *dirent;
 155
 156           while ((dirent = charmap_readdir (dir)) != NULL)
 157             {
 158               char **aliases;
 159               char **p;
 160               int found;
 161
 162               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 163               found = 0;
 164               for (p = aliases; *p; p++)
 165                 if (strcasecmp (*p, filename) == 0)
 166                   {
 167                     found = 1;
 168                     break;
 169                   }
 170               charmap_free_aliases (aliases);
 171
 172               if (found)
 173                 {
 174                   struct linereader *cmfile;
 175
 176                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 177                   if (cmfile != NULL)
 178                     result = parse_charmap (cmfile, verbose, be_quiet);
 179
 180                   break;
 181                 }
 182             }
 183
 184           charmap_closedir (dir);
 185         }
 186     }
 187
 188   if (result == NULL && DEFAULT_CHARMAP != NULL)
 189     {
 190       struct linereader *cmfile;
 191
 192       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 193       if (cmfile != NULL)
 194         result = parse_charmap (cmfile, verbose, be_quiet);
 195
 196       if (result == NULL)
 197         error (4, errno, _("default character map file `%s' not found"),
 198                DEFAULT_CHARMAP);
 199     }
 200
 201   /* Test of ASCII compatibility of locale encoding.
 202
 203      Verify that the encoding to be used in a locale is ASCII compatible,
 204      at least for the graphic characters, excluding the control characters,
 205      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 206
 207      ISO C 99 section 7.17.(2) (about wchar_t):
 208        the null character shall have the code value zero and each member of
 209        the basic character set shall have a code value equal to its value
 210        when used as the lone character in an integer character constant.
 211      ISO C 99 section 5.2.1.(3):
 212        Both the basic source and basic execution character sets shall have
 213        the following members: the 26 uppercase letters of the Latin alphabet
 214             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 215        the 26 lowercase letters of the Latin alphabet
 216             a b c d e f g h i j k l m n o p q r s t u v w x y z
 217        the 10 decimal digits
 218             0 1 2 3 4 5 6 7 8 9
 219        the following 29 graphic characters
 220             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 221        the space character, and control characters representing horizontal
 222        tab, vertical tab, and form feed.
 223
 224      Therefore, for all members of the "basic character set", the 'char' code
 225      must have the same value as the 'wchar_t' code, which in glibc is the
 226      same as the Unicode code, which for all of the enumerated characters
 227      is identical to the ASCII code. */
 228   if (result != NULL && use_default)
 229     {
 230       static const char basic_charset[] =
 231         {
 232           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 233           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 234           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 235           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 236           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 237           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 238           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 239           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 240         };
 241       int failed = 0;
 242       const char *p = basic_charset;
 243
 244       do
 245         {
 246           struct charseq * seq = charmap_find_symbol (result, p, 1);
 247
 248           if (seq == NULL || seq->ucs4 != *p)
 249             failed = 1;
 250         }
 251       while (*p++ != '\0');
 252
 253       if (failed)
 254         fprintf (stderr, _("\
 255 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 256                  result->code_set_name);
 257     }
 258
 259   return result;
 260 }
 261
 262
 263 static struct charmap_t *
 264 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 265 {
 266   struct charmap_t *result;
 267   int state;
 268   enum token_t expected_tok = tok_error;
 269   const char *expected_str = NULL;
 270   char *from_name = NULL;
 271   char *to_name = NULL;
 272   enum token_t ellipsis = 0;
 273   int step = 1;
 274
 275   /* We don't want symbolic names in string to be translated.  */
 276   cmfile->translate_strings = 0;
 277
 278   /* Allocate room for result.  */
 279   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 280   memset (result, '\0', sizeof (struct charmap_t));
 281   /* The default DEFAULT_WIDTH is 1.  */
 282   result->width_default = 1;
 283
 284 #define obstack_chunk_alloc malloc
 285 #define obstack_chunk_free free
 286   obstack_init (&result->mem_pool);
 287
 288   if (init_hash (&result->char_table, 256)
 289       || init_hash (&result->byte_table, 256))
 290     {
 291       free (result);
 292       return NULL;
 293     }
 294
 295   /* We use a state machine to describe the charmap description file
 296      format.  */
 297   state = 1;
 298   while (1)
 299     {
 300       /* What's on?  */
 301       struct token *now = lr_token (cmfile, NULL, NULL, verbose);
 302       enum token_t nowtok = now->tok;
 303       struct token *arg;
 304
 305       if (nowtok == tok_eof)
 306         break;
 307
 308       switch (state)
 309         {
 310         case 1:
 311           /* The beginning.  We expect the special declarations, EOL or
 312              `CHARMAP'.  */
 313           if (nowtok == tok_eol)
 314             /* Ignore empty lines.  */
 315             continue;
 316
 317           if (nowtok == tok_charmap)
 318             {
 319               from_name = NULL;
 320               to_name = NULL;
 321
 322               /* We have to set up the real work.  Fill in some
 323                  default values.  */
 324               if (result->mb_cur_max == 0)
 325                 result->mb_cur_max = 1;
 326               if (result->mb_cur_min == 0)
 327                 result->mb_cur_min = result->mb_cur_max;
 328               if (result->mb_cur_min > result->mb_cur_max)
 329                 {
 330                   if (!be_quiet)
 331                     error (0, 0, _("\
 332 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 333                            cmfile->fname);
 334
 335                   result->mb_cur_min = result->mb_cur_max;
 336                 }
 337
 338               lr_ignore_rest (cmfile, 1);
 339
 340               state = 2;
 341               continue;
 342             }
 343
 344           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 345               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 346               && nowtok != tok_comment_char && nowtok != tok_g0esc
 347               && nowtok != tok_g1esc && nowtok != tok_g2esc
 348               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 349               && nowtok != tok_include)
 350             {
 351               lr_error (cmfile, _("syntax error in prolog: %s"),
 352                         _("invalid definition"));
 353
 354               lr_ignore_rest (cmfile, 0);
 355               continue;
 356             }
 357
 358           /* We know that we need an argument.  */
 359           arg = lr_token (cmfile, NULL, NULL, verbose);
 360
 361           switch (nowtok)
 362             {
 363             case tok_code_set_name:
 364             case tok_repertoiremap:
 365               if (arg->tok != tok_ident && arg->tok != tok_string)
 366                 {
 367                 badarg:
 368                   lr_error (cmfile, _("syntax error in prolog: %s"),
 369                             _("bad argument"));
 370
 371                   lr_ignore_rest (cmfile, 0);
 372                   continue;
 373                 }
 374
 375               if (nowtok == tok_code_set_name)
 376                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 377                                                        arg->val.str.startmb,
 378                                                        arg->val.str.lenmb);
 379               else
 380                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 381                                                        arg->val.str.startmb,
 382                                                        arg->val.str.lenmb);
 383
 384               lr_ignore_rest (cmfile, 1);
 385               continue;
 386
 387             case tok_mb_cur_max:
 388             case tok_mb_cur_min:
 389               if (arg->tok != tok_number)
 390                 goto badarg;
 391
 392               if (verbose
 393                   && ((nowtok == tok_mb_cur_max
 394                        && result->mb_cur_max != 0)
 395                       || (nowtok == tok_mb_cur_max
 396                           && result->mb_cur_max != 0)))
 397                 lr_error (cmfile, _("duplicate definition of <%s>"),
 398                           nowtok == tok_mb_cur_min
 399                           ? "mb_cur_min" : "mb_cur_max");
 400
 401               if (arg->val.num < 1)
 402                 {
 403                   lr_error (cmfile,
 404                             _("value for <%s> must be 1 or greater"),
 405                             nowtok == tok_mb_cur_min
 406                             ? "mb_cur_min" : "mb_cur_max");
 407
 408                   lr_ignore_rest (cmfile, 0);
 409                   continue;
 410                 }
 411               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 412                    && (int) arg->val.num < result->mb_cur_min)
 413                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 414                       && (int) arg->val.num > result->mb_cur_max))
 415                 {
 416                   lr_error (cmfile, _("\
 417 value of <%s> must be greater or equal than the value of <%s>"),
 418                             "mb_cur_max", "mb_cur_min");
 419
 420                   lr_ignore_rest (cmfile, 0);
 421                   continue;
 422                 }
 423
 424               if (nowtok == tok_mb_cur_max)
 425                 result->mb_cur_max = arg->val.num;
 426               else
 427                 result->mb_cur_min = arg->val.num;
 428
 429               lr_ignore_rest (cmfile, 1);
 430               continue;
 431
 432             case tok_escape_char:
 433             case tok_comment_char:
 434               if (arg->tok != tok_ident)
 435                 goto badarg;
 436
 437               if (arg->val.str.lenmb != 1)
 438                 {
 439                   lr_error (cmfile, _("\
 440 argument to <%s> must be a single character"),
 441                             nowtok == tok_escape_char ? "escape_char"
 442                                                       : "comment_char");
 443
 444                   lr_ignore_rest (cmfile, 0);
 445                   continue;
 446                 }
 447
 448               if (nowtok == tok_escape_char)
 449                 cmfile->escape_char = *arg->val.str.startmb;
 450               else
 451                 cmfile->comment_char = *arg->val.str.startmb;
 452
 453               lr_ignore_rest (cmfile, 1);
 454               continue;
 455
 456             case tok_g0esc:
 457             case tok_g1esc:
 458             case tok_g2esc:
 459             case tok_g3esc:
 460             case tok_escseq:
 461               lr_ignore_rest (cmfile, 0); /* XXX */
 462               continue;
 463
 464             case tok_include:
 465               lr_error (cmfile, _("\
 466 character sets with locking states are not supported"));
 467               exit (4);
 468
 469             default:
 470               /* Cannot happen.  */
 471               assert (! "Should not happen");
 472             }
 473           break;
 474
 475         case 2:
 476           /* We have seen `CHARMAP' and now are in the body.  Each line
 477              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 478           if (nowtok == tok_eol)
 479             /* Ignore empty lines.  */
 480             continue;
 481
 482           if (nowtok == tok_end)
 483             {
 484               expected_tok = tok_charmap;
 485               expected_str = "CHARMAP";
 486               state = 90;
 487               continue;
 488             }
 489
 490           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 491             {
 492               lr_error (cmfile, _("syntax error in %s definition: %s"),
 493                         "CHARMAP", _("no symbolic name given"));
 494
 495               lr_ignore_rest (cmfile, 0);
 496               continue;
 497             }
 498
 499           /* If the previous line was not completely correct free the
 500              used memory.  */
 501           if (from_name != NULL)
 502             obstack_free (&result->mem_pool, from_name);
 503
 504           if (nowtok == tok_bsymbol)
 505             from_name = (char *) obstack_copy0 (&result->mem_pool,
 506                                                 now->val.str.startmb,
 507                                                 now->val.str.lenmb);
 508           else
 509             {
 510               obstack_printf (&result->mem_pool, "U%08X",
 511                               cmfile->token.val.ucs4);
 512               obstack_1grow (&result->mem_pool, '\0');
 513               from_name = (char *) obstack_finish (&result->mem_pool);
 514             }
 515           to_name = NULL;
 516
 517           state = 3;
 518           continue;
 519
 520         case 3:
 521           /* We have two possibilities: We can see an ellipsis or an
 522              encoding value.  */
 523           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 524               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 525               || nowtok == tok_ellipsis2_2)
 526             {
 527               ellipsis = nowtok;
 528               if (nowtok == tok_ellipsis4_2)
 529                 {
 530                   step = 2;
 531                   nowtok = tok_ellipsis4;
 532                 }
 533               else if (nowtok == tok_ellipsis2_2)
 534                 {
 535                   step = 2;
 536                   nowtok = tok_ellipsis2;
 537                 }
 538               state = 4;
 539               continue;
 540             }
 541           /* FALLTHROUGH */
 542
 543         case 5:
 544           if (nowtok != tok_charcode)
 545             {
 546               lr_error (cmfile, _("syntax error in %s definition: %s"),
 547                         "CHARMAP", _("invalid encoding given"));
 548
 549               lr_ignore_rest (cmfile, 0);
 550
 551               state = 2;
 552               continue;
 553             }
 554
 555           if (now->val.charcode.nbytes < result->mb_cur_min)
 556             lr_error (cmfile, _("too few bytes in character encoding"));
 557           else if (now->val.charcode.nbytes > result->mb_cur_max)
 558             lr_error (cmfile, _("too many bytes in character encoding"));
 559           else
 560             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 561                               now->val.charcode.bytes, from_name, to_name,
 562                               ellipsis != tok_ellipsis2, step);
 563
 564           /* Ignore trailing comment silently.  */
 565           lr_ignore_rest (cmfile, 0);
 566
 567           from_name = NULL;
 568           to_name = NULL;
 569           ellipsis = tok_none;
 570           step = 1;
 571
 572           state = 2;
 573           continue;
 574
 575         case 4:
 576           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 577             {
 578               lr_error (cmfile, _("syntax error in %s definition: %s"),
 579                         "CHARMAP",
 580                         _("no symbolic name given for end of range"));
 581
 582               lr_ignore_rest (cmfile, 0);
 583               continue;
 584             }
 585
 586           /* Copy the to-name in a safe place.  */
 587           if (nowtok == tok_bsymbol)
 588             to_name = (char *) obstack_copy0 (&result->mem_pool,
 589                                               cmfile->token.val.str.startmb,
 590                                               cmfile->token.val.str.lenmb);
 591           else
 592             {
 593               obstack_printf (&result->mem_pool, "U%08X",
 594                               cmfile->token.val.ucs4);
 595               obstack_1grow (&result->mem_pool, '\0');
 596               to_name = (char *) obstack_finish (&result->mem_pool);
 597             }
 598
 599           state = 5;
 600           continue;
 601
 602         case 90:
 603           if (nowtok != expected_tok)
 604             lr_error (cmfile, _("\
 605 `%1$s' definition does not end with `END %1$s'"), expected_str);
 606
 607           lr_ignore_rest (cmfile, nowtok == expected_tok);
 608           state = 91;
 609           continue;
 610
 611         case 91:
 612           /* Waiting for WIDTH... */
 613           if (nowtok == tok_eol)
 614             /* Ignore empty lines.  */
 615             continue;
 616
 617           if (nowtok == tok_width_default)
 618             {
 619               state = 92;
 620               continue;
 621             }
 622
 623           if (nowtok == tok_width)
 624             {
 625               lr_ignore_rest (cmfile, 1);
 626               state = 93;
 627               continue;
 628             }
 629
 630           if (nowtok == tok_width_variable)
 631             {
 632               lr_ignore_rest (cmfile, 1);
 633               state = 98;
 634               continue;
 635             }
 636
 637           lr_error (cmfile, _("\
 638 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 639
 640           lr_ignore_rest (cmfile, 0);
 641           continue;
 642
 643         case 92:
 644           if (nowtok != tok_number)
 645             lr_error (cmfile, _("value for %s must be an integer"),
 646                       "WIDTH_DEFAULT");
 647           else
 648             result->width_default = now->val.num;
 649
 650           lr_ignore_rest (cmfile, nowtok == tok_number);
 651
 652           state = 91;
 653           continue;
 654
 655         case 93:
 656           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 657              "%s...%s %d\n".  */
 658           if (nowtok == tok_eol)
 659             /* ignore empty lines.  */
 660             continue;
 661
 662           if (nowtok == tok_end)
 663             {
 664               expected_tok = tok_width;
 665               expected_str = "WIDTH";
 666               state = 90;
 667               continue;
 668             }
 669
 670           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 671             {
 672               lr_error (cmfile, _("syntax error in %s definition: %s"),
 673                         "WIDTH", _("no symbolic name given"));
 674
 675               lr_ignore_rest (cmfile, 0);
 676               continue;
 677             }
 678
 679           if (from_name != NULL)
 680             obstack_free (&result->mem_pool, from_name);
 681
 682           if (nowtok == tok_bsymbol)
 683             from_name = (char *) obstack_copy0 (&result->mem_pool,
 684                                                 now->val.str.startmb,
 685                                                 now->val.str.lenmb);
 686           else
 687             {
 688               obstack_printf (&result->mem_pool, "U%08X",
 689                               cmfile->token.val.ucs4);
 690               obstack_1grow (&result->mem_pool, '\0');
 691               from_name = (char *) obstack_finish (&result->mem_pool);
 692             }
 693
 694           to_name = NULL;
 695
 696           state = 94;
 697           continue;
 698
 699         case 94:
 700           if (nowtok == tok_ellipsis3)
 701             {
 702               state = 95;
 703               continue;
 704             }
 705
 706         case 96:
 707           if (nowtok != tok_number)
 708             lr_error (cmfile, _("value for %s must be an integer"),
 709                       "WIDTH");
 710           else
 711             {
 712               /* Store width for chars.  */
 713               new_width (cmfile, result, from_name, to_name, now->val.num);
 714
 715               from_name = NULL;
 716               to_name = NULL;
 717             }
 718
 719           lr_ignore_rest (cmfile, nowtok == tok_number);
 720
 721           state = 93;
 722           continue;
 723
 724         case 95:
 725           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 726             {
 727               lr_error (cmfile, _("syntax error in %s definition: %s"),
 728                         "WIDTH", _("no symbolic name given for end of range"));
 729
 730               lr_ignore_rest (cmfile, 0);
 731
 732               state = 93;
 733               continue;
 734             }
 735
 736           if (nowtok == tok_bsymbol)
 737             to_name = (char *) obstack_copy0 (&result->mem_pool,
 738                                               now->val.str.startmb,
 739                                               now->val.str.lenmb);
 740           else
 741             {
 742               obstack_printf (&result->mem_pool, "U%08X",
 743                               cmfile->token.val.ucs4);
 744               obstack_1grow (&result->mem_pool, '\0');
 745               to_name = (char *) obstack_finish (&result->mem_pool);
 746             }
 747
 748           state = 96;
 749           continue;
 750
 751         case 98:
 752           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 753              "%s\n" or "%s...%s\n".  */
 754           if (nowtok == tok_eol)
 755             /* ignore empty lines.  */
 756             continue;
 757
 758           if (nowtok == tok_end)
 759             {
 760               expected_tok = tok_width_variable;
 761               expected_str = "WIDTH_VARIABLE";
 762               state = 90;
 763               continue;
 764             }
 765
 766           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 767             {
 768               lr_error (cmfile, _("syntax error in %s definition: %s"),
 769                         "WIDTH_VARIABLE", _("no symbolic name given"));
 770
 771               lr_ignore_rest (cmfile, 0);
 772
 773               continue;
 774             }
 775
 776           if (from_name != NULL)
 777             obstack_free (&result->mem_pool, from_name);
 778
 779           if (nowtok == tok_bsymbol)
 780             from_name = (char *) obstack_copy0 (&result->mem_pool,
 781                                                 now->val.str.startmb,
 782                                                 now->val.str.lenmb);
 783           else
 784             {
 785               obstack_printf (&result->mem_pool, "U%08X",
 786                               cmfile->token.val.ucs4);
 787               obstack_1grow (&result->mem_pool, '\0');
 788               from_name = (char *) obstack_finish (&result->mem_pool);
 789             }
 790           to_name = NULL;
 791
 792           state = 99;
 793           continue;
 794
 795         case 99:
 796           if (nowtok == tok_ellipsis3)
 797             state = 100;
 798
 799           /* Store info.  */
 800           from_name = NULL;
 801
 802           /* Warn */
 803           state = 98;
 804           continue;
 805
 806         case 100:
 807           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 808             {
 809               lr_error (cmfile, _("syntax error in %s definition: %s"),
 810                         "WIDTH_VARIABLE",
 811                         _("no symbolic name given for end of range"));
 812               lr_ignore_rest (cmfile, 0);
 813               continue;
 814             }
 815
 816           if (nowtok == tok_bsymbol)
 817             to_name = (char *) obstack_copy0 (&result->mem_pool,
 818                                               now->val.str.startmb,
 819                                               now->val.str.lenmb);
 820           else
 821             {
 822               obstack_printf (&result->mem_pool, "U%08X",
 823                               cmfile->token.val.ucs4);
 824               obstack_1grow (&result->mem_pool, '\0');
 825               to_name = (char *) obstack_finish (&result->mem_pool);
 826             }
 827
 828           /* XXX Enter value into table.  */
 829
 830           lr_ignore_rest (cmfile, 1);
 831
 832           state = 98;
 833           continue;
 834
 835         default:
 836           error (5, 0, _("%s: error in state machine"), __FILE__);
 837           /* NOTREACHED */
 838         }
 839       break;
 840     }
 841
 842   if (state != 91 && !be_quiet)
 843     error (0, 0, _("%s: premature end of file"), cmfile->fname);
 844
 845   lr_close (cmfile);
 846
 847   return result;
 848 }
 849
 850
 851 static void
 852 new_width (struct linereader *cmfile, struct charmap_t *result,
 853            const char *from, const char *to, unsigned long int width)
 854 {
 855   struct charseq *from_val;
 856   struct charseq *to_val;
 857
 858   from_val = charmap_find_value (result, from, strlen (from));
 859   if (from_val == NULL)
 860     {
 861       lr_error (cmfile, _("unknown character `%s'"), from);
 862       return;
 863     }
 864
 865   if (to == NULL)
 866     to_val = from_val;
 867   else
 868     {
 869       to_val = charmap_find_value (result, to, strlen (to));
 870       if (to_val == NULL)
 871         {
 872           lr_error (cmfile, _("unknown character `%s'"), to);
 873           return;
 874         }
 875     }
 876
 877   if (result->nwidth_rules >= result->nwidth_rules_max)
 878     {
 879       size_t new_size = result->nwidth_rules + 32;
 880       struct width_rule *new_rules =
 881         (struct width_rule *) obstack_alloc (&result->mem_pool,
 882                                              (new_size
 883                                               * sizeof (struct width_rule)));
 884
 885       memcpy (new_rules, result->width_rules,
 886               result->nwidth_rules_max * sizeof (struct width_rule));
 887
 888       result->width_rules = new_rules;
 889       result->nwidth_rules_max = new_size;
 890     }
 891
 892   result->width_rules[result->nwidth_rules].from = from_val;
 893   result->width_rules[result->nwidth_rules].to = to_val;
 894   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 895   ++result->nwidth_rules;
 896 }
 897
 898
 899 struct charseq *
 900 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 901 {
 902   void *result;
 903
 904   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 905           < 0 ? NULL : (struct charseq *) result);
 906 }
 907
 908
 909 static void
 910 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 911                   int nbytes, char *bytes, const char *from, const char *to,
 912                   int decimal_ellipsis, int step)
 913 {
 914   hash_table *ht = &cm->char_table;
 915   hash_table *bt = &cm->byte_table;
 916   struct obstack *ob = &cm->mem_pool;
 917   char *from_end;
 918   char *to_end;
 919   const char *cp;
 920   int prefix_len, len1, len2;
 921   unsigned int from_nr, to_nr, cnt;
 922   struct charseq *newp;
 923
 924   len1 = strlen (from);
 925
 926   if (to == NULL)
 927     {
 928       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 929       newp->nbytes = nbytes;
 930       memcpy (newp->bytes, bytes, nbytes);
 931       newp->name = from;
 932
 933       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 934       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 935         {
 936           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 937              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 938              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 939              this character and we don't have to consult the repertoire
 940              map.
 941
 942              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 943              and xxxxxxxx also give the code point in UCS4 but this must
 944              be in the private, i.e., unassigned, area.  This should be
 945              used for characters which do not (yet) have an equivalent
 946              in ISO 10646 and Unicode.  */
 947           char *endp;
 948
 949           errno = 0;
 950           newp->ucs4 = strtoul (from + 1, &endp, 16);
 951           if (endp - from != len1
 952               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
 953               || newp->ucs4 >= 0x80000000)
 954             /* This wasn't successful.  Signal this name cannot be a
 955                correct UCS value.  */
 956             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 957         }
 958
 959       insert_entry (ht, from, len1, newp);
 960       insert_entry (bt, newp->bytes, nbytes, newp);
 961       /* Please note that it isn't a bug if a symbol is defined more
 962          than once.  All later definitions are simply discarded.  */
 963       return;
 964     }
 965
 966   /* We have a range: the names must have names with equal prefixes
 967      and an equal number of digits, where the second number is greater
 968      or equal than the first.  */
 969   len2 = strlen (to);
 970
 971   if (len1 != len2)
 972     {
 973     illegal_range:
 974       lr_error (lr, _("invalid names for character range"));
 975       return;
 976     }
 977
 978   cp = &from[len1 - 1];
 979   if (decimal_ellipsis)
 980     while (isdigit (*cp) && cp >= from)
 981       --cp;
 982   else
 983     while (isxdigit (*cp) && cp >= from)
 984       {
 985         if (!isdigit (*cp) && !isupper (*cp))
 986           lr_error (lr, _("\
 987 hexadecimal range format should use only capital characters"));
 988         --cp;
 989       }
 990
 991   prefix_len = (cp - from) + 1;
 992
 993   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
 994     goto illegal_range;
 995
 996   errno = 0;
 997   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
 998   if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
 999       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1000                             decimal_ellipsis ? 10 : 16)) == ULONG_MAX
1001           && errno == ERANGE)
1002       || *to_end != '\0')
1003     {
1004       lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1005       return;
1006     }
1007
1008   if (from_nr > to_nr)
1009     {
1010       lr_error (lr, _("upper limit in range is not higher then lower limit"));
1011       return;
1012     }
1013
1014   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1015     {
1016       char *name_end;
1017       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1018                       prefix_len, from, len1 - prefix_len, cnt);
1019       obstack_1grow (ob, '\0');
1020       name_end = obstack_finish (ob);
1021
1022       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1023       newp->nbytes = nbytes;
1024       memcpy (newp->bytes, bytes, nbytes);
1025       newp->name = name_end;
1026
1027       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1028       if ((name_end[0] == 'U' || name_end[0] == 'P')
1029           && (len1 == 5 || len1 == 9))
1030         {
1031           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1032              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1033              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1034              this character and we don't have to consult the repertoire
1035              map.
1036
1037              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1038              and xxxxxxxx also give the code point in UCS4 but this must
1039              be in the private, i.e., unassigned, area.  This should be
1040              used for characters which do not (yet) have an equivalent
1041              in ISO 10646 and Unicode.  */
1042           char *endp;
1043
1044           errno = 0;
1045           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1046           if (endp - name_end != len1
1047               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
1048               || newp->ucs4 >= 0x80000000)
1049             /* This wasn't successful.  Signal this name cannot be a
1050                correct UCS value.  */
1051             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052         }
1053
1054       insert_entry (ht, name_end, len1, newp);
1055       insert_entry (bt, newp->bytes, nbytes, newp);
1056       /* Please note we don't examine the return value since it is no error
1057          if we have two definitions for a symbol.  */
1058
1059       /* Increment the value in the byte sequence.  */
1060       if (++bytes[nbytes - 1] == '\0')
1061         {
1062           int b = nbytes - 2;
1063
1064           do
1065             if (b < 0)
1066               {
1067                 lr_error (lr,
1068                           _("resulting bytes for range not representable."));
1069                 return;
1070               }
1071           while (++bytes[b--] == 0);
1072         }
1073     }
1074 }
1075
1076
1077 struct charseq *
1078 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1079                      size_t nbytes)
1080 {
1081   void *result;
1082
1083   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1084           < 0 ? NULL : (struct charseq *) result);
1085 }