locale/programs/charmap.c

   1 /* Copyright (C) 1996, 1998-2004,2005, 2006 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License version 2 as
   7    published by the Free Software Foundation.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <libintl.h>
  25 #include <limits.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <error.h>
  30
  31 #include "localedef.h"
  32 #include "linereader.h"
  33 #include "charmap.h"
  34 #include "charmap-dir.h"
  35
  36 #include <assert.h>
  37
  38
  39 /* Define the lookup function.  */
  40 #include "charmap-kw.h"
  41
  42
  43 /* Prototypes for local functions.  */
  44 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  45                                         int verbose, int be_quiet);
  46 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  47                        const char *from, const char *to,
  48                        unsigned long int width);
  49 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  50                               size_t nbytes, unsigned char *bytes,
  51                               const char *from, const char *to,
  52                               int decimal_ellipsis, int step);
  53
  54
  /* Set to true by charmap_read when the character map it loaded fails
     the ASCII-compatibility check on the basic character set.  Nothing
     in this part of the file ever resets it to false.  */
  55 bool enc_not_ascii_compatible;
  56
  57
  58 #ifdef NEED_NULL_POINTER
  59 static const char *null_pointer;
  60 #endif
  61
  62 static struct linereader *
  63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  64 {
  65   FILE *fp;
  66
  67   fp = charmap_open (directory, name);
  68   if (fp == NULL)
  69     return NULL;
  70   else
  71     {
  72       size_t dlen = strlen (directory);
  73       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  74       size_t nlen = strlen (name);
  75       char *pathname;
  76       char *p;
  77
  78       pathname = alloca (dlen + add_slash + nlen + 1);
  79       p = stpcpy (pathname, directory);
  80       if (add_slash)
  81         *p++ = '/';
  82       stpcpy (p, name);
  83
  84       return lr_create (fp, pathname, hf);
  85     }
  86 }
  87
  88 struct charmap_t *
  89 charmap_read (const char *filename, int verbose, int error_not_found,
  90               int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   const size_t pathlen = strlen (i18npath);
 110                   char i18npathbuf[pathlen + 1];
 111                   char path[pathlen + sizeof ("/charmaps")];
 112                   char *next;
 113                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 114
 115                   while (cmfile == NULL
 116                          && (next = strsep (&i18npath, ":")) != NULL)
 117                     {
 118                       stpcpy (stpcpy (path, next), "/charmaps");
 119                       cmfile = cmlr_open (path, filename, charmap_hash);
 120
 121                       if (cmfile == NULL)
 122                         /* Try without the "/charmaps" part.  */
 123                         cmfile = cmlr_open (next, filename, charmap_hash);
 124                     }
 125                 }
 126
 127               if (cmfile == NULL)
 128                 /* Try the default directory.  */
 129                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 130             }
 131         }
 132
 133       if (cmfile != NULL)
 134         result = parse_charmap (cmfile, verbose, be_quiet);
 135
 136       if (result == NULL && error_not_found)
 137         WITH_CUR_LOCALE (error (0, errno, _("\
 138 character map file `%s' not found"), filename));
 139     }
 140
 141   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 142     {
 143       /* OK, one more try.  We also accept the names given to the
 144          character sets in the files.  Sometimes they differ from the
 145          file name.  */
 146       CHARMAP_DIR *dir;
 147
 148       dir = charmap_opendir (CHARMAP_PATH);
 149       if (dir != NULL)
 150         {
 151           const char *dirent;
 152
 153           while ((dirent = charmap_readdir (dir)) != NULL)
 154             {
 155               char **aliases;
 156               char **p;
 157               int found;
 158
 159               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 160               found = 0;
 161               for (p = aliases; *p; p++)
 162                 if (strcasecmp (*p, filename) == 0)
 163                   {
 164                     found = 1;
 165                     break;
 166                   }
 167               charmap_free_aliases (aliases);
 168
 169               if (found)
 170                 {
 171                   struct linereader *cmfile;
 172
 173                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 174                   if (cmfile != NULL)
 175                     result = parse_charmap (cmfile, verbose, be_quiet);
 176
 177                   break;
 178                 }
 179             }
 180
 181           charmap_closedir (dir);
 182         }
 183     }
 184
 185   if (result == NULL && DEFAULT_CHARMAP != NULL)
 186     {
 187       struct linereader *cmfile;
 188
 189       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 190       if (cmfile != NULL)
 191         result = parse_charmap (cmfile, verbose, be_quiet);
 192
 193       if (result == NULL)
 194         WITH_CUR_LOCALE (error (4, errno, _("\
 195 default character map file `%s' not found"), DEFAULT_CHARMAP));
 196     }
 197
 198   if (result != NULL && result->code_set_name == NULL)
 199     /* The input file does not specify a code set name.  This
 200        shouldn't happen but we should cope with it.  */
 201     result->code_set_name = basename (filename);
 202
 203   /* Test of ASCII compatibility of locale encoding.
 204
 205      Verify that the encoding to be used in a locale is ASCII compatible,
 206      at least for the graphic characters, excluding the control characters,
 207      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 208
 209      ISO C 99 section 7.17.(2) (about wchar_t):
 210        the null character shall have the code value zero and each member of
 211        the basic character set shall have a code value equal to its value
 212        when used as the lone character in an integer character constant.
 213      ISO C 99 section 5.2.1.(3):
 214        Both the basic source and basic execution character sets shall have
 215        the following members: the 26 uppercase letters of the Latin alphabet
 216             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 217        the 26 lowercase letters of the Latin alphabet
 218             a b c d e f g h i j k l m n o p q r s t u v w x y z
 219        the 10 decimal digits
 220             0 1 2 3 4 5 6 7 8 9
 221        the following 29 graphic characters
 222             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 223        the space character, and control characters representing horizontal
 224        tab, vertical tab, and form feed.
 225
 226      Therefore, for all members of the "basic character set", the 'char' code
 227      must have the same value as the 'wchar_t' code, which in glibc is the
 228      same as the Unicode code, which for all of the enumerated characters
 229      is identical to the ASCII code. */
 230   if (result != NULL && use_default)
 231     {
 232       static const char basic_charset[] =
 233         {
 234           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 235           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 236           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 237           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 238           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 239           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 240           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 241           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 242         };
 243       int failed = 0;
 244       const char *p = basic_charset;
 245
 246       do
 247         {
 248           struct charseq *seq = charmap_find_symbol (result, p, 1);
 249
 250           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 251             failed = 1;
 252         }
 253       while (*p++ != '\0');
 254
 255       if (failed)
 256         {
 257           WITH_CUR_LOCALE (fprintf (stderr, _("\
 258 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 259                                     result->code_set_name));
 260           enc_not_ascii_compatible = true;
 261         }
 262     }
 263
 264   return result;
 265 }
 266
 267
 268 static struct charmap_t *
 269 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 270 {
 271   struct charmap_t *result;
 272   int state;
 273   enum token_t expected_tok = tok_error;
 274   const char *expected_str = NULL;
 275   char *from_name = NULL;
 276   char *to_name = NULL;
 277   enum token_t ellipsis = 0;
 278   int step = 1;
 279
 280   /* We don't want symbolic names in string to be translated.  */
 281   cmfile->translate_strings = 0;
 282
 283   /* Allocate room for result.  */
 284   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 285   memset (result, '\0', sizeof (struct charmap_t));
 286   /* The default DEFAULT_WIDTH is 1.  */
 287   result->width_default = 1;
 288
 289 #define obstack_chunk_alloc malloc
 290 #define obstack_chunk_free free
 291   obstack_init (&result->mem_pool);
 292
 293   if (init_hash (&result->char_table, 256)
 294       || init_hash (&result->byte_table, 256))
 295     {
 296       free (result);
 297       return NULL;
 298     }
 299
 300   /* We use a state machine to describe the charmap description file
 301      format.  */
 302   state = 1;
 303   while (1)
 304     {
 305       /* What's on?  */
 306       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 307       enum token_t nowtok = now->tok;
 308       struct token *arg;
 309
 310       if (nowtok == tok_eof)
 311         break;
 312
 313       switch (state)
 314         {
 315         case 1:
 316           /* The beginning.  We expect the special declarations, EOL or
 317              `CHARMAP'.  */
 318           if (nowtok == tok_eol)
 319             /* Ignore empty lines.  */
 320             continue;
 321
 322           if (nowtok == tok_charmap)
 323             {
 324               from_name = NULL;
 325               to_name = NULL;
 326
 327               /* We have to set up the real work.  Fill in some
 328                  default values.  */
 329               if (result->mb_cur_max == 0)
 330                 result->mb_cur_max = 1;
 331               if (result->mb_cur_min == 0)
 332                 result->mb_cur_min = result->mb_cur_max;
 333               if (result->mb_cur_min > result->mb_cur_max)
 334                 {
 335                   if (!be_quiet)
 336                     WITH_CUR_LOCALE (error (0, 0, _("\
 337 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 338                                             cmfile->fname));
 339
 340                   result->mb_cur_min = result->mb_cur_max;
 341                 }
 342
 343               lr_ignore_rest (cmfile, 1);
 344
 345               state = 2;
 346               continue;
 347             }
 348
 349           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 350               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 351               && nowtok != tok_comment_char && nowtok != tok_g0esc
 352               && nowtok != tok_g1esc && nowtok != tok_g2esc
 353               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 354               && nowtok != tok_include)
 355             {
 356               lr_error (cmfile, _("syntax error in prolog: %s"),
 357                         _("invalid definition"));
 358
 359               lr_ignore_rest (cmfile, 0);
 360               continue;
 361             }
 362
 363           /* We know that we need an argument.  */
 364           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 365
 366           switch (nowtok)
 367             {
 368             case tok_code_set_name:
 369             case tok_repertoiremap:
 370               if (arg->tok != tok_ident && arg->tok != tok_string)
 371                 {
 372                 badarg:
 373                   lr_error (cmfile, _("syntax error in prolog: %s"),
 374                             _("bad argument"));
 375
 376                   lr_ignore_rest (cmfile, 0);
 377                   continue;
 378                 }
 379
 380               if (nowtok == tok_code_set_name)
 381                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 382                                                        arg->val.str.startmb,
 383                                                        arg->val.str.lenmb);
 384               else
 385                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 386                                                        arg->val.str.startmb,
 387                                                        arg->val.str.lenmb);
 388
 389               lr_ignore_rest (cmfile, 1);
 390               continue;
 391
 392             case tok_mb_cur_max:
 393             case tok_mb_cur_min:
 394               if (arg->tok != tok_number)
 395                 goto badarg;
 396
 397               if (verbose
 398                   && ((nowtok == tok_mb_cur_max
 399                        && result->mb_cur_max != 0)
 400                       || (nowtok == tok_mb_cur_max
 401                           && result->mb_cur_max != 0)))
 402                 lr_error (cmfile, _("duplicate definition of <%s>"),
 403                           nowtok == tok_mb_cur_min
 404                           ? "mb_cur_min" : "mb_cur_max");
 405
 406               if (arg->val.num < 1)
 407                 {
 408                   lr_error (cmfile,
 409                             _("value for <%s> must be 1 or greater"),
 410                             nowtok == tok_mb_cur_min
 411                             ? "mb_cur_min" : "mb_cur_max");
 412
 413                   lr_ignore_rest (cmfile, 0);
 414                   continue;
 415                 }
 416               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 417                    && (int) arg->val.num < result->mb_cur_min)
 418                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 419                       && (int) arg->val.num > result->mb_cur_max))
 420                 {
 421                   lr_error (cmfile, _("\
 422 value of <%s> must be greater or equal than the value of <%s>"),
 423                             "mb_cur_max", "mb_cur_min");
 424
 425                   lr_ignore_rest (cmfile, 0);
 426                   continue;
 427                 }
 428
 429               if (nowtok == tok_mb_cur_max)
 430                 result->mb_cur_max = arg->val.num;
 431               else
 432                 result->mb_cur_min = arg->val.num;
 433
 434               lr_ignore_rest (cmfile, 1);
 435               continue;
 436
 437             case tok_escape_char:
 438             case tok_comment_char:
 439               if (arg->tok != tok_ident)
 440                 goto badarg;
 441
 442               if (arg->val.str.lenmb != 1)
 443                 {
 444                   lr_error (cmfile, _("\
 445 argument to <%s> must be a single character"),
 446                             nowtok == tok_escape_char ? "escape_char"
 447                                                       : "comment_char");
 448
 449                   lr_ignore_rest (cmfile, 0);
 450                   continue;
 451                 }
 452
 453               if (nowtok == tok_escape_char)
 454                 cmfile->escape_char = *arg->val.str.startmb;
 455               else
 456                 cmfile->comment_char = *arg->val.str.startmb;
 457
 458               lr_ignore_rest (cmfile, 1);
 459               continue;
 460
 461             case tok_g0esc:
 462             case tok_g1esc:
 463             case tok_g2esc:
 464             case tok_g3esc:
 465             case tok_escseq:
 466               lr_ignore_rest (cmfile, 0); /* XXX */
 467               continue;
 468
 469             case tok_include:
 470               lr_error (cmfile, _("\
 471 character sets with locking states are not supported"));
 472               exit (4);
 473
 474             default:
 475               /* Cannot happen.  */
 476               assert (! "Should not happen");
 477             }
 478           break;
 479
 480         case 2:
 481           /* We have seen `CHARMAP' and now are in the body.  Each line
 482              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 483           if (nowtok == tok_eol)
 484             /* Ignore empty lines.  */
 485             continue;
 486
 487           if (nowtok == tok_end)
 488             {
 489               expected_tok = tok_charmap;
 490               expected_str = "CHARMAP";
 491               state = 90;
 492               continue;
 493             }
 494
 495           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 496             {
 497               lr_error (cmfile, _("syntax error in %s definition: %s"),
 498                         "CHARMAP", _("no symbolic name given"));
 499
 500               lr_ignore_rest (cmfile, 0);
 501               continue;
 502             }
 503
 504           /* If the previous line was not completely correct free the
 505              used memory.  */
 506           if (from_name != NULL)
 507             obstack_free (&result->mem_pool, from_name);
 508
 509           if (nowtok == tok_bsymbol)
 510             from_name = (char *) obstack_copy0 (&result->mem_pool,
 511                                                 now->val.str.startmb,
 512                                                 now->val.str.lenmb);
 513           else
 514             {
 515               obstack_printf (&result->mem_pool, "U%08X",
 516                               cmfile->token.val.ucs4);
 517               obstack_1grow (&result->mem_pool, '\0');
 518               from_name = (char *) obstack_finish (&result->mem_pool);
 519             }
 520           to_name = NULL;
 521
 522           state = 3;
 523           continue;
 524
 525         case 3:
 526           /* We have two possibilities: We can see an ellipsis or an
 527              encoding value.  */
 528           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 529               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 530               || nowtok == tok_ellipsis2_2)
 531             {
 532               ellipsis = nowtok;
 533               if (nowtok == tok_ellipsis4_2)
 534                 {
 535                   step = 2;
 536                   nowtok = tok_ellipsis4;
 537                 }
 538               else if (nowtok == tok_ellipsis2_2)
 539                 {
 540                   step = 2;
 541                   nowtok = tok_ellipsis2;
 542                 }
 543               state = 4;
 544               continue;
 545             }
 546           /* FALLTHROUGH */
 547
 548         case 5:
 549           if (nowtok != tok_charcode)
 550             {
 551               lr_error (cmfile, _("syntax error in %s definition: %s"),
 552                         "CHARMAP", _("invalid encoding given"));
 553
 554               lr_ignore_rest (cmfile, 0);
 555
 556               state = 2;
 557               continue;
 558             }
 559
 560           if (now->val.charcode.nbytes < result->mb_cur_min)
 561             lr_error (cmfile, _("too few bytes in character encoding"));
 562           else if (now->val.charcode.nbytes > result->mb_cur_max)
 563             lr_error (cmfile, _("too many bytes in character encoding"));
 564           else
 565             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 566                               now->val.charcode.bytes, from_name, to_name,
 567                               ellipsis != tok_ellipsis2, step);
 568
 569           /* Ignore trailing comment silently.  */
 570           lr_ignore_rest (cmfile, 0);
 571
 572           from_name = NULL;
 573           to_name = NULL;
 574           ellipsis = tok_none;
 575           step = 1;
 576
 577           state = 2;
 578           continue;
 579
 580         case 4:
 581           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 582             {
 583               lr_error (cmfile, _("syntax error in %s definition: %s"),
 584                         "CHARMAP",
 585                         _("no symbolic name given for end of range"));
 586
 587               lr_ignore_rest (cmfile, 0);
 588               continue;
 589             }
 590
 591           /* Copy the to-name in a safe place.  */
 592           if (nowtok == tok_bsymbol)
 593             to_name = (char *) obstack_copy0 (&result->mem_pool,
 594                                               cmfile->token.val.str.startmb,
 595                                               cmfile->token.val.str.lenmb);
 596           else
 597             {
 598               obstack_printf (&result->mem_pool, "U%08X",
 599                               cmfile->token.val.ucs4);
 600               obstack_1grow (&result->mem_pool, '\0');
 601               to_name = (char *) obstack_finish (&result->mem_pool);
 602             }
 603
 604           state = 5;
 605           continue;
 606
 607         case 90:
 608           if (nowtok != expected_tok)
 609             lr_error (cmfile, _("\
 610 %1$s: definition does not end with `END %1$s'"), expected_str);
 611
 612           lr_ignore_rest (cmfile, nowtok == expected_tok);
 613           state = 91;
 614           continue;
 615
 616         case 91:
 617           /* Waiting for WIDTH... */
 618           if (nowtok == tok_eol)
 619             /* Ignore empty lines.  */
 620             continue;
 621
 622           if (nowtok == tok_width_default)
 623             {
 624               state = 92;
 625               continue;
 626             }
 627
 628           if (nowtok == tok_width)
 629             {
 630               lr_ignore_rest (cmfile, 1);
 631               state = 93;
 632               continue;
 633             }
 634
 635           if (nowtok == tok_width_variable)
 636             {
 637               lr_ignore_rest (cmfile, 1);
 638               state = 98;
 639               continue;
 640             }
 641
 642           lr_error (cmfile, _("\
 643 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 644
 645           lr_ignore_rest (cmfile, 0);
 646           continue;
 647
 648         case 92:
 649           if (nowtok != tok_number)
 650             lr_error (cmfile, _("value for %s must be an integer"),
 651                       "WIDTH_DEFAULT");
 652           else
 653             result->width_default = now->val.num;
 654
 655           lr_ignore_rest (cmfile, nowtok == tok_number);
 656
 657           state = 91;
 658           continue;
 659
 660         case 93:
 661           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 662              "%s...%s %d\n".  */
 663           if (nowtok == tok_eol)
 664             /* ignore empty lines.  */
 665             continue;
 666
 667           if (nowtok == tok_end)
 668             {
 669               expected_tok = tok_width;
 670               expected_str = "WIDTH";
 671               state = 90;
 672               continue;
 673             }
 674
 675           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 676             {
 677               lr_error (cmfile, _("syntax error in %s definition: %s"),
 678                         "WIDTH", _("no symbolic name given"));
 679
 680               lr_ignore_rest (cmfile, 0);
 681               continue;
 682             }
 683
 684           if (from_name != NULL)
 685             obstack_free (&result->mem_pool, from_name);
 686
 687           if (nowtok == tok_bsymbol)
 688             from_name = (char *) obstack_copy0 (&result->mem_pool,
 689                                                 now->val.str.startmb,
 690                                                 now->val.str.lenmb);
 691           else
 692             {
 693               obstack_printf (&result->mem_pool, "U%08X",
 694                               cmfile->token.val.ucs4);
 695               obstack_1grow (&result->mem_pool, '\0');
 696               from_name = (char *) obstack_finish (&result->mem_pool);
 697             }
 698
 699           to_name = NULL;
 700
 701           state = 94;
 702           continue;
 703
 704         case 94:
 705           if (nowtok == tok_ellipsis3)
 706             {
 707               state = 95;
 708               continue;
 709             }
 710
 711         case 96:
 712           if (nowtok != tok_number)
 713             lr_error (cmfile, _("value for %s must be an integer"),
 714                       "WIDTH");
 715           else
 716             {
 717               /* Store width for chars.  */
 718               new_width (cmfile, result, from_name, to_name, now->val.num);
 719
 720               from_name = NULL;
 721               to_name = NULL;
 722             }
 723
 724           lr_ignore_rest (cmfile, nowtok == tok_number);
 725
 726           state = 93;
 727           continue;
 728
 729         case 95:
 730           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 731             {
 732               lr_error (cmfile, _("syntax error in %s definition: %s"),
 733                         "WIDTH", _("no symbolic name given for end of range"));
 734
 735               lr_ignore_rest (cmfile, 0);
 736
 737               state = 93;
 738               continue;
 739             }
 740
 741           if (nowtok == tok_bsymbol)
 742             to_name = (char *) obstack_copy0 (&result->mem_pool,
 743                                               now->val.str.startmb,
 744                                               now->val.str.lenmb);
 745           else
 746             {
 747               obstack_printf (&result->mem_pool, "U%08X",
 748                               cmfile->token.val.ucs4);
 749               obstack_1grow (&result->mem_pool, '\0');
 750               to_name = (char *) obstack_finish (&result->mem_pool);
 751             }
 752
 753           state = 96;
 754           continue;
 755
 756         case 98:
 757           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 758              "%s\n" or "%s...%s\n".  */
 759           if (nowtok == tok_eol)
 760             /* ignore empty lines.  */
 761             continue;
 762
 763           if (nowtok == tok_end)
 764             {
 765               expected_tok = tok_width_variable;
 766               expected_str = "WIDTH_VARIABLE";
 767               state = 90;
 768               continue;
 769             }
 770
 771           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 772             {
 773               lr_error (cmfile, _("syntax error in %s definition: %s"),
 774                         "WIDTH_VARIABLE", _("no symbolic name given"));
 775
 776               lr_ignore_rest (cmfile, 0);
 777
 778               continue;
 779             }
 780
 781           if (from_name != NULL)
 782             obstack_free (&result->mem_pool, from_name);
 783
 784           if (nowtok == tok_bsymbol)
 785             from_name = (char *) obstack_copy0 (&result->mem_pool,
 786                                                 now->val.str.startmb,
 787                                                 now->val.str.lenmb);
 788           else
 789             {
 790               obstack_printf (&result->mem_pool, "U%08X",
 791                               cmfile->token.val.ucs4);
 792               obstack_1grow (&result->mem_pool, '\0');
 793               from_name = (char *) obstack_finish (&result->mem_pool);
 794             }
 795           to_name = NULL;
 796
 797           state = 99;
 798           continue;
 799
 800         case 99:
 801           if (nowtok == tok_ellipsis3)
 802             state = 100;
 803
 804           /* Store info.  */
 805           from_name = NULL;
 806
 807           /* Warn */
 808           state = 98;
 809           continue;
 810
 811         case 100:
 812           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 813             {
 814               lr_error (cmfile, _("syntax error in %s definition: %s"),
 815                         "WIDTH_VARIABLE",
 816                         _("no symbolic name given for end of range"));
 817               lr_ignore_rest (cmfile, 0);
 818               continue;
 819             }
 820
 821           if (nowtok == tok_bsymbol)
 822             to_name = (char *) obstack_copy0 (&result->mem_pool,
 823                                               now->val.str.startmb,
 824                                               now->val.str.lenmb);
 825           else
 826             {
 827               obstack_printf (&result->mem_pool, "U%08X",
 828                               cmfile->token.val.ucs4);
 829               obstack_1grow (&result->mem_pool, '\0');
 830               to_name = (char *) obstack_finish (&result->mem_pool);
 831             }
 832
 833           /* XXX Enter value into table.  */
 834
 835           lr_ignore_rest (cmfile, 1);
 836
 837           state = 98;
 838           continue;
 839
 840         default:
 841           WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
 842                                   __FILE__));
 843           /* NOTREACHED */
 844         }
 845       break;
 846     }
 847
 848   if (state != 91 && !be_quiet)
 849     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
 850                             cmfile->fname));
 851
 852   lr_close (cmfile);
 853
 854   return result;
 855 }
 856
 857
 858 static void
 859 new_width (struct linereader *cmfile, struct charmap_t *result,
 860            const char *from, const char *to, unsigned long int width)
 861 {
 862   struct charseq *from_val;
 863   struct charseq *to_val;
 864
 865   from_val = charmap_find_value (result, from, strlen (from));
 866   if (from_val == NULL)
 867     {
 868       lr_error (cmfile, _("unknown character `%s'"), from);
 869       return;
 870     }
 871
 872   if (to == NULL)
 873     to_val = from_val;
 874   else
 875     {
 876       to_val = charmap_find_value (result, to, strlen (to));
 877       if (to_val == NULL)
 878         {
 879           lr_error (cmfile, _("unknown character `%s'"), to);
 880           return;
 881         }
 882
 883       /* Make sure the number of bytes for the end points of the range
 884          is correct.  */
 885       if (from_val->nbytes != to_val->nbytes)
 886         {
 887           lr_error (cmfile, _("\
 888 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 889                     from_val->nbytes, to_val->nbytes);
 890           return;
 891         }
 892     }
 893
 894   if (result->nwidth_rules >= result->nwidth_rules_max)
 895     {
 896       size_t new_size = result->nwidth_rules + 32;
 897       struct width_rule *new_rules =
 898         (struct width_rule *) obstack_alloc (&result->mem_pool,
 899                                              (new_size
 900                                               * sizeof (struct width_rule)));
 901
 902       memcpy (new_rules, result->width_rules,
 903               result->nwidth_rules_max * sizeof (struct width_rule));
 904
 905       result->width_rules = new_rules;
 906       result->nwidth_rules_max = new_size;
 907     }
 908
 909   result->width_rules[result->nwidth_rules].from = from_val;
 910   result->width_rules[result->nwidth_rules].to = to_val;
 911   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 912   ++result->nwidth_rules;
 913 }
 914
 915
 916 struct charseq *
 917 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 918 {
 919   void *result;
 920
 921   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 922           < 0 ? NULL : (struct charseq *) result);
 923 }
 924
 925
 926 static void
 927 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 928                   size_t nbytes, unsigned char *bytes,
 929                   const char *from, const char *to,
 930                   int decimal_ellipsis, int step)
 931 {
 932   hash_table *ht = &cm->char_table;
 933   hash_table *bt = &cm->byte_table;
 934   struct obstack *ob = &cm->mem_pool;
 935   char *from_end;
 936   char *to_end;
 937   const char *cp;
 938   int prefix_len, len1, len2;
 939   unsigned int from_nr, to_nr, cnt;
 940   struct charseq *newp;
 941
 942   len1 = strlen (from);
 943
 944   if (to == NULL)
 945     {
 946       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 947       newp->nbytes = nbytes;
 948       memcpy (newp->bytes, bytes, nbytes);
 949       newp->name = from;
 950
 951       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 952       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 953         {
 954           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 955              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 956              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 957              this character and we don't have to consult the repertoire
 958              map.
 959
 960              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 961              and xxxxxxxx also give the code point in UCS4 but this must
 962              be in the private, i.e., unassigned, area.  This should be
 963              used for characters which do not (yet) have an equivalent
 964              in ISO 10646 and Unicode.  */
 965           char *endp;
 966
 967           errno = 0;
 968           newp->ucs4 = strtoul (from + 1, &endp, 16);
 969           if (endp - from != len1
 970               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 971               || newp->ucs4 >= 0x80000000)
 972             /* This wasn't successful.  Signal this name cannot be a
 973                correct UCS value.  */
 974             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 975         }
 976
 977       insert_entry (ht, from, len1, newp);
 978       insert_entry (bt, newp->bytes, nbytes, newp);
 979       /* Please note that it isn't a bug if a symbol is defined more
 980          than once.  All later definitions are simply discarded.  */
 981       return;
 982     }
 983
 984   /* We have a range: the names must have names with equal prefixes
 985      and an equal number of digits, where the second number is greater
 986      or equal than the first.  */
 987   len2 = strlen (to);
 988
 989   if (len1 != len2)
 990     {
 991     illegal_range:
 992       lr_error (lr, _("invalid names for character range"));
 993       return;
 994     }
 995
 996   cp = &from[len1 - 1];
 997   if (decimal_ellipsis)
 998     while (isdigit (*cp) && cp >= from)
 999       --cp;
1000   else
1001     while (isxdigit (*cp) && cp >= from)
1002       {
1003         if (!isdigit (*cp) && !isupper (*cp))
1004           lr_error (lr, _("\
1005 hexadecimal range format should use only capital characters"));
1006         --cp;
1007       }
1008
1009   prefix_len = (cp - from) + 1;
1010
1011   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1012     goto illegal_range;
1013
1014   errno = 0;
1015   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1016   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1017       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1018                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1019           && errno == ERANGE)
1020       || *to_end != '\0')
1021     {
1022       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1023       return;
1024     }
1025
1026   if (from_nr > to_nr)
1027     {
1028       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1029       return;
1030     }
1031
1032   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1033     {
1034       char *name_end;
1035       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1036                       prefix_len, from, len1 - prefix_len, cnt);
1037       obstack_1grow (ob, '\0');
1038       name_end = obstack_finish (ob);
1039
1040       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1041       newp->nbytes = nbytes;
1042       memcpy (newp->bytes, bytes, nbytes);
1043       newp->name = name_end;
1044
1045       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1046       if ((name_end[0] == 'U' || name_end[0] == 'P')
1047           && (len1 == 5 || len1 == 9))
1048         {
1049           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1050              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1051              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1052              this character and we don't have to consult the repertoire
1053              map.
1054
1055              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1056              and xxxxxxxx also give the code point in UCS4 but this must
1057              be in the private, i.e., unassigned, area.  This should be
1058              used for characters which do not (yet) have an equivalent
1059              in ISO 10646 and Unicode.  */
1060           char *endp;
1061
1062           errno = 0;
1063           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1064           if (endp - name_end != len1
1065               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1066               || newp->ucs4 >= 0x80000000)
1067             /* This wasn't successful.  Signal this name cannot be a
1068                correct UCS value.  */
1069             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1070         }
1071
1072       insert_entry (ht, name_end, len1, newp);
1073       insert_entry (bt, newp->bytes, nbytes, newp);
1074       /* Please note we don't examine the return value since it is no error
1075          if we have two definitions for a symbol.  */
1076
1077       /* Increment the value in the byte sequence.  */
1078       if (++bytes[nbytes - 1] == '\0')
1079         {
1080           int b = nbytes - 2;
1081
1082           do
1083             if (b < 0)
1084               {
1085                 lr_error (lr,
1086                           _("resulting bytes for range not representable."));
1087                 return;
1088               }
1089           while (++bytes[b--] == 0);
1090         }
1091     }
1092 }
1093
1094
1095 struct charseq *
1096 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1097                      size_t nbytes)
1098 {
1099   void *result;
1100
1101   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1102           < 0 ? NULL : (struct charseq *) result);
1103 }