locale/programs/charmap.c

   1 /* Copyright (C) 1996, 1998-2002, 2003, 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <limits.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <error.h>
  32
  33 #include "localedef.h"
  34 #include "linereader.h"
  35 #include "charmap.h"
  36 #include "charmap-dir.h"
  37
  38 #include <assert.h>
  39
  40
  41 /* Define the lookup function.  */
  42 #include "charmap-kw.h"
  43
  44
  45 /* Prototypes for local functions.  */
  46 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  47                                         int verbose, int be_quiet);
  48 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  49                        const char *from, const char *to,
  50                        unsigned long int width);
  51 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  52                               int nbytes, char *bytes, const char *from,
  53                               const char *to, int decimal_ellipsis, int step);
  54
  55
  56 bool enc_not_ascii_compatible;
  57
  58
  59 #ifdef NEED_NULL_POINTER
  60 static const char *null_pointer;
  61 #endif
  62
  63 static struct linereader *
  64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  65 {
  66   FILE *fp;
  67
  68   fp = charmap_open (directory, name);
  69   if (fp == NULL)
  70     return NULL;
  71   else
  72     {
  73       size_t dlen = strlen (directory);
  74       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  75       size_t nlen = strlen (name);
  76       char *pathname;
  77       char *p;
  78
  79       pathname = alloca (dlen + add_slash + nlen + 1);
  80       p = stpcpy (pathname, directory);
  81       if (add_slash)
  82         *p++ = '/';
  83       stpcpy (p, name);
  84
  85       return lr_create (fp, pathname, hf);
  86     }
  87 }
  88
  89 struct charmap_t *
  90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   const size_t pathlen = strlen (i18npath);
 110                   char i18npathbuf[pathlen + 1];
 111                   char path[pathlen + sizeof ("/charmaps")];
 112                   char *next;
 113                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 114
 115                   while (cmfile == NULL
 116                          && (next = strsep (&i18npath, ":")) != NULL)
 117                     {
 118                       stpcpy (stpcpy (path, next), "/charmaps");
 119                       cmfile = cmlr_open (path, filename, charmap_hash);
 120
 121                       if (cmfile == NULL)
 122                         /* Try without the "/charmaps" part.  */
 123                         cmfile = cmlr_open (next, filename, charmap_hash);
 124                     }
 125                 }
 126
 127               if (cmfile == NULL)
 128                 /* Try the default directory.  */
 129                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 130             }
 131         }
 132
 133       if (cmfile != NULL)
 134         {
 135           result = parse_charmap (cmfile, verbose, be_quiet);
 136
 137           if (result == NULL && !be_quiet)
 138             WITH_CUR_LOCALE (error (0, errno, _("\
 139 character map file `%s' not found"), filename));
 140         }
 141     }
 142
 143   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 144     {
 145       /* OK, one more try.  We also accept the names given to the
 146          character sets in the files.  Sometimes they differ from the
 147          file name.  */
 148       CHARMAP_DIR *dir;
 149
 150       dir = charmap_opendir (CHARMAP_PATH);
 151       if (dir != NULL)
 152         {
 153           const char *dirent;
 154
 155           while ((dirent = charmap_readdir (dir)) != NULL)
 156             {
 157               char **aliases;
 158               char **p;
 159               int found;
 160
 161               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 162               found = 0;
 163               for (p = aliases; *p; p++)
 164                 if (strcasecmp (*p, filename) == 0)
 165                   {
 166                     found = 1;
 167                     break;
 168                   }
 169               charmap_free_aliases (aliases);
 170
 171               if (found)
 172                 {
 173                   struct linereader *cmfile;
 174
 175                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 176                   if (cmfile != NULL)
 177                     result = parse_charmap (cmfile, verbose, be_quiet);
 178
 179                   break;
 180                 }
 181             }
 182
 183           charmap_closedir (dir);
 184         }
 185     }
 186
 187   if (result == NULL && DEFAULT_CHARMAP != NULL)
 188     {
 189       struct linereader *cmfile;
 190
 191       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 192       if (cmfile != NULL)
 193         result = parse_charmap (cmfile, verbose, be_quiet);
 194
 195       if (result == NULL)
 196         WITH_CUR_LOCALE (error (4, errno, _("\
 197 default character map file `%s' not found"), DEFAULT_CHARMAP));
 198     }
 199
 200   if (result != NULL && result->code_set_name == NULL)
 201     /* The input file does not specify a code set name.  This
 202        shouldn't happen but we should cope with it.  */
 203     result->code_set_name = basename (filename);
 204
 205   /* Test of ASCII compatibility of locale encoding.
 206
 207      Verify that the encoding to be used in a locale is ASCII compatible,
 208      at least for the graphic characters, excluding the control characters,
 209      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 210
 211      ISO C 99 section 7.17.(2) (about wchar_t):
 212        the null character shall have the code value zero and each member of
 213        the basic character set shall have a code value equal to its value
 214        when used as the lone character in an integer character constant.
 215      ISO C 99 section 5.2.1.(3):
 216        Both the basic source and basic execution character sets shall have
 217        the following members: the 26 uppercase letters of the Latin alphabet
 218             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 219        the 26 lowercase letters of the Latin alphabet
 220             a b c d e f g h i j k l m n o p q r s t u v w x y z
 221        the 10 decimal digits
 222             0 1 2 3 4 5 6 7 8 9
 223        the following 29 graphic characters
 224             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 225        the space character, and control characters representing horizontal
 226        tab, vertical tab, and form feed.
 227
 228      Therefore, for all members of the "basic character set", the 'char' code
 229      must have the same value as the 'wchar_t' code, which in glibc is the
 230      same as the Unicode code, which for all of the enumerated characters
 231      is identical to the ASCII code. */
 232   if (result != NULL && use_default)
 233     {
 234       static const char basic_charset[] =
 235         {
 236           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 237           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 238           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 239           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 240           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 241           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 242           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 243           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 244         };
 245       int failed = 0;
 246       const char *p = basic_charset;
 247
 248       do
 249         {
 250           struct charseq *seq = charmap_find_symbol (result, p, 1);
 251
 252           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 253             failed = 1;
 254         }
 255       while (*p++ != '\0');
 256
 257       if (failed)
 258         {
 259           WITH_CUR_LOCALE (fprintf (stderr, _("\
 260 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 261                                     result->code_set_name));
 262           enc_not_ascii_compatible = true;
 263         }
 264     }
 265
 266   return result;
 267 }
 268
 269
 270 static struct charmap_t *
 271 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 272 {
 273   struct charmap_t *result;
 274   int state;
 275   enum token_t expected_tok = tok_error;
 276   const char *expected_str = NULL;
 277   char *from_name = NULL;
 278   char *to_name = NULL;
 279   enum token_t ellipsis = 0;
 280   int step = 1;
 281
 282   /* We don't want symbolic names in string to be translated.  */
 283   cmfile->translate_strings = 0;
 284
 285   /* Allocate room for result.  */
 286   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 287   memset (result, '\0', sizeof (struct charmap_t));
 288   /* The default DEFAULT_WIDTH is 1.  */
 289   result->width_default = 1;
 290
 291 #define obstack_chunk_alloc malloc
 292 #define obstack_chunk_free free
 293   obstack_init (&result->mem_pool);
 294
 295   if (init_hash (&result->char_table, 256)
 296       || init_hash (&result->byte_table, 256))
 297     {
 298       free (result);
 299       return NULL;
 300     }
 301
 302   /* We use a state machine to describe the charmap description file
 303      format.  */
 304   state = 1;
 305   while (1)
 306     {
 307       /* What's on?  */
 308       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 309       enum token_t nowtok = now->tok;
 310       struct token *arg;
 311
 312       if (nowtok == tok_eof)
 313         break;
 314
 315       switch (state)
 316         {
 317         case 1:
 318           /* The beginning.  We expect the special declarations, EOL or
 319              `CHARMAP'.  */
 320           if (nowtok == tok_eol)
 321             /* Ignore empty lines.  */
 322             continue;
 323
 324           if (nowtok == tok_charmap)
 325             {
 326               from_name = NULL;
 327               to_name = NULL;
 328
 329               /* We have to set up the real work.  Fill in some
 330                  default values.  */
 331               if (result->mb_cur_max == 0)
 332                 result->mb_cur_max = 1;
 333               if (result->mb_cur_min == 0)
 334                 result->mb_cur_min = result->mb_cur_max;
 335               if (result->mb_cur_min > result->mb_cur_max)
 336                 {
 337                   if (!be_quiet)
 338                     WITH_CUR_LOCALE (error (0, 0, _("\
 339 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 340                                             cmfile->fname));
 341
 342                   result->mb_cur_min = result->mb_cur_max;
 343                 }
 344
 345               lr_ignore_rest (cmfile, 1);
 346
 347               state = 2;
 348               continue;
 349             }
 350
 351           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 352               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 353               && nowtok != tok_comment_char && nowtok != tok_g0esc
 354               && nowtok != tok_g1esc && nowtok != tok_g2esc
 355               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 356               && nowtok != tok_include)
 357             {
 358               lr_error (cmfile, _("syntax error in prolog: %s"),
 359                         _("invalid definition"));
 360
 361               lr_ignore_rest (cmfile, 0);
 362               continue;
 363             }
 364
 365           /* We know that we need an argument.  */
 366           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 367
 368           switch (nowtok)
 369             {
 370             case tok_code_set_name:
 371             case tok_repertoiremap:
 372               if (arg->tok != tok_ident && arg->tok != tok_string)
 373                 {
 374                 badarg:
 375                   lr_error (cmfile, _("syntax error in prolog: %s"),
 376                             _("bad argument"));
 377
 378                   lr_ignore_rest (cmfile, 0);
 379                   continue;
 380                 }
 381
 382               if (nowtok == tok_code_set_name)
 383                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 384                                                        arg->val.str.startmb,
 385                                                        arg->val.str.lenmb);
 386               else
 387                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 388                                                        arg->val.str.startmb,
 389                                                        arg->val.str.lenmb);
 390
 391               lr_ignore_rest (cmfile, 1);
 392               continue;
 393
 394             case tok_mb_cur_max:
 395             case tok_mb_cur_min:
 396               if (arg->tok != tok_number)
 397                 goto badarg;
 398
 399               if (verbose
 400                   && ((nowtok == tok_mb_cur_max
 401                        && result->mb_cur_max != 0)
 402                       || (nowtok == tok_mb_cur_max
 403                           && result->mb_cur_max != 0)))
 404                 lr_error (cmfile, _("duplicate definition of <%s>"),
 405                           nowtok == tok_mb_cur_min
 406                           ? "mb_cur_min" : "mb_cur_max");
 407
 408               if (arg->val.num < 1)
 409                 {
 410                   lr_error (cmfile,
 411                             _("value for <%s> must be 1 or greater"),
 412                             nowtok == tok_mb_cur_min
 413                             ? "mb_cur_min" : "mb_cur_max");
 414
 415                   lr_ignore_rest (cmfile, 0);
 416                   continue;
 417                 }
 418               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 419                    && (int) arg->val.num < result->mb_cur_min)
 420                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 421                       && (int) arg->val.num > result->mb_cur_max))
 422                 {
 423                   lr_error (cmfile, _("\
 424 value of <%s> must be greater or equal than the value of <%s>"),
 425                             "mb_cur_max", "mb_cur_min");
 426
 427                   lr_ignore_rest (cmfile, 0);
 428                   continue;
 429                 }
 430
 431               if (nowtok == tok_mb_cur_max)
 432                 result->mb_cur_max = arg->val.num;
 433               else
 434                 result->mb_cur_min = arg->val.num;
 435
 436               lr_ignore_rest (cmfile, 1);
 437               continue;
 438
 439             case tok_escape_char:
 440             case tok_comment_char:
 441               if (arg->tok != tok_ident)
 442                 goto badarg;
 443
 444               if (arg->val.str.lenmb != 1)
 445                 {
 446                   lr_error (cmfile, _("\
 447 argument to <%s> must be a single character"),
 448                             nowtok == tok_escape_char ? "escape_char"
 449                                                       : "comment_char");
 450
 451                   lr_ignore_rest (cmfile, 0);
 452                   continue;
 453                 }
 454
 455               if (nowtok == tok_escape_char)
 456                 cmfile->escape_char = *arg->val.str.startmb;
 457               else
 458                 cmfile->comment_char = *arg->val.str.startmb;
 459
 460               lr_ignore_rest (cmfile, 1);
 461               continue;
 462
 463             case tok_g0esc:
 464             case tok_g1esc:
 465             case tok_g2esc:
 466             case tok_g3esc:
 467             case tok_escseq:
 468               lr_ignore_rest (cmfile, 0); /* XXX */
 469               continue;
 470
 471             case tok_include:
 472               lr_error (cmfile, _("\
 473 character sets with locking states are not supported"));
 474               exit (4);
 475
 476             default:
 477               /* Cannot happen.  */
 478               assert (! "Should not happen");
 479             }
 480           break;
 481
 482         case 2:
 483           /* We have seen `CHARMAP' and now are in the body.  Each line
 484              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 485           if (nowtok == tok_eol)
 486             /* Ignore empty lines.  */
 487             continue;
 488
 489           if (nowtok == tok_end)
 490             {
 491               expected_tok = tok_charmap;
 492               expected_str = "CHARMAP";
 493               state = 90;
 494               continue;
 495             }
 496
 497           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 498             {
 499               lr_error (cmfile, _("syntax error in %s definition: %s"),
 500                         "CHARMAP", _("no symbolic name given"));
 501
 502               lr_ignore_rest (cmfile, 0);
 503               continue;
 504             }
 505
 506           /* If the previous line was not completely correct free the
 507              used memory.  */
 508           if (from_name != NULL)
 509             obstack_free (&result->mem_pool, from_name);
 510
 511           if (nowtok == tok_bsymbol)
 512             from_name = (char *) obstack_copy0 (&result->mem_pool,
 513                                                 now->val.str.startmb,
 514                                                 now->val.str.lenmb);
 515           else
 516             {
 517               obstack_printf (&result->mem_pool, "U%08X",
 518                               cmfile->token.val.ucs4);
 519               obstack_1grow (&result->mem_pool, '\0');
 520               from_name = (char *) obstack_finish (&result->mem_pool);
 521             }
 522           to_name = NULL;
 523
 524           state = 3;
 525           continue;
 526
 527         case 3:
 528           /* We have two possibilities: We can see an ellipsis or an
 529              encoding value.  */
 530           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 531               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 532               || nowtok == tok_ellipsis2_2)
 533             {
 534               ellipsis = nowtok;
 535               if (nowtok == tok_ellipsis4_2)
 536                 {
 537                   step = 2;
 538                   nowtok = tok_ellipsis4;
 539                 }
 540               else if (nowtok == tok_ellipsis2_2)
 541                 {
 542                   step = 2;
 543                   nowtok = tok_ellipsis2;
 544                 }
 545               state = 4;
 546               continue;
 547             }
 548           /* FALLTHROUGH */
 549
 550         case 5:
 551           if (nowtok != tok_charcode)
 552             {
 553               lr_error (cmfile, _("syntax error in %s definition: %s"),
 554                         "CHARMAP", _("invalid encoding given"));
 555
 556               lr_ignore_rest (cmfile, 0);
 557
 558               state = 2;
 559               continue;
 560             }
 561
 562           if (now->val.charcode.nbytes < result->mb_cur_min)
 563             lr_error (cmfile, _("too few bytes in character encoding"));
 564           else if (now->val.charcode.nbytes > result->mb_cur_max)
 565             lr_error (cmfile, _("too many bytes in character encoding"));
 566           else
 567             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 568                               now->val.charcode.bytes, from_name, to_name,
 569                               ellipsis != tok_ellipsis2, step);
 570
 571           /* Ignore trailing comment silently.  */
 572           lr_ignore_rest (cmfile, 0);
 573
 574           from_name = NULL;
 575           to_name = NULL;
 576           ellipsis = tok_none;
 577           step = 1;
 578
 579           state = 2;
 580           continue;
 581
 582         case 4:
 583           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 584             {
 585               lr_error (cmfile, _("syntax error in %s definition: %s"),
 586                         "CHARMAP",
 587                         _("no symbolic name given for end of range"));
 588
 589               lr_ignore_rest (cmfile, 0);
 590               continue;
 591             }
 592
 593           /* Copy the to-name in a safe place.  */
 594           if (nowtok == tok_bsymbol)
 595             to_name = (char *) obstack_copy0 (&result->mem_pool,
 596                                               cmfile->token.val.str.startmb,
 597                                               cmfile->token.val.str.lenmb);
 598           else
 599             {
 600               obstack_printf (&result->mem_pool, "U%08X",
 601                               cmfile->token.val.ucs4);
 602               obstack_1grow (&result->mem_pool, '\0');
 603               to_name = (char *) obstack_finish (&result->mem_pool);
 604             }
 605
 606           state = 5;
 607           continue;
 608
 609         case 90:
 610           if (nowtok != expected_tok)
 611             lr_error (cmfile, _("\
 612 `%1$s' definition does not end with `END %1$s'"), expected_str);
 613
 614           lr_ignore_rest (cmfile, nowtok == expected_tok);
 615           state = 91;
 616           continue;
 617
 618         case 91:
 619           /* Waiting for WIDTH... */
 620           if (nowtok == tok_eol)
 621             /* Ignore empty lines.  */
 622             continue;
 623
 624           if (nowtok == tok_width_default)
 625             {
 626               state = 92;
 627               continue;
 628             }
 629
 630           if (nowtok == tok_width)
 631             {
 632               lr_ignore_rest (cmfile, 1);
 633               state = 93;
 634               continue;
 635             }
 636
 637           if (nowtok == tok_width_variable)
 638             {
 639               lr_ignore_rest (cmfile, 1);
 640               state = 98;
 641               continue;
 642             }
 643
 644           lr_error (cmfile, _("\
 645 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 646
 647           lr_ignore_rest (cmfile, 0);
 648           continue;
 649
 650         case 92:
 651           if (nowtok != tok_number)
 652             lr_error (cmfile, _("value for %s must be an integer"),
 653                       "WIDTH_DEFAULT");
 654           else
 655             result->width_default = now->val.num;
 656
 657           lr_ignore_rest (cmfile, nowtok == tok_number);
 658
 659           state = 91;
 660           continue;
 661
 662         case 93:
 663           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 664              "%s...%s %d\n".  */
 665           if (nowtok == tok_eol)
 666             /* ignore empty lines.  */
 667             continue;
 668
 669           if (nowtok == tok_end)
 670             {
 671               expected_tok = tok_width;
 672               expected_str = "WIDTH";
 673               state = 90;
 674               continue;
 675             }
 676
 677           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 678             {
 679               lr_error (cmfile, _("syntax error in %s definition: %s"),
 680                         "WIDTH", _("no symbolic name given"));
 681
 682               lr_ignore_rest (cmfile, 0);
 683               continue;
 684             }
 685
 686           if (from_name != NULL)
 687             obstack_free (&result->mem_pool, from_name);
 688
 689           if (nowtok == tok_bsymbol)
 690             from_name = (char *) obstack_copy0 (&result->mem_pool,
 691                                                 now->val.str.startmb,
 692                                                 now->val.str.lenmb);
 693           else
 694             {
 695               obstack_printf (&result->mem_pool, "U%08X",
 696                               cmfile->token.val.ucs4);
 697               obstack_1grow (&result->mem_pool, '\0');
 698               from_name = (char *) obstack_finish (&result->mem_pool);
 699             }
 700
 701           to_name = NULL;
 702
 703           state = 94;
 704           continue;
 705
 706         case 94:
 707           if (nowtok == tok_ellipsis3)
 708             {
 709               state = 95;
 710               continue;
 711             }
 712
 713         case 96:
 714           if (nowtok != tok_number)
 715             lr_error (cmfile, _("value for %s must be an integer"),
 716                       "WIDTH");
 717           else
 718             {
 719               /* Store width for chars.  */
 720               new_width (cmfile, result, from_name, to_name, now->val.num);
 721
 722               from_name = NULL;
 723               to_name = NULL;
 724             }
 725
 726           lr_ignore_rest (cmfile, nowtok == tok_number);
 727
 728           state = 93;
 729           continue;
 730
 731         case 95:
 732           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 733             {
 734               lr_error (cmfile, _("syntax error in %s definition: %s"),
 735                         "WIDTH", _("no symbolic name given for end of range"));
 736
 737               lr_ignore_rest (cmfile, 0);
 738
 739               state = 93;
 740               continue;
 741             }
 742
 743           if (nowtok == tok_bsymbol)
 744             to_name = (char *) obstack_copy0 (&result->mem_pool,
 745                                               now->val.str.startmb,
 746                                               now->val.str.lenmb);
 747           else
 748             {
 749               obstack_printf (&result->mem_pool, "U%08X",
 750                               cmfile->token.val.ucs4);
 751               obstack_1grow (&result->mem_pool, '\0');
 752               to_name = (char *) obstack_finish (&result->mem_pool);
 753             }
 754
 755           state = 96;
 756           continue;
 757
 758         case 98:
 759           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 760              "%s\n" or "%s...%s\n".  */
 761           if (nowtok == tok_eol)
 762             /* ignore empty lines.  */
 763             continue;
 764
 765           if (nowtok == tok_end)
 766             {
 767               expected_tok = tok_width_variable;
 768               expected_str = "WIDTH_VARIABLE";
 769               state = 90;
 770               continue;
 771             }
 772
 773           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 774             {
 775               lr_error (cmfile, _("syntax error in %s definition: %s"),
 776                         "WIDTH_VARIABLE", _("no symbolic name given"));
 777
 778               lr_ignore_rest (cmfile, 0);
 779
 780               continue;
 781             }
 782
 783           if (from_name != NULL)
 784             obstack_free (&result->mem_pool, from_name);
 785
 786           if (nowtok == tok_bsymbol)
 787             from_name = (char *) obstack_copy0 (&result->mem_pool,
 788                                                 now->val.str.startmb,
 789                                                 now->val.str.lenmb);
 790           else
 791             {
 792               obstack_printf (&result->mem_pool, "U%08X",
 793                               cmfile->token.val.ucs4);
 794               obstack_1grow (&result->mem_pool, '\0');
 795               from_name = (char *) obstack_finish (&result->mem_pool);
 796             }
 797           to_name = NULL;
 798
 799           state = 99;
 800           continue;
 801
 802         case 99:
 803           if (nowtok == tok_ellipsis3)
 804             state = 100;
 805
 806           /* Store info.  */
 807           from_name = NULL;
 808
 809           /* Warn */
 810           state = 98;
 811           continue;
 812
 813         case 100:
 814           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 815             {
 816               lr_error (cmfile, _("syntax error in %s definition: %s"),
 817                         "WIDTH_VARIABLE",
 818                         _("no symbolic name given for end of range"));
 819               lr_ignore_rest (cmfile, 0);
 820               continue;
 821             }
 822
 823           if (nowtok == tok_bsymbol)
 824             to_name = (char *) obstack_copy0 (&result->mem_pool,
 825                                               now->val.str.startmb,
 826                                               now->val.str.lenmb);
 827           else
 828             {
 829               obstack_printf (&result->mem_pool, "U%08X",
 830                               cmfile->token.val.ucs4);
 831               obstack_1grow (&result->mem_pool, '\0');
 832               to_name = (char *) obstack_finish (&result->mem_pool);
 833             }
 834
 835           /* XXX Enter value into table.  */
 836
 837           lr_ignore_rest (cmfile, 1);
 838
 839           state = 98;
 840           continue;
 841
 842         default:
 843           WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
 844                                   __FILE__));
 845           /* NOTREACHED */
 846         }
 847       break;
 848     }
 849
 850   if (state != 91 && !be_quiet)
 851     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
 852                             cmfile->fname));
 853
 854   lr_close (cmfile);
 855
 856   return result;
 857 }
 858
 859
 860 static void
 861 new_width (struct linereader *cmfile, struct charmap_t *result,
 862            const char *from, const char *to, unsigned long int width)
 863 {
 864   struct charseq *from_val;
 865   struct charseq *to_val;
 866
 867   from_val = charmap_find_value (result, from, strlen (from));
 868   if (from_val == NULL)
 869     {
 870       lr_error (cmfile, _("unknown character `%s'"), from);
 871       return;
 872     }
 873
 874   if (to == NULL)
 875     to_val = from_val;
 876   else
 877     {
 878       to_val = charmap_find_value (result, to, strlen (to));
 879       if (to_val == NULL)
 880         {
 881           lr_error (cmfile, _("unknown character `%s'"), to);
 882           return;
 883         }
 884
 885       /* Make sure the number of bytes for the end points of the range
 886          is correct.  */
 887       if (from_val->nbytes != to_val->nbytes)
 888         {
 889           lr_error (cmfile, _("\
 890 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 891                     from_val->nbytes, to_val->nbytes);
 892           return;
 893         }
 894     }
 895
 896   if (result->nwidth_rules >= result->nwidth_rules_max)
 897     {
 898       size_t new_size = result->nwidth_rules + 32;
 899       struct width_rule *new_rules =
 900         (struct width_rule *) obstack_alloc (&result->mem_pool,
 901                                              (new_size
 902                                               * sizeof (struct width_rule)));
 903
 904       memcpy (new_rules, result->width_rules,
 905               result->nwidth_rules_max * sizeof (struct width_rule));
 906
 907       result->width_rules = new_rules;
 908       result->nwidth_rules_max = new_size;
 909     }
 910
 911   result->width_rules[result->nwidth_rules].from = from_val;
 912   result->width_rules[result->nwidth_rules].to = to_val;
 913   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 914   ++result->nwidth_rules;
 915 }
 916
 917
 918 struct charseq *
 919 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 920 {
 921   void *result;
 922
 923   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 924           < 0 ? NULL : (struct charseq *) result);
 925 }
 926
 927
 928 static void
 929 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 930                   int nbytes, char *bytes, const char *from, const char *to,
 931                   int decimal_ellipsis, int step)
 932 {
 933   hash_table *ht = &cm->char_table;
 934   hash_table *bt = &cm->byte_table;
 935   struct obstack *ob = &cm->mem_pool;
 936   char *from_end;
 937   char *to_end;
 938   const char *cp;
 939   int prefix_len, len1, len2;
 940   unsigned int from_nr, to_nr, cnt;
 941   struct charseq *newp;
 942
 943   len1 = strlen (from);
 944
 945   if (to == NULL)
 946     {
 947       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 948       newp->nbytes = nbytes;
 949       memcpy (newp->bytes, bytes, nbytes);
 950       newp->name = from;
 951
 952       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 953       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 954         {
 955           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 956              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 957              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 958              this character and we don't have to consult the repertoire
 959              map.
 960
 961              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 962              and xxxxxxxx also give the code point in UCS4 but this must
 963              be in the private, i.e., unassigned, area.  This should be
 964              used for characters which do not (yet) have an equivalent
 965              in ISO 10646 and Unicode.  */
 966           char *endp;
 967
 968           errno = 0;
 969           newp->ucs4 = strtoul (from + 1, &endp, 16);
 970           if (endp - from != len1
 971               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 972               || newp->ucs4 >= 0x80000000)
 973             /* This wasn't successful.  Signal this name cannot be a
 974                correct UCS value.  */
 975             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 976         }
 977
 978       insert_entry (ht, from, len1, newp);
 979       insert_entry (bt, newp->bytes, nbytes, newp);
 980       /* Please note that it isn't a bug if a symbol is defined more
 981          than once.  All later definitions are simply discarded.  */
 982       return;
 983     }
 984
 985   /* We have a range: the names must have names with equal prefixes
 986      and an equal number of digits, where the second number is greater
 987      or equal than the first.  */
 988   len2 = strlen (to);
 989
 990   if (len1 != len2)
 991     {
 992     illegal_range:
 993       lr_error (lr, _("invalid names for character range"));
 994       return;
 995     }
 996
 997   cp = &from[len1 - 1];
 998   if (decimal_ellipsis)
 999     while (isdigit (*cp) && cp >= from)
1000       --cp;
1001   else
1002     while (isxdigit (*cp) && cp >= from)
1003       {
1004         if (!isdigit (*cp) && !isupper (*cp))
1005           lr_error (lr, _("\
1006 hexadecimal range format should use only capital characters"));
1007         --cp;
1008       }
1009
1010   prefix_len = (cp - from) + 1;
1011
1012   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1013     goto illegal_range;
1014
1015   errno = 0;
1016   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1017   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1018       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1019                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1020           && errno == ERANGE)
1021       || *to_end != '\0')
1022     {
1023       lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1024       return;
1025     }
1026
1027   if (from_nr > to_nr)
1028     {
1029       lr_error (lr, _("upper limit in range is not higher then lower limit"));
1030       return;
1031     }
1032
1033   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1034     {
1035       char *name_end;
1036       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1037                       prefix_len, from, len1 - prefix_len, cnt);
1038       obstack_1grow (ob, '\0');
1039       name_end = obstack_finish (ob);
1040
1041       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1042       newp->nbytes = nbytes;
1043       memcpy (newp->bytes, bytes, nbytes);
1044       newp->name = name_end;
1045
1046       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1047       if ((name_end[0] == 'U' || name_end[0] == 'P')
1048           && (len1 == 5 || len1 == 9))
1049         {
1050           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1051              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1052              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1053              this character and we don't have to consult the repertoire
1054              map.
1055
1056              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1057              and xxxxxxxx also give the code point in UCS4 but this must
1058              be in the private, i.e., unassigned, area.  This should be
1059              used for characters which do not (yet) have an equivalent
1060              in ISO 10646 and Unicode.  */
1061           char *endp;
1062
1063           errno = 0;
1064           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1065           if (endp - name_end != len1
1066               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1067               || newp->ucs4 >= 0x80000000)
1068             /* This wasn't successful.  Signal this name cannot be a
1069                correct UCS value.  */
1070             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1071         }
1072
1073       insert_entry (ht, name_end, len1, newp);
1074       insert_entry (bt, newp->bytes, nbytes, newp);
1075       /* Please note we don't examine the return value since it is no error
1076          if we have two definitions for a symbol.  */
1077
1078       /* Increment the value in the byte sequence.  */
1079       if (++bytes[nbytes - 1] == '\0')
1080         {
1081           int b = nbytes - 2;
1082
1083           do
1084             if (b < 0)
1085               {
1086                 lr_error (lr,
1087                           _("resulting bytes for range not representable."));
1088                 return;
1089               }
1090           while (++bytes[b--] == 0);
1091         }
1092     }
1093 }
1094
1095
1096 struct charseq *
1097 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1098                      size_t nbytes)
1099 {
1100   void *result;
1101
1102   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1103           < 0 ? NULL : (struct charseq *) result);
1104 }