locale/programs/charmap.c

   1 /* Copyright (C) 1996, 1998-2002, 2003, 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <limits.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <error.h>
  32
  33 #include "localedef.h"
  34 #include "linereader.h"
  35 #include "charmap.h"
  36 #include "charmap-dir.h"
  37
  38 #include <assert.h>
  39
  40
  41 /* Define the lookup function.  */
  42 #include "charmap-kw.h"
  43
  44
  45 /* Prototypes for local functions.  */
  46 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  47                                         int verbose, int be_quiet);
  48 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  49                        const char *from, const char *to,
  50                        unsigned long int width);
  51 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  52                               size_t nbytes, unsigned char *bytes,
  53                               const char *from, const char *to,
  54                               int decimal_ellipsis, int step);
  55
  56
  57 bool enc_not_ascii_compatible;
  58
  59
  60 #ifdef NEED_NULL_POINTER
  61 static const char *null_pointer;
  62 #endif
  63
  64 static struct linereader *
  65 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  66 {
  67   FILE *fp;
  68
  69   fp = charmap_open (directory, name);
  70   if (fp == NULL)
  71     return NULL;
  72   else
  73     {
  74       size_t dlen = strlen (directory);
  75       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  76       size_t nlen = strlen (name);
  77       char *pathname;
  78       char *p;
  79
  80       pathname = alloca (dlen + add_slash + nlen + 1);
  81       p = stpcpy (pathname, directory);
  82       if (add_slash)
  83         *p++ = '/';
  84       stpcpy (p, name);
  85
  86       return lr_create (fp, pathname, hf);
  87     }
  88 }
  89
  90 struct charmap_t *
  91 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
  92 {
  93   struct charmap_t *result = NULL;
  94
  95   if (filename != NULL)
  96     {
  97       struct linereader *cmfile;
  98
  99       /* First try the name as found in the parameter.  */
 100       cmfile = lr_open (filename, charmap_hash);
 101       if (cmfile == NULL)
 102         {
 103           /* No successful.  So start looking through the directories
 104              in the I18NPATH if this is a simple name.  */
 105           if (strchr (filename, '/') == NULL)
 106             {
 107               char *i18npath = getenv ("I18NPATH");
 108               if (i18npath != NULL && *i18npath != '\0')
 109                 {
 110                   const size_t pathlen = strlen (i18npath);
 111                   char i18npathbuf[pathlen + 1];
 112                   char path[pathlen + sizeof ("/charmaps")];
 113                   char *next;
 114                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 115
 116                   while (cmfile == NULL
 117                          && (next = strsep (&i18npath, ":")) != NULL)
 118                     {
 119                       stpcpy (stpcpy (path, next), "/charmaps");
 120                       cmfile = cmlr_open (path, filename, charmap_hash);
 121
 122                       if (cmfile == NULL)
 123                         /* Try without the "/charmaps" part.  */
 124                         cmfile = cmlr_open (next, filename, charmap_hash);
 125                     }
 126                 }
 127
 128               if (cmfile == NULL)
 129                 /* Try the default directory.  */
 130                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 131             }
 132         }
 133
 134       if (cmfile != NULL)
 135         {
 136           result = parse_charmap (cmfile, verbose, be_quiet);
 137
 138           if (result == NULL && !be_quiet)
 139             WITH_CUR_LOCALE (error (0, errno, _("\
 140 character map file `%s' not found"), filename));
 141         }
 142     }
 143
 144   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 145     {
 146       /* OK, one more try.  We also accept the names given to the
 147          character sets in the files.  Sometimes they differ from the
 148          file name.  */
 149       CHARMAP_DIR *dir;
 150
 151       dir = charmap_opendir (CHARMAP_PATH);
 152       if (dir != NULL)
 153         {
 154           const char *dirent;
 155
 156           while ((dirent = charmap_readdir (dir)) != NULL)
 157             {
 158               char **aliases;
 159               char **p;
 160               int found;
 161
 162               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 163               found = 0;
 164               for (p = aliases; *p; p++)
 165                 if (strcasecmp (*p, filename) == 0)
 166                   {
 167                     found = 1;
 168                     break;
 169                   }
 170               charmap_free_aliases (aliases);
 171
 172               if (found)
 173                 {
 174                   struct linereader *cmfile;
 175
 176                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 177                   if (cmfile != NULL)
 178                     result = parse_charmap (cmfile, verbose, be_quiet);
 179
 180                   break;
 181                 }
 182             }
 183
 184           charmap_closedir (dir);
 185         }
 186     }
 187
 188   if (result == NULL && DEFAULT_CHARMAP != NULL)
 189     {
 190       struct linereader *cmfile;
 191
 192       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 193       if (cmfile != NULL)
 194         result = parse_charmap (cmfile, verbose, be_quiet);
 195
 196       if (result == NULL)
 197         WITH_CUR_LOCALE (error (4, errno, _("\
 198 default character map file `%s' not found"), DEFAULT_CHARMAP));
 199     }
 200
 201   if (result != NULL && result->code_set_name == NULL)
 202     /* The input file does not specify a code set name.  This
 203        shouldn't happen but we should cope with it.  */
 204     result->code_set_name = basename (filename);
 205
 206   /* Test of ASCII compatibility of locale encoding.
 207
 208      Verify that the encoding to be used in a locale is ASCII compatible,
 209      at least for the graphic characters, excluding the control characters,
 210      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 211
 212      ISO C 99 section 7.17.(2) (about wchar_t):
 213        the null character shall have the code value zero and each member of
 214        the basic character set shall have a code value equal to its value
 215        when used as the lone character in an integer character constant.
 216      ISO C 99 section 5.2.1.(3):
 217        Both the basic source and basic execution character sets shall have
 218        the following members: the 26 uppercase letters of the Latin alphabet
 219             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 220        the 26 lowercase letters of the Latin alphabet
 221             a b c d e f g h i j k l m n o p q r s t u v w x y z
 222        the 10 decimal digits
 223             0 1 2 3 4 5 6 7 8 9
 224        the following 29 graphic characters
 225             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 226        the space character, and control characters representing horizontal
 227        tab, vertical tab, and form feed.
 228
 229      Therefore, for all members of the "basic character set", the 'char' code
 230      must have the same value as the 'wchar_t' code, which in glibc is the
 231      same as the Unicode code, which for all of the enumerated characters
 232      is identical to the ASCII code. */
 233   if (result != NULL && use_default)
 234     {
 235       static const char basic_charset[] =
 236         {
 237           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 238           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 239           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 240           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 241           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 242           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 243           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 244           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 245         };
 246       int failed = 0;
 247       const char *p = basic_charset;
 248
 249       do
 250         {
 251           struct charseq *seq = charmap_find_symbol (result, p, 1);
 252
 253           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 254             failed = 1;
 255         }
 256       while (*p++ != '\0');
 257
 258       if (failed)
 259         {
 260           WITH_CUR_LOCALE (fprintf (stderr, _("\
 261 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 262                                     result->code_set_name));
 263           enc_not_ascii_compatible = true;
 264         }
 265     }
 266
 267   return result;
 268 }
 269
 270
 271 static struct charmap_t *
 272 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 273 {
 274   struct charmap_t *result;
 275   int state;
 276   enum token_t expected_tok = tok_error;
 277   const char *expected_str = NULL;
 278   char *from_name = NULL;
 279   char *to_name = NULL;
 280   enum token_t ellipsis = 0;
 281   int step = 1;
 282
 283   /* We don't want symbolic names in string to be translated.  */
 284   cmfile->translate_strings = 0;
 285
 286   /* Allocate room for result.  */
 287   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 288   memset (result, '\0', sizeof (struct charmap_t));
 289   /* The default DEFAULT_WIDTH is 1.  */
 290   result->width_default = 1;
 291
 292 #define obstack_chunk_alloc malloc
 293 #define obstack_chunk_free free
 294   obstack_init (&result->mem_pool);
 295
 296   if (init_hash (&result->char_table, 256)
 297       || init_hash (&result->byte_table, 256))
 298     {
 299       free (result);
 300       return NULL;
 301     }
 302
 303   /* We use a state machine to describe the charmap description file
 304      format.  */
 305   state = 1;
 306   while (1)
 307     {
 308       /* What's on?  */
 309       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 310       enum token_t nowtok = now->tok;
 311       struct token *arg;
 312
 313       if (nowtok == tok_eof)
 314         break;
 315
 316       switch (state)
 317         {
 318         case 1:
 319           /* The beginning.  We expect the special declarations, EOL or
 320              `CHARMAP'.  */
 321           if (nowtok == tok_eol)
 322             /* Ignore empty lines.  */
 323             continue;
 324
 325           if (nowtok == tok_charmap)
 326             {
 327               from_name = NULL;
 328               to_name = NULL;
 329
 330               /* We have to set up the real work.  Fill in some
 331                  default values.  */
 332               if (result->mb_cur_max == 0)
 333                 result->mb_cur_max = 1;
 334               if (result->mb_cur_min == 0)
 335                 result->mb_cur_min = result->mb_cur_max;
 336               if (result->mb_cur_min > result->mb_cur_max)
 337                 {
 338                   if (!be_quiet)
 339                     WITH_CUR_LOCALE (error (0, 0, _("\
 340 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 341                                             cmfile->fname));
 342
 343                   result->mb_cur_min = result->mb_cur_max;
 344                 }
 345
 346               lr_ignore_rest (cmfile, 1);
 347
 348               state = 2;
 349               continue;
 350             }
 351
 352           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 353               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 354               && nowtok != tok_comment_char && nowtok != tok_g0esc
 355               && nowtok != tok_g1esc && nowtok != tok_g2esc
 356               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 357               && nowtok != tok_include)
 358             {
 359               lr_error (cmfile, _("syntax error in prolog: %s"),
 360                         _("invalid definition"));
 361
 362               lr_ignore_rest (cmfile, 0);
 363               continue;
 364             }
 365
 366           /* We know that we need an argument.  */
 367           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 368
 369           switch (nowtok)
 370             {
 371             case tok_code_set_name:
 372             case tok_repertoiremap:
 373               if (arg->tok != tok_ident && arg->tok != tok_string)
 374                 {
 375                 badarg:
 376                   lr_error (cmfile, _("syntax error in prolog: %s"),
 377                             _("bad argument"));
 378
 379                   lr_ignore_rest (cmfile, 0);
 380                   continue;
 381                 }
 382
 383               if (nowtok == tok_code_set_name)
 384                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 385                                                        arg->val.str.startmb,
 386                                                        arg->val.str.lenmb);
 387               else
 388                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 389                                                        arg->val.str.startmb,
 390                                                        arg->val.str.lenmb);
 391
 392               lr_ignore_rest (cmfile, 1);
 393               continue;
 394
 395             case tok_mb_cur_max:
 396             case tok_mb_cur_min:
 397               if (arg->tok != tok_number)
 398                 goto badarg;
 399
 400               if (verbose
 401                   && ((nowtok == tok_mb_cur_max
 402                        && result->mb_cur_max != 0)
 403                       || (nowtok == tok_mb_cur_max
 404                           && result->mb_cur_max != 0)))
 405                 lr_error (cmfile, _("duplicate definition of <%s>"),
 406                           nowtok == tok_mb_cur_min
 407                           ? "mb_cur_min" : "mb_cur_max");
 408
 409               if (arg->val.num < 1)
 410                 {
 411                   lr_error (cmfile,
 412                             _("value for <%s> must be 1 or greater"),
 413                             nowtok == tok_mb_cur_min
 414                             ? "mb_cur_min" : "mb_cur_max");
 415
 416                   lr_ignore_rest (cmfile, 0);
 417                   continue;
 418                 }
 419               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 420                    && (int) arg->val.num < result->mb_cur_min)
 421                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 422                       && (int) arg->val.num > result->mb_cur_max))
 423                 {
 424                   lr_error (cmfile, _("\
 425 value of <%s> must be greater or equal than the value of <%s>"),
 426                             "mb_cur_max", "mb_cur_min");
 427
 428                   lr_ignore_rest (cmfile, 0);
 429                   continue;
 430                 }
 431
 432               if (nowtok == tok_mb_cur_max)
 433                 result->mb_cur_max = arg->val.num;
 434               else
 435                 result->mb_cur_min = arg->val.num;
 436
 437               lr_ignore_rest (cmfile, 1);
 438               continue;
 439
 440             case tok_escape_char:
 441             case tok_comment_char:
 442               if (arg->tok != tok_ident)
 443                 goto badarg;
 444
 445               if (arg->val.str.lenmb != 1)
 446                 {
 447                   lr_error (cmfile, _("\
 448 argument to <%s> must be a single character"),
 449                             nowtok == tok_escape_char ? "escape_char"
 450                                                       : "comment_char");
 451
 452                   lr_ignore_rest (cmfile, 0);
 453                   continue;
 454                 }
 455
 456               if (nowtok == tok_escape_char)
 457                 cmfile->escape_char = *arg->val.str.startmb;
 458               else
 459                 cmfile->comment_char = *arg->val.str.startmb;
 460
 461               lr_ignore_rest (cmfile, 1);
 462               continue;
 463
 464             case tok_g0esc:
 465             case tok_g1esc:
 466             case tok_g2esc:
 467             case tok_g3esc:
 468             case tok_escseq:
 469               lr_ignore_rest (cmfile, 0); /* XXX */
 470               continue;
 471
 472             case tok_include:
 473               lr_error (cmfile, _("\
 474 character sets with locking states are not supported"));
 475               exit (4);
 476
 477             default:
 478               /* Cannot happen.  */
 479               assert (! "Should not happen");
 480             }
 481           break;
 482
 483         case 2:
 484           /* We have seen `CHARMAP' and now are in the body.  Each line
 485              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 486           if (nowtok == tok_eol)
 487             /* Ignore empty lines.  */
 488             continue;
 489
 490           if (nowtok == tok_end)
 491             {
 492               expected_tok = tok_charmap;
 493               expected_str = "CHARMAP";
 494               state = 90;
 495               continue;
 496             }
 497
 498           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 499             {
 500               lr_error (cmfile, _("syntax error in %s definition: %s"),
 501                         "CHARMAP", _("no symbolic name given"));
 502
 503               lr_ignore_rest (cmfile, 0);
 504               continue;
 505             }
 506
 507           /* If the previous line was not completely correct free the
 508              used memory.  */
 509           if (from_name != NULL)
 510             obstack_free (&result->mem_pool, from_name);
 511
 512           if (nowtok == tok_bsymbol)
 513             from_name = (char *) obstack_copy0 (&result->mem_pool,
 514                                                 now->val.str.startmb,
 515                                                 now->val.str.lenmb);
 516           else
 517             {
 518               obstack_printf (&result->mem_pool, "U%08X",
 519                               cmfile->token.val.ucs4);
 520               obstack_1grow (&result->mem_pool, '\0');
 521               from_name = (char *) obstack_finish (&result->mem_pool);
 522             }
 523           to_name = NULL;
 524
 525           state = 3;
 526           continue;
 527
 528         case 3:
 529           /* We have two possibilities: We can see an ellipsis or an
 530              encoding value.  */
 531           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 532               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 533               || nowtok == tok_ellipsis2_2)
 534             {
 535               ellipsis = nowtok;
 536               if (nowtok == tok_ellipsis4_2)
 537                 {
 538                   step = 2;
 539                   nowtok = tok_ellipsis4;
 540                 }
 541               else if (nowtok == tok_ellipsis2_2)
 542                 {
 543                   step = 2;
 544                   nowtok = tok_ellipsis2;
 545                 }
 546               state = 4;
 547               continue;
 548             }
 549           /* FALLTHROUGH */
 550
 551         case 5:
 552           if (nowtok != tok_charcode)
 553             {
 554               lr_error (cmfile, _("syntax error in %s definition: %s"),
 555                         "CHARMAP", _("invalid encoding given"));
 556
 557               lr_ignore_rest (cmfile, 0);
 558
 559               state = 2;
 560               continue;
 561             }
 562
 563           if (now->val.charcode.nbytes < result->mb_cur_min)
 564             lr_error (cmfile, _("too few bytes in character encoding"));
 565           else if (now->val.charcode.nbytes > result->mb_cur_max)
 566             lr_error (cmfile, _("too many bytes in character encoding"));
 567           else
 568             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 569                               now->val.charcode.bytes, from_name, to_name,
 570                               ellipsis != tok_ellipsis2, step);
 571
 572           /* Ignore trailing comment silently.  */
 573           lr_ignore_rest (cmfile, 0);
 574
 575           from_name = NULL;
 576           to_name = NULL;
 577           ellipsis = tok_none;
 578           step = 1;
 579
 580           state = 2;
 581           continue;
 582
 583         case 4:
 584           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 585             {
 586               lr_error (cmfile, _("syntax error in %s definition: %s"),
 587                         "CHARMAP",
 588                         _("no symbolic name given for end of range"));
 589
 590               lr_ignore_rest (cmfile, 0);
 591               continue;
 592             }
 593
 594           /* Copy the to-name in a safe place.  */
 595           if (nowtok == tok_bsymbol)
 596             to_name = (char *) obstack_copy0 (&result->mem_pool,
 597                                               cmfile->token.val.str.startmb,
 598                                               cmfile->token.val.str.lenmb);
 599           else
 600             {
 601               obstack_printf (&result->mem_pool, "U%08X",
 602                               cmfile->token.val.ucs4);
 603               obstack_1grow (&result->mem_pool, '\0');
 604               to_name = (char *) obstack_finish (&result->mem_pool);
 605             }
 606
 607           state = 5;
 608           continue;
 609
 610         case 90:
 611           if (nowtok != expected_tok)
 612             lr_error (cmfile, _("\
 613 `%1$s' definition does not end with `END %1$s'"), expected_str);
 614
 615           lr_ignore_rest (cmfile, nowtok == expected_tok);
 616           state = 91;
 617           continue;
 618
 619         case 91:
 620           /* Waiting for WIDTH... */
 621           if (nowtok == tok_eol)
 622             /* Ignore empty lines.  */
 623             continue;
 624
 625           if (nowtok == tok_width_default)
 626             {
 627               state = 92;
 628               continue;
 629             }
 630
 631           if (nowtok == tok_width)
 632             {
 633               lr_ignore_rest (cmfile, 1);
 634               state = 93;
 635               continue;
 636             }
 637
 638           if (nowtok == tok_width_variable)
 639             {
 640               lr_ignore_rest (cmfile, 1);
 641               state = 98;
 642               continue;
 643             }
 644
 645           lr_error (cmfile, _("\
 646 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 647
 648           lr_ignore_rest (cmfile, 0);
 649           continue;
 650
 651         case 92:
 652           if (nowtok != tok_number)
 653             lr_error (cmfile, _("value for %s must be an integer"),
 654                       "WIDTH_DEFAULT");
 655           else
 656             result->width_default = now->val.num;
 657
 658           lr_ignore_rest (cmfile, nowtok == tok_number);
 659
 660           state = 91;
 661           continue;
 662
 663         case 93:
 664           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 665              "%s...%s %d\n".  */
 666           if (nowtok == tok_eol)
 667             /* ignore empty lines.  */
 668             continue;
 669
 670           if (nowtok == tok_end)
 671             {
 672               expected_tok = tok_width;
 673               expected_str = "WIDTH";
 674               state = 90;
 675               continue;
 676             }
 677
 678           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 679             {
 680               lr_error (cmfile, _("syntax error in %s definition: %s"),
 681                         "WIDTH", _("no symbolic name given"));
 682
 683               lr_ignore_rest (cmfile, 0);
 684               continue;
 685             }
 686
 687           if (from_name != NULL)
 688             obstack_free (&result->mem_pool, from_name);
 689
 690           if (nowtok == tok_bsymbol)
 691             from_name = (char *) obstack_copy0 (&result->mem_pool,
 692                                                 now->val.str.startmb,
 693                                                 now->val.str.lenmb);
 694           else
 695             {
 696               obstack_printf (&result->mem_pool, "U%08X",
 697                               cmfile->token.val.ucs4);
 698               obstack_1grow (&result->mem_pool, '\0');
 699               from_name = (char *) obstack_finish (&result->mem_pool);
 700             }
 701
 702           to_name = NULL;
 703
 704           state = 94;
 705           continue;
 706
 707         case 94:
 708           if (nowtok == tok_ellipsis3)
 709             {
 710               state = 95;
 711               continue;
 712             }
 713
 714         case 96:
 715           if (nowtok != tok_number)
 716             lr_error (cmfile, _("value for %s must be an integer"),
 717                       "WIDTH");
 718           else
 719             {
 720               /* Store width for chars.  */
 721               new_width (cmfile, result, from_name, to_name, now->val.num);
 722
 723               from_name = NULL;
 724               to_name = NULL;
 725             }
 726
 727           lr_ignore_rest (cmfile, nowtok == tok_number);
 728
 729           state = 93;
 730           continue;
 731
 732         case 95:
 733           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 734             {
 735               lr_error (cmfile, _("syntax error in %s definition: %s"),
 736                         "WIDTH", _("no symbolic name given for end of range"));
 737
 738               lr_ignore_rest (cmfile, 0);
 739
 740               state = 93;
 741               continue;
 742             }
 743
 744           if (nowtok == tok_bsymbol)
 745             to_name = (char *) obstack_copy0 (&result->mem_pool,
 746                                               now->val.str.startmb,
 747                                               now->val.str.lenmb);
 748           else
 749             {
 750               obstack_printf (&result->mem_pool, "U%08X",
 751                               cmfile->token.val.ucs4);
 752               obstack_1grow (&result->mem_pool, '\0');
 753               to_name = (char *) obstack_finish (&result->mem_pool);
 754             }
 755
 756           state = 96;
 757           continue;
 758
 759         case 98:
 760           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 761              "%s\n" or "%s...%s\n".  */
 762           if (nowtok == tok_eol)
 763             /* ignore empty lines.  */
 764             continue;
 765
 766           if (nowtok == tok_end)
 767             {
 768               expected_tok = tok_width_variable;
 769               expected_str = "WIDTH_VARIABLE";
 770               state = 90;
 771               continue;
 772             }
 773
 774           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 775             {
 776               lr_error (cmfile, _("syntax error in %s definition: %s"),
 777                         "WIDTH_VARIABLE", _("no symbolic name given"));
 778
 779               lr_ignore_rest (cmfile, 0);
 780
 781               continue;
 782             }
 783
 784           if (from_name != NULL)
 785             obstack_free (&result->mem_pool, from_name);
 786
 787           if (nowtok == tok_bsymbol)
 788             from_name = (char *) obstack_copy0 (&result->mem_pool,
 789                                                 now->val.str.startmb,
 790                                                 now->val.str.lenmb);
 791           else
 792             {
 793               obstack_printf (&result->mem_pool, "U%08X",
 794                               cmfile->token.val.ucs4);
 795               obstack_1grow (&result->mem_pool, '\0');
 796               from_name = (char *) obstack_finish (&result->mem_pool);
 797             }
 798           to_name = NULL;
 799
 800           state = 99;
 801           continue;
 802
 803         case 99:
 804           if (nowtok == tok_ellipsis3)
 805             state = 100;
 806
 807           /* Store info.  */
 808           from_name = NULL;
 809
 810           /* Warn */
 811           state = 98;
 812           continue;
 813
 814         case 100:
 815           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 816             {
 817               lr_error (cmfile, _("syntax error in %s definition: %s"),
 818                         "WIDTH_VARIABLE",
 819                         _("no symbolic name given for end of range"));
 820               lr_ignore_rest (cmfile, 0);
 821               continue;
 822             }
 823
 824           if (nowtok == tok_bsymbol)
 825             to_name = (char *) obstack_copy0 (&result->mem_pool,
 826                                               now->val.str.startmb,
 827                                               now->val.str.lenmb);
 828           else
 829             {
 830               obstack_printf (&result->mem_pool, "U%08X",
 831                               cmfile->token.val.ucs4);
 832               obstack_1grow (&result->mem_pool, '\0');
 833               to_name = (char *) obstack_finish (&result->mem_pool);
 834             }
 835
 836           /* XXX Enter value into table.  */
 837
 838           lr_ignore_rest (cmfile, 1);
 839
 840           state = 98;
 841           continue;
 842
 843         default:
 844           WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
 845                                   __FILE__));
 846           /* NOTREACHED */
 847         }
 848       break;
 849     }
 850
 851   if (state != 91 && !be_quiet)
 852     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
 853                             cmfile->fname));
 854
 855   lr_close (cmfile);
 856
 857   return result;
 858 }
 859
 860
 861 static void
 862 new_width (struct linereader *cmfile, struct charmap_t *result,
 863            const char *from, const char *to, unsigned long int width)
 864 {
 865   struct charseq *from_val;
 866   struct charseq *to_val;
 867
 868   from_val = charmap_find_value (result, from, strlen (from));
 869   if (from_val == NULL)
 870     {
 871       lr_error (cmfile, _("unknown character `%s'"), from);
 872       return;
 873     }
 874
 875   if (to == NULL)
 876     to_val = from_val;
 877   else
 878     {
 879       to_val = charmap_find_value (result, to, strlen (to));
 880       if (to_val == NULL)
 881         {
 882           lr_error (cmfile, _("unknown character `%s'"), to);
 883           return;
 884         }
 885
 886       /* Make sure the number of bytes for the end points of the range
 887          is correct.  */
 888       if (from_val->nbytes != to_val->nbytes)
 889         {
 890           lr_error (cmfile, _("\
 891 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 892                     from_val->nbytes, to_val->nbytes);
 893           return;
 894         }
 895     }
 896
 897   if (result->nwidth_rules >= result->nwidth_rules_max)
 898     {
 899       size_t new_size = result->nwidth_rules + 32;
 900       struct width_rule *new_rules =
 901         (struct width_rule *) obstack_alloc (&result->mem_pool,
 902                                              (new_size
 903                                               * sizeof (struct width_rule)));
 904
 905       memcpy (new_rules, result->width_rules,
 906               result->nwidth_rules_max * sizeof (struct width_rule));
 907
 908       result->width_rules = new_rules;
 909       result->nwidth_rules_max = new_size;
 910     }
 911
 912   result->width_rules[result->nwidth_rules].from = from_val;
 913   result->width_rules[result->nwidth_rules].to = to_val;
 914   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 915   ++result->nwidth_rules;
 916 }
 917
 918
 919 struct charseq *
 920 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 921 {
 922   void *result;
 923
 924   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 925           < 0 ? NULL : (struct charseq *) result);
 926 }
 927
 928
 929 static void
 930 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 931                   size_t nbytes, unsigned char *bytes,
 932                   const char *from, const char *to,
 933                   int decimal_ellipsis, int step)
 934 {
 935   hash_table *ht = &cm->char_table;
 936   hash_table *bt = &cm->byte_table;
 937   struct obstack *ob = &cm->mem_pool;
 938   char *from_end;
 939   char *to_end;
 940   const char *cp;
 941   int prefix_len, len1, len2;
 942   unsigned int from_nr, to_nr, cnt;
 943   struct charseq *newp;
 944
 945   len1 = strlen (from);
 946
 947   if (to == NULL)
 948     {
 949       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 950       newp->nbytes = nbytes;
 951       memcpy (newp->bytes, bytes, nbytes);
 952       newp->name = from;
 953
 954       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 955       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 956         {
 957           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 958              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 959              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 960              this character and we don't have to consult the repertoire
 961              map.
 962
 963              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 964              and xxxxxxxx also give the code point in UCS4 but this must
 965              be in the private, i.e., unassigned, area.  This should be
 966              used for characters which do not (yet) have an equivalent
 967              in ISO 10646 and Unicode.  */
 968           char *endp;
 969
 970           errno = 0;
 971           newp->ucs4 = strtoul (from + 1, &endp, 16);
 972           if (endp - from != len1
 973               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 974               || newp->ucs4 >= 0x80000000)
 975             /* This wasn't successful.  Signal this name cannot be a
 976                correct UCS value.  */
 977             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 978         }
 979
 980       insert_entry (ht, from, len1, newp);
 981       insert_entry (bt, newp->bytes, nbytes, newp);
 982       /* Please note that it isn't a bug if a symbol is defined more
 983          than once.  All later definitions are simply discarded.  */
 984       return;
 985     }
 986
 987   /* We have a range: the names must have names with equal prefixes
 988      and an equal number of digits, where the second number is greater
 989      or equal than the first.  */
 990   len2 = strlen (to);
 991
 992   if (len1 != len2)
 993     {
 994     illegal_range:
 995       lr_error (lr, _("invalid names for character range"));
 996       return;
 997     }
 998
 999   cp = &from[len1 - 1];
1000   if (decimal_ellipsis)
1001     while (isdigit (*cp) && cp >= from)
1002       --cp;
1003   else
1004     while (isxdigit (*cp) && cp >= from)
1005       {
1006         if (!isdigit (*cp) && !isupper (*cp))
1007           lr_error (lr, _("\
1008 hexadecimal range format should use only capital characters"));
1009         --cp;
1010       }
1011
1012   prefix_len = (cp - from) + 1;
1013
1014   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1015     goto illegal_range;
1016
1017   errno = 0;
1018   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1019   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1020       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1021                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1022           && errno == ERANGE)
1023       || *to_end != '\0')
1024     {
1025       lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1026       return;
1027     }
1028
1029   if (from_nr > to_nr)
1030     {
1031       lr_error (lr, _("upper limit in range is not higher then lower limit"));
1032       return;
1033     }
1034
1035   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1036     {
1037       char *name_end;
1038       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1039                       prefix_len, from, len1 - prefix_len, cnt);
1040       obstack_1grow (ob, '\0');
1041       name_end = obstack_finish (ob);
1042
1043       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1044       newp->nbytes = nbytes;
1045       memcpy (newp->bytes, bytes, nbytes);
1046       newp->name = name_end;
1047
1048       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1049       if ((name_end[0] == 'U' || name_end[0] == 'P')
1050           && (len1 == 5 || len1 == 9))
1051         {
1052           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1053              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1054              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1055              this character and we don't have to consult the repertoire
1056              map.
1057
1058              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1059              and xxxxxxxx also give the code point in UCS4 but this must
1060              be in the private, i.e., unassigned, area.  This should be
1061              used for characters which do not (yet) have an equivalent
1062              in ISO 10646 and Unicode.  */
1063           char *endp;
1064
1065           errno = 0;
1066           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1067           if (endp - name_end != len1
1068               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1069               || newp->ucs4 >= 0x80000000)
1070             /* This wasn't successful.  Signal this name cannot be a
1071                correct UCS value.  */
1072             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1073         }
1074
1075       insert_entry (ht, name_end, len1, newp);
1076       insert_entry (bt, newp->bytes, nbytes, newp);
1077       /* Please note we don't examine the return value since it is no error
1078          if we have two definitions for a symbol.  */
1079
1080       /* Increment the value in the byte sequence.  */
1081       if (++bytes[nbytes - 1] == '\0')
1082         {
1083           int b = nbytes - 2;
1084
1085           do
1086             if (b < 0)
1087               {
1088                 lr_error (lr,
1089                           _("resulting bytes for range not representable."));
1090                 return;
1091               }
1092           while (++bytes[b--] == 0);
1093         }
1094     }
1095 }
1096
1097
1098 struct charseq *
1099 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1100                      size_t nbytes)
1101 {
1102   void *result;
1103
1104   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1105           < 0 ? NULL : (struct charseq *) result);
1106 }