locale/programs/charmap.c

   1 /* Copyright (C) 1996,1998,1999,2000,2001 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <limits.h>
  28 #include <obstack.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "error.h"
  34 #include "linereader.h"
  35 #include "charmap.h"
  36 #include "charmap-dir.h"
  37 #include "repertoire.h"
  38
  39 #include <assert.h>
  40
  41
  42 /* Define the lookup function.  */
  43 #include "charmap-kw.h"
  44
  45
  /* Allocation routine; only declared in this file, the definition is not
     part of it.  */
  extern void *xmalloc (size_t n);

  /* Forward declarations of the file-local helpers defined further down.  */
  static struct charmap_t *parse_charmap (struct linereader *cmfile, int verbose,
                                          int be_quiet);
  static void new_width (struct linereader *cmfile, struct charmap_t *result,
                         const char *from, const char *to,
                         unsigned long int width);
  static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                                int nbytes, char *bytes, const char *from,
                                const char *to, int decimal_ellipsis,
                                int step);


  /* NOTE(review): not referenced in the part of the file reviewed here.  */
  static const char *null_pointer;
  60
  61 static struct linereader *
  62 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  63 {
  64   FILE *fp;
  65
  66   fp = charmap_open (directory, name);
  67   if (fp == NULL)
  68     return NULL;
  69   else
  70     {
  71       size_t dlen = strlen (directory);
  72       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  73       size_t nlen = strlen (name);
  74       char *pathname;
  75       char *p;
  76
  77       pathname = alloca (dlen + add_slash + nlen + 1);
  78       p = stpcpy (pathname, directory);
  79       if (add_slash)
  80         *p++ = '/';
  81       stpcpy (p, name);
  82
  83       return lr_create (fp, pathname, hf);
  84     }
  85 }
  86
  87 struct charmap_t *
  88 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
  89 {
  90   struct charmap_t *result = NULL;
  91
  92   if (filename != NULL)
  93     {
  94       struct linereader *cmfile;
  95
  96       /* First try the name as found in the parameter.  */
  97       cmfile = lr_open (filename, charmap_hash);
  98       if (cmfile == NULL)
  99         {
 100           /* No successful.  So start looking through the directories
 101              in the I18NPATH if this is a simple name.  */
 102           if (strchr (filename, '/') == NULL)
 103             {
 104               char *i18npath = getenv ("I18NPATH");
 105               if (i18npath != NULL && *i18npath != '\0')
 106                 {
 107                   char path[strlen (i18npath) + sizeof ("/charmaps")];
 108                   char *next;
 109                   i18npath = strdupa (i18npath);
 110
 111                   while (cmfile == NULL
 112                          && (next = strsep (&i18npath, ":")) != NULL)
 113                     {
 114                       stpcpy (stpcpy (path, next), "/charmaps");
 115                       cmfile = cmlr_open (path, filename, charmap_hash);
 116
 117                       if (cmfile == NULL)
 118                         {
 119                           /* Try without the "/charmaps" part.  */
 120                           cmfile = cmlr_open (next, filename, charmap_hash);
 121                         }
 122                     }
 123                 }
 124
 125               if (cmfile == NULL)
 126                 {
 127                   /* Try the default directory.  */
 128                   cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 129                 }
 130             }
 131         }
 132
 133       if (cmfile != NULL)
 134         {
 135           result = parse_charmap (cmfile, verbose, be_quiet);
 136
 137           if (result == NULL && !be_quiet)
 138             error (0, errno, _("character map file `%s' not found"), filename);
 139         }
 140     }
 141
 142   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 143     {
 144       /* OK, one more try.  We also accept the names given to the
 145          character sets in the files.  Sometimes they differ from the
 146          file name.  */
 147       CHARMAP_DIR *dir;
 148
 149       dir = charmap_opendir (CHARMAP_PATH);
 150       if (dir != NULL)
 151         {
 152           const char *dirent;
 153
 154           while ((dirent = charmap_readdir (dir)) != NULL)
 155             {
 156               char **aliases;
 157               char **p;
 158               int found;
 159
 160               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 161               found = 0;
 162               for (p = aliases; *p; p++)
 163                 if (strcasecmp (*p, filename) == 0)
 164                   {
 165                     found = 1;
 166                     break;
 167                   }
 168               charmap_free_aliases (aliases);
 169
 170               if (found)
 171                 {
 172                   struct linereader *cmfile;
 173
 174                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 175                   if (cmfile != NULL)
 176                     result = parse_charmap (cmfile, verbose, be_quiet);
 177
 178                   break;
 179                 }
 180             }
 181
 182           charmap_closedir (dir);
 183         }
 184     }
 185
 186   if (result == NULL && DEFAULT_CHARMAP != NULL)
 187     {
 188       struct linereader *cmfile;
 189
 190       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 191       if (cmfile != NULL)
 192         result = parse_charmap (cmfile, verbose, be_quiet);
 193
 194       if (result == NULL)
 195         error (4, errno, _("default character map file `%s' not found"),
 196                DEFAULT_CHARMAP);
 197     }
 198
 199   /* Test of ASCII compatibility of locale encoding.
 200
 201      Verify that the encoding to be used in a locale is ASCII compatible,
 202      at least for the graphic characters, excluding the control characters,
 203      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 204
 205      ISO C 99 section 7.17.(2) (about wchar_t):
 206        the null character shall have the code value zero and each member of
 207        the basic character set shall have a code value equal to its value
 208        when used as the lone character in an integer character constant.
 209      ISO C 99 section 5.2.1.(3):
 210        Both the basic source and basic execution character sets shall have
 211        the following members: the 26 uppercase letters of the Latin alphabet
 212             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 213        the 26 lowercase letters of the Latin alphabet
 214             a b c d e f g h i j k l m n o p q r s t u v w x y z
 215        the 10 decimal digits
 216             0 1 2 3 4 5 6 7 8 9
 217        the following 29 graphic characters
 218             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 219        the space character, and control characters representing horizontal
 220        tab, vertical tab, and form feed.
 221
 222      Therefore, for all members of the "basic character set", the 'char' code
 223      must have the same value as the 'wchar_t' code, which in glibc is the
 224      same as the Unicode code, which for all of the enumerated characters
 225      is identical to the ASCII code. */
 226   if (result != NULL && use_default)
 227     {
 228       static const char basic_charset[] =
 229         {
 230           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 231           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 232           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 233           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 234           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 235           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 236           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 237           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 238         };
 239       int failed = 0;
 240       const char *p = basic_charset;
 241
 242       do
 243         {
 244           struct charseq * seq = charmap_find_symbol (result, p, 1);
 245
 246           if (seq == NULL || seq->ucs4 != *p)
 247             failed = 1;
 248         }
 249       while (*p++ != '\0');
 250
 251       if (failed)
 252         fprintf (stderr, _("\
 253 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 254                  result->code_set_name);
 255     }
 256
 257   return result;
 258 }
 259
 260
 261 static struct charmap_t *
 262 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 263 {
 264   struct charmap_t *result;
 265   int state;
 266   enum token_t expected_tok = tok_error;
 267   const char *expected_str = NULL;
 268   char *from_name = NULL;
 269   char *to_name = NULL;
 270   enum token_t ellipsis = 0;
 271   int step = 1;
 272
 273   /* We don't want symbolic names in string to be translated.  */
 274   cmfile->translate_strings = 0;
 275
 276   /* Allocate room for result.  */
 277   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 278   memset (result, '\0', sizeof (struct charmap_t));
 279   /* The default DEFAULT_WIDTH is 1.  */
 280   result->width_default = 1;
 281
 282 #define obstack_chunk_alloc malloc
 283 #define obstack_chunk_free free
 284   obstack_init (&result->mem_pool);
 285
 286   if (init_hash (&result->char_table, 256)
 287       || init_hash (&result->byte_table, 256))
 288     {
 289       free (result);
 290       return NULL;
 291     }
 292
 293   /* We use a state machine to describe the charmap description file
 294      format.  */
 295   state = 1;
 296   while (1)
 297     {
 298       /* What's on?  */
 299       struct token *now = lr_token (cmfile, NULL, NULL, verbose);
 300       enum token_t nowtok = now->tok;
 301       struct token *arg;
 302
 303       if (nowtok == tok_eof)
 304         break;
 305
 306       switch (state)
 307         {
 308         case 1:
 309           /* The beginning.  We expect the special declarations, EOL or
 310              `CHARMAP'.  */
 311           if (nowtok == tok_eol)
 312             /* Ignore empty lines.  */
 313             continue;
 314
 315           if (nowtok == tok_charmap)
 316             {
 317               from_name = NULL;
 318               to_name = NULL;
 319
 320               /* We have to set up the real work.  Fill in some
 321                  default values.  */
 322               if (result->mb_cur_max == 0)
 323                 result->mb_cur_max = 1;
 324               if (result->mb_cur_min == 0)
 325                 result->mb_cur_min = result->mb_cur_max;
 326               if (result->mb_cur_min > result->mb_cur_max)
 327                 {
 328                   if (!be_quiet)
 329                     error (0, 0, _("\
 330 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 331                            cmfile->fname);
 332
 333                   result->mb_cur_min = result->mb_cur_max;
 334                 }
 335
 336               lr_ignore_rest (cmfile, 1);
 337
 338               state = 2;
 339               continue;
 340             }
 341
 342           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 343               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 344               && nowtok != tok_comment_char && nowtok != tok_g0esc
 345               && nowtok != tok_g1esc && nowtok != tok_g2esc
 346               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 347               && nowtok != tok_include)
 348             {
 349               lr_error (cmfile, _("syntax error in prolog: %s"),
 350                         _("invalid definition"));
 351
 352               lr_ignore_rest (cmfile, 0);
 353               continue;
 354             }
 355
 356           /* We know that we need an argument.  */
 357           arg = lr_token (cmfile, NULL, NULL, verbose);
 358
 359           switch (nowtok)
 360             {
 361             case tok_code_set_name:
 362             case tok_repertoiremap:
 363               if (arg->tok != tok_ident && arg->tok != tok_string)
 364                 {
 365                 badarg:
 366                   lr_error (cmfile, _("syntax error in prolog: %s"),
 367                             _("bad argument"));
 368
 369                   lr_ignore_rest (cmfile, 0);
 370                   continue;
 371                 }
 372
 373               if (nowtok == tok_code_set_name)
 374                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 375                                                        arg->val.str.startmb,
 376                                                        arg->val.str.lenmb);
 377               else
 378                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 379                                                        arg->val.str.startmb,
 380                                                        arg->val.str.lenmb);
 381
 382               lr_ignore_rest (cmfile, 1);
 383               continue;
 384
 385             case tok_mb_cur_max:
 386             case tok_mb_cur_min:
 387               if (arg->tok != tok_number)
 388                 goto badarg;
 389
 390               if (verbose
 391                   && ((nowtok == tok_mb_cur_max
 392                        && result->mb_cur_max != 0)
 393                       || (nowtok == tok_mb_cur_max
 394                           && result->mb_cur_max != 0)))
 395                 lr_error (cmfile, _("duplicate definition of <%s>"),
 396                           nowtok == tok_mb_cur_min
 397                           ? "mb_cur_min" : "mb_cur_max");
 398
 399               if (arg->val.num < 1)
 400                 {
 401                   lr_error (cmfile,
 402                             _("value for <%s> must be 1 or greater"),
 403                             nowtok == tok_mb_cur_min
 404                             ? "mb_cur_min" : "mb_cur_max");
 405
 406                   lr_ignore_rest (cmfile, 0);
 407                   continue;
 408                 }
 409               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 410                    && (int) arg->val.num < result->mb_cur_min)
 411                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 412                       && (int) arg->val.num > result->mb_cur_max))
 413                 {
 414                   lr_error (cmfile, _("\
 415 value of <%s> must be greater or equal than the value of <%s>"),
 416                             "mb_cur_max", "mb_cur_min");
 417
 418                   lr_ignore_rest (cmfile, 0);
 419                   continue;
 420                 }
 421
 422               if (nowtok == tok_mb_cur_max)
 423                 result->mb_cur_max = arg->val.num;
 424               else
 425                 result->mb_cur_min = arg->val.num;
 426
 427               lr_ignore_rest (cmfile, 1);
 428               continue;
 429
 430             case tok_escape_char:
 431             case tok_comment_char:
 432               if (arg->tok != tok_ident)
 433                 goto badarg;
 434
 435               if (arg->val.str.lenmb != 1)
 436                 {
 437                   lr_error (cmfile, _("\
 438 argument to <%s> must be a single character"),
 439                             nowtok == tok_escape_char ? "escape_char"
 440                                                       : "comment_char");
 441
 442                   lr_ignore_rest (cmfile, 0);
 443                   continue;
 444                 }
 445
 446               if (nowtok == tok_escape_char)
 447                 cmfile->escape_char = *arg->val.str.startmb;
 448               else
 449                 cmfile->comment_char = *arg->val.str.startmb;
 450
 451               lr_ignore_rest (cmfile, 1);
 452               continue;
 453
 454             case tok_g0esc:
 455             case tok_g1esc:
 456             case tok_g2esc:
 457             case tok_g3esc:
 458             case tok_escseq:
 459               lr_ignore_rest (cmfile, 0); /* XXX */
 460               continue;
 461
 462             case tok_include:
 463               lr_error (cmfile, _("\
 464 character sets with locking states are not supported"));
 465               exit (4);
 466
 467             default:
 468               /* Cannot happen.  */
 469               assert (! "Should not happen");
 470             }
 471           break;
 472
 473         case 2:
 474           /* We have seen `CHARMAP' and now are in the body.  Each line
 475              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 476           if (nowtok == tok_eol)
 477             /* Ignore empty lines.  */
 478             continue;
 479
 480           if (nowtok == tok_end)
 481             {
 482               expected_tok = tok_charmap;
 483               expected_str = "CHARMAP";
 484               state = 90;
 485               continue;
 486             }
 487
 488           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 489             {
 490               lr_error (cmfile, _("syntax error in %s definition: %s"),
 491                         "CHARMAP", _("no symbolic name given"));
 492
 493               lr_ignore_rest (cmfile, 0);
 494               continue;
 495             }
 496
 497           /* If the previous line was not completely correct free the
 498              used memory.  */
 499           if (from_name != NULL)
 500             obstack_free (&result->mem_pool, from_name);
 501
 502           if (nowtok == tok_bsymbol)
 503             from_name = (char *) obstack_copy0 (&result->mem_pool,
 504                                                 now->val.str.startmb,
 505                                                 now->val.str.lenmb);
 506           else
 507             {
 508               obstack_printf (&result->mem_pool, "U%08X",
 509                               cmfile->token.val.ucs4);
 510               obstack_1grow (&result->mem_pool, '\0');
 511               from_name = (char *) obstack_finish (&result->mem_pool);
 512             }
 513           to_name = NULL;
 514
 515           state = 3;
 516           continue;
 517
 518         case 3:
 519           /* We have two possibilities: We can see an ellipsis or an
 520              encoding value.  */
 521           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 522               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 523               || nowtok == tok_ellipsis2_2)
 524             {
 525               ellipsis = nowtok;
 526               if (nowtok == tok_ellipsis4_2)
 527                 {
 528                   step = 2;
 529                   nowtok = tok_ellipsis4;
 530                 }
 531               else if (nowtok == tok_ellipsis2_2)
 532                 {
 533                   step = 2;
 534                   nowtok = tok_ellipsis2;
 535                 }
 536               state = 4;
 537               continue;
 538             }
 539           /* FALLTHROUGH */
 540
 541         case 5:
 542           if (nowtok != tok_charcode)
 543             {
 544               lr_error (cmfile, _("syntax error in %s definition: %s"),
 545                         "CHARMAP", _("invalid encoding given"));
 546
 547               lr_ignore_rest (cmfile, 0);
 548
 549               state = 2;
 550               continue;
 551             }
 552
 553           if (now->val.charcode.nbytes < result->mb_cur_min)
 554             lr_error (cmfile, _("too few bytes in character encoding"));
 555           else if (now->val.charcode.nbytes > result->mb_cur_max)
 556             lr_error (cmfile, _("too many bytes in character encoding"));
 557           else
 558             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 559                               now->val.charcode.bytes, from_name, to_name,
 560                               ellipsis != tok_ellipsis2, step);
 561
 562           /* Ignore trailing comment silently.  */
 563           lr_ignore_rest (cmfile, 0);
 564
 565           from_name = NULL;
 566           to_name = NULL;
 567           ellipsis = tok_none;
 568           step = 1;
 569
 570           state = 2;
 571           continue;
 572
 573         case 4:
 574           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 575             {
 576               lr_error (cmfile, _("syntax error in %s definition: %s"),
 577                         "CHARMAP",
 578                         _("no symbolic name given for end of range"));
 579
 580               lr_ignore_rest (cmfile, 0);
 581               continue;
 582             }
 583
 584           /* Copy the to-name in a safe place.  */
 585           if (nowtok == tok_bsymbol)
 586             to_name = (char *) obstack_copy0 (&result->mem_pool,
 587                                               cmfile->token.val.str.startmb,
 588                                               cmfile->token.val.str.lenmb);
 589           else
 590             {
 591               obstack_printf (&result->mem_pool, "U%08X",
 592                               cmfile->token.val.ucs4);
 593               obstack_1grow (&result->mem_pool, '\0');
 594               to_name = (char *) obstack_finish (&result->mem_pool);
 595             }
 596
 597           state = 5;
 598           continue;
 599
 600         case 90:
 601           if (nowtok != expected_tok)
 602             lr_error (cmfile, _("\
 603 `%1$s' definition does not end with `END %1$s'"), expected_str);
 604
 605           lr_ignore_rest (cmfile, nowtok == expected_tok);
 606           state = 91;
 607           continue;
 608
 609         case 91:
 610           /* Waiting for WIDTH... */
 611           if (nowtok == tok_eol)
 612             /* Ignore empty lines.  */
 613             continue;
 614
 615           if (nowtok == tok_width_default)
 616             {
 617               state = 92;
 618               continue;
 619             }
 620
 621           if (nowtok == tok_width)
 622             {
 623               lr_ignore_rest (cmfile, 1);
 624               state = 93;
 625               continue;
 626             }
 627
 628           if (nowtok == tok_width_variable)
 629             {
 630               lr_ignore_rest (cmfile, 1);
 631               state = 98;
 632               continue;
 633             }
 634
 635           lr_error (cmfile, _("\
 636 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 637
 638           lr_ignore_rest (cmfile, 0);
 639           continue;
 640
 641         case 92:
 642           if (nowtok != tok_number)
 643             lr_error (cmfile, _("value for %s must be an integer"),
 644                       "WIDTH_DEFAULT");
 645           else
 646             result->width_default = now->val.num;
 647
 648           lr_ignore_rest (cmfile, nowtok == tok_number);
 649
 650           state = 91;
 651           continue;
 652
 653         case 93:
 654           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 655              "%s...%s %d\n".  */
 656           if (nowtok == tok_eol)
 657             /* ignore empty lines.  */
 658             continue;
 659
 660           if (nowtok == tok_end)
 661             {
 662               expected_tok = tok_width;
 663               expected_str = "WIDTH";
 664               state = 90;
 665               continue;
 666             }
 667
 668           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 669             {
 670               lr_error (cmfile, _("syntax error in %s definition: %s"),
 671                         "WIDTH", _("no symbolic name given"));
 672
 673               lr_ignore_rest (cmfile, 0);
 674               continue;
 675             }
 676
 677           if (from_name != NULL)
 678             obstack_free (&result->mem_pool, from_name);
 679
 680           if (nowtok == tok_bsymbol)
 681             from_name = (char *) obstack_copy0 (&result->mem_pool,
 682                                                 now->val.str.startmb,
 683                                                 now->val.str.lenmb);
 684           else
 685             {
 686               obstack_printf (&result->mem_pool, "U%08X",
 687                               cmfile->token.val.ucs4);
 688               obstack_1grow (&result->mem_pool, '\0');
 689               from_name = (char *) obstack_finish (&result->mem_pool);
 690             }
 691
 692           to_name = NULL;
 693
 694           state = 94;
 695           continue;
 696
 697         case 94:
 698           if (nowtok == tok_ellipsis3)
 699             {
 700               state = 95;
 701               continue;
 702             }
 703
 704         case 96:
 705           if (nowtok != tok_number)
 706             lr_error (cmfile, _("value for %s must be an integer"),
 707                       "WIDTH");
 708           else
 709             {
 710               /* Store width for chars.  */
 711               new_width (cmfile, result, from_name, to_name, now->val.num);
 712
 713               from_name = NULL;
 714               to_name = NULL;
 715             }
 716
 717           lr_ignore_rest (cmfile, nowtok == tok_number);
 718
 719           state = 93;
 720           continue;
 721
 722         case 95:
 723           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 724             {
 725               lr_error (cmfile, _("syntax error in %s definition: %s"),
 726                         "WIDTH", _("no symbolic name given for end of range"));
 727
 728               lr_ignore_rest (cmfile, 0);
 729
 730               state = 93;
 731               continue;
 732             }
 733
 734           if (nowtok == tok_bsymbol)
 735             to_name = (char *) obstack_copy0 (&result->mem_pool,
 736                                               now->val.str.startmb,
 737                                               now->val.str.lenmb);
 738           else
 739             {
 740               obstack_printf (&result->mem_pool, "U%08X",
 741                               cmfile->token.val.ucs4);
 742               obstack_1grow (&result->mem_pool, '\0');
 743               to_name = (char *) obstack_finish (&result->mem_pool);
 744             }
 745
 746           state = 96;
 747           continue;
 748
 749         case 98:
 750           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 751              "%s\n" or "%s...%s\n".  */
 752           if (nowtok == tok_eol)
 753             /* ignore empty lines.  */
 754             continue;
 755
 756           if (nowtok == tok_end)
 757             {
 758               expected_tok = tok_width_variable;
 759               expected_str = "WIDTH_VARIABLE";
 760               state = 90;
 761               continue;
 762             }
 763
 764           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 765             {
 766               lr_error (cmfile, _("syntax error in %s definition: %s"),
 767                         "WIDTH_VARIABLE", _("no symbolic name given"));
 768
 769               lr_ignore_rest (cmfile, 0);
 770
 771               continue;
 772             }
 773
 774           if (from_name != NULL)
 775             obstack_free (&result->mem_pool, from_name);
 776
 777           if (nowtok == tok_bsymbol)
 778             from_name = (char *) obstack_copy0 (&result->mem_pool,
 779                                                 now->val.str.startmb,
 780                                                 now->val.str.lenmb);
 781           else
 782             {
 783               obstack_printf (&result->mem_pool, "U%08X",
 784                               cmfile->token.val.ucs4);
 785               obstack_1grow (&result->mem_pool, '\0');
 786               from_name = (char *) obstack_finish (&result->mem_pool);
 787             }
 788           to_name = NULL;
 789
 790           state = 99;
 791           continue;
 792
 793         case 99:
 794           if (nowtok == tok_ellipsis3)
 795             state = 100;
 796
 797           /* Store info.  */
 798           from_name = NULL;
 799
 800           /* Warn */
 801           state = 98;
 802           continue;
 803
 804         case 100:
 805           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 806             {
 807               lr_error (cmfile, _("syntax error in %s definition: %s"),
 808                         "WIDTH_VARIABLE",
 809                         _("no symbolic name given for end of range"));
 810               lr_ignore_rest (cmfile, 0);
 811               continue;
 812             }
 813
 814           if (nowtok == tok_bsymbol)
 815             to_name = (char *) obstack_copy0 (&result->mem_pool,
 816                                               now->val.str.startmb,
 817                                               now->val.str.lenmb);
 818           else
 819             {
 820               obstack_printf (&result->mem_pool, "U%08X",
 821                               cmfile->token.val.ucs4);
 822               obstack_1grow (&result->mem_pool, '\0');
 823               to_name = (char *) obstack_finish (&result->mem_pool);
 824             }
 825
 826           /* XXX Enter value into table.  */
 827
 828           lr_ignore_rest (cmfile, 1);
 829
 830           state = 98;
 831           continue;
 832
 833         default:
 834           error (5, 0, _("%s: error in state machine"), __FILE__);
 835           /* NOTREACHED */
 836         }
 837       break;
 838     }
 839
 840   if (state != 91 && !be_quiet)
 841     error (0, 0, _("%s: premature end of file"), cmfile->fname);
 842
 843   lr_close (cmfile);
 844
 845   return result;
 846 }
 847
 848
 849 static void
 850 new_width (struct linereader *cmfile, struct charmap_t *result,
 851            const char *from, const char *to, unsigned long int width)
 852 {
 853   struct charseq *from_val;
 854   struct charseq *to_val;
 855
 856   from_val = charmap_find_value (result, from, strlen (from));
 857   if (from_val == NULL)
 858     {
 859       lr_error (cmfile, _("unknown character `%s'"), from);
 860       return;
 861     }
 862
 863   if (to == NULL)
 864     to_val = from_val;
 865   else
 866     {
 867       to_val = charmap_find_value (result, to, strlen (to));
 868       if (to_val == NULL)
 869         {
 870           lr_error (cmfile, _("unknown character `%s'"), to);
 871           return;
 872         }
 873     }
 874
 875   if (result->nwidth_rules >= result->nwidth_rules_max)
 876     {
 877       size_t new_size = result->nwidth_rules + 32;
 878       struct width_rule *new_rules =
 879         (struct width_rule *) obstack_alloc (&result->mem_pool,
 880                                              (new_size
 881                                               * sizeof (struct width_rule)));
 882
 883       memcpy (new_rules, result->width_rules,
 884               result->nwidth_rules_max * sizeof (struct width_rule));
 885
 886       result->width_rules = new_rules;
 887       result->nwidth_rules_max = new_size;
 888     }
 889
 890   result->width_rules[result->nwidth_rules].from = from_val;
 891   result->width_rules[result->nwidth_rules].to = to_val;
 892   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 893   ++result->nwidth_rules;
 894 }
 895
 896
 897 struct charseq *
 898 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 899 {
 900   void *result;
 901
 902   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 903           < 0 ? NULL : (struct charseq *) result);
 904 }
 905
 906
 907 static void
 908 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 909                   int nbytes, char *bytes, const char *from, const char *to,
 910                   int decimal_ellipsis, int step)
 911 {
 912   hash_table *ht = &cm->char_table;
 913   hash_table *bt = &cm->byte_table;
 914   struct obstack *ob = &cm->mem_pool;
 915   char *from_end;
 916   char *to_end;
 917   const char *cp;
 918   int prefix_len, len1, len2;
 919   unsigned int from_nr, to_nr, cnt;
 920   struct charseq *newp;
 921
 922   len1 = strlen (from);
 923
 924   if (to == NULL)
 925     {
 926       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 927       newp->nbytes = nbytes;
 928       memcpy (newp->bytes, bytes, nbytes);
 929       newp->name = from;
 930
 931       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 932       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 933         {
 934           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 935              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 936              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 937              this character and we don't have to consult the repertoire
 938              map.
 939
 940              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 941              and xxxxxxxx also give the code point in UCS4 but this must
 942              be in the private, i.e., unassigned, area.  This should be
 943              used for characters which do not (yet) have an equivalent
 944              in ISO 10646 and Unicode.  */
 945           char *endp;
 946
 947           errno = 0;
 948           newp->ucs4 = strtoul (from + 1, &endp, 16);
 949           if (endp - from != len1
 950               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
 951               || newp->ucs4 >= 0x80000000)
 952             /* This wasn't successful.  Signal this name cannot be a
 953                correct UCS value.  */
 954             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 955         }
 956
 957       insert_entry (ht, from, len1, newp);
 958       insert_entry (bt, newp->bytes, nbytes, newp);
 959       /* Please note that it isn't a bug if a symbol is defined more
 960          than once.  All later definitions are simply discarded.  */
 961       return;
 962     }
 963
 964   /* We have a range: the names must have names with equal prefixes
 965      and an equal number of digits, where the second number is greater
 966      or equal than the first.  */
 967   len2 = strlen (to);
 968
 969   if (len1 != len2)
 970     {
 971     illegal_range:
 972       lr_error (lr, _("invalid names for character range"));
 973       return;
 974     }
 975
 976   cp = &from[len1 - 1];
 977   if (decimal_ellipsis)
 978     while (isdigit (*cp) && cp >= from)
 979       --cp;
 980   else
 981     while (isxdigit (*cp) && cp >= from)
 982       {
 983         if (!isdigit (*cp) && !isupper (*cp))
 984           lr_error (lr, _("\
 985 hexadecimal range format should use only capital characters"));
 986         --cp;
 987       }
 988
 989   prefix_len = (cp - from) + 1;
 990
 991   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
 992     goto illegal_range;
 993
 994   errno = 0;
 995   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
 996   if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
 997       || ((to_nr = strtoul (&to[prefix_len], &to_end,
 998                             decimal_ellipsis ? 10 : 16)) == ULONG_MAX
 999           && errno == ERANGE)
1000       || *to_end != '\0')
1001     {
1002       lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1003       return;
1004     }
1005
1006   if (from_nr > to_nr)
1007     {
1008       lr_error (lr, _("upper limit in range is not higher then lower limit"));
1009       return;
1010     }
1011
1012   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1013     {
1014       char *name_end;
1015       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1016                       prefix_len, from, len1 - prefix_len, cnt);
1017       obstack_1grow (ob, '\0');
1018       name_end = obstack_finish (ob);
1019
1020       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1021       newp->nbytes = nbytes;
1022       memcpy (newp->bytes, bytes, nbytes);
1023       newp->name = name_end;
1024
1025       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1026       if ((name_end[0] == 'U' || name_end[0] == 'P')
1027           && (len1 == 5 || len1 == 9))
1028         {
1029           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1030              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1031              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1032              this character and we don't have to consult the repertoire
1033              map.
1034
1035              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1036              and xxxxxxxx also give the code point in UCS4 but this must
1037              be in the private, i.e., unassigned, area.  This should be
1038              used for characters which do not (yet) have an equivalent
1039              in ISO 10646 and Unicode.  */
1040           char *endp;
1041
1042           errno = 0;
1043           newp->ucs4 = strtoul (name_end, &endp, 16);
1044           if (endp - name_end != len1
1045               || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
1046               || newp->ucs4 >= 0x80000000)
1047             /* This wasn't successful.  Signal this name cannot be a
1048                correct UCS value.  */
1049             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1050         }
1051
1052       insert_entry (ht, name_end, len1, newp);
1053       insert_entry (bt, newp->bytes, nbytes, newp);
1054       /* Please note we don't examine the return value since it is no error
1055          if we have two definitions for a symbol.  */
1056
1057       /* Increment the value in the byte sequence.  */
1058       if (++bytes[nbytes - 1] == '\0')
1059         {
1060           int b = nbytes - 2;
1061
1062           do
1063             if (b < 0)
1064               {
1065                 lr_error (lr,
1066                           _("resulting bytes for range not representable."));
1067                 return;
1068               }
1069           while (++bytes[b--] == 0);
1070         }
1071     }
1072 }
1073
1074
1075 struct charseq *
1076 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1077                      size_t nbytes)
1078 {
1079   void *result;
1080
1081   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1082           < 0 ? NULL : (struct charseq *) result);
1083 }