locale/programs/charmap.c

   1 /* Copyright (C) 1996-2017 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published
   7    by the Free Software Foundation; version 2 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <libintl.h>
  25 #include <limits.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <stdint.h>
  30
  31 #include "localedef.h"
  32 #include "linereader.h"
  33 #include "charmap.h"
  34 #include "charmap-dir.h"
  35
  36 #include <assert.h>
  37
  38
  39 /* Define the lookup function.  */
  40 #include "charmap-kw.h"
  41
  42
  43 /* Prototypes for local functions.  */
     /* parse_charmap: run the charmap state machine over CMFILE, return the
        resulting charmap and close CMFILE when done.  */
  44 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  45                                         int verbose, int be_quiet);
     /* new_width: append a width rule covering the characters named FROM
        through TO (TO may be NULL for a single character) to RESULT.  */
  46 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  47                        const char *from, const char *to,
  48                        unsigned long int width);
     /* charmap_new_char: called by parse_charmap for every encoding line with
        the byte sequence, the symbolic name(s), whether an ellipsis is a
        decimal one, and the ellipsis step.
        NOTE(review): the definition lies outside this excerpt -- confirm
        details against the function body.  */
  49 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  50                               size_t nbytes, unsigned char *bytes,
  51                               const char *from, const char *to,
  52                               int decimal_ellipsis, int step);
  53
  54
     /* Set to true by charmap_read when the loaded charmap fails the
        basic-character-set (ASCII compatibility) test.  */
  55 bool enc_not_ascii_compatible;
  56
  57
     /* Only defined when NEED_NULL_POINTER is set; no use is visible in this
        part of the file (presumably consumed by the included keyword
        lookup code -- TODO confirm).  */
  58 #ifdef NEED_NULL_POINTER
  59 static const char *null_pointer;
  60 #endif
  61
  62 static struct linereader *
  63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  64 {
  65   FILE *fp;
  66
  67   fp = charmap_open (directory, name);
  68   if (fp == NULL)
  69     return NULL;
  70   else
  71     {
  72       size_t dlen = strlen (directory);
  73       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  74       size_t nlen = strlen (name);
  75       char *pathname;
  76       char *p;
  77
  78       pathname = alloca (dlen + add_slash + nlen + 1);
  79       p = stpcpy (pathname, directory);
  80       if (add_slash)
  81         *p++ = '/';
  82       stpcpy (p, name);
  83
  84       return lr_create (fp, pathname, hf);
  85     }
  86 }
  87
  88 struct charmap_t *
  89 charmap_read (const char *filename, int verbose, int error_not_found,
  90               int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   const size_t pathlen = strlen (i18npath);
 110                   char i18npathbuf[pathlen + 1];
 111                   char path[pathlen + sizeof ("/charmaps")];
 112                   char *next;
 113                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 114
 115                   while (cmfile == NULL
 116                          && (next = strsep (&i18npath, ":")) != NULL)
 117                     {
 118                       stpcpy (stpcpy (path, next), "/charmaps");
 119                       cmfile = cmlr_open (path, filename, charmap_hash);
 120
 121                       if (cmfile == NULL)
 122                         /* Try without the "/charmaps" part.  */
 123                         cmfile = cmlr_open (next, filename, charmap_hash);
 124                     }
 125                 }
 126
 127               if (cmfile == NULL)
 128                 /* Try the default directory.  */
 129                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 130             }
 131         }
 132
 133       if (cmfile != NULL)
 134         result = parse_charmap (cmfile, verbose, be_quiet);
 135
 136       if (result == NULL && error_not_found)
 137         record_error (0, errno,
 138                       _("character map file `%s' not found"),
 139                       filename);
 140     }
 141
 142   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 143     {
 144       /* OK, one more try.  We also accept the names given to the
 145          character sets in the files.  Sometimes they differ from the
 146          file name.  */
 147       CHARMAP_DIR *dir;
 148
 149       dir = charmap_opendir (CHARMAP_PATH);
 150       if (dir != NULL)
 151         {
 152           const char *dirent;
 153
 154           while ((dirent = charmap_readdir (dir)) != NULL)
 155             {
 156               char **aliases;
 157               char **p;
 158               int found;
 159
 160               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 161               found = 0;
 162               for (p = aliases; *p; p++)
 163                 if (strcasecmp (*p, filename) == 0)
 164                   {
 165                     found = 1;
 166                     break;
 167                   }
 168               charmap_free_aliases (aliases);
 169
 170               if (found)
 171                 {
 172                   struct linereader *cmfile;
 173
 174                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 175                   if (cmfile != NULL)
 176                     result = parse_charmap (cmfile, verbose, be_quiet);
 177
 178                   break;
 179                 }
 180             }
 181
 182           charmap_closedir (dir);
 183         }
 184     }
 185
 186   if (result == NULL && DEFAULT_CHARMAP != NULL)
 187     {
 188       struct linereader *cmfile;
 189
 190       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 191       if (cmfile != NULL)
 192         result = parse_charmap (cmfile, verbose, be_quiet);
 193
 194       if (result == NULL)
 195         record_error (4, errno,
 196                       _("default character map file `%s' not found"),
 197                       DEFAULT_CHARMAP);
 198     }
 199
 200   if (result != NULL && result->code_set_name == NULL)
 201     /* The input file does not specify a code set name.  This
 202        shouldn't happen but we should cope with it.  */
 203     result->code_set_name = basename (filename);
 204
 205   /* Test of ASCII compatibility of locale encoding.
 206
 207      Verify that the encoding to be used in a locale is ASCII compatible,
 208      at least for the graphic characters, excluding the control characters,
 209      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 210
 211      ISO C 99 section 7.17.(2) (about wchar_t):
 212        the null character shall have the code value zero and each member of
 213        the basic character set shall have a code value equal to its value
 214        when used as the lone character in an integer character constant.
 215      ISO C 99 section 5.2.1.(3):
 216        Both the basic source and basic execution character sets shall have
 217        the following members: the 26 uppercase letters of the Latin alphabet
 218             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 219        the 26 lowercase letters of the Latin alphabet
 220             a b c d e f g h i j k l m n o p q r s t u v w x y z
 221        the 10 decimal digits
 222             0 1 2 3 4 5 6 7 8 9
 223        the following 29 graphic characters
 224             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 225        the space character, and control characters representing horizontal
 226        tab, vertical tab, and form feed.
 227
 228      Therefore, for all members of the "basic character set", the 'char' code
 229      must have the same value as the 'wchar_t' code, which in glibc is the
 230      same as the Unicode code, which for all of the enumerated characters
 231      is identical to the ASCII code. */
 232   if (result != NULL && use_default)
 233     {
 234       static const char basic_charset[] =
 235         {
 236           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 237           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 238           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 239           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 240           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 241           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 242           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 243           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 244         };
 245       int failed = 0;
 246       const char *p = basic_charset;
 247
 248       do
 249         {
 250           struct charseq *seq = charmap_find_symbol (result, p, 1);
 251
 252           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 253             failed = 1;
 254         }
 255       while (*p++ != '\0');
 256
 257       if (failed)
 258         {
 259           record_warning (_("\
 260 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 261                           result->code_set_name);
 262           enc_not_ascii_compatible = true;
 263         }
 264     }
 265
 266   return result;
 267 }
 268
 269
 270 static struct charmap_t *
 271 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 272 {
 273   struct charmap_t *result;
 274   int state;
 275   enum token_t expected_tok = tok_error;
 276   const char *expected_str = NULL;
 277   char *from_name = NULL;
 278   char *to_name = NULL;
 279   enum token_t ellipsis = 0;
 280   int step = 1;
 281
 282   /* We don't want symbolic names in string to be translated.  */
 283   cmfile->translate_strings = 0;
 284
 285   /* Allocate room for result.  */
 286   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 287   memset (result, '\0', sizeof (struct charmap_t));
 288   /* The default DEFAULT_WIDTH is 1.  */
 289   result->width_default = 1;
 290
 291 #define obstack_chunk_alloc malloc
 292 #define obstack_chunk_free free
 293   obstack_init (&result->mem_pool);
 294
 295   if (init_hash (&result->char_table, 256)
 296       || init_hash (&result->byte_table, 256))
 297     {
 298       free (result);
 299       return NULL;
 300     }
 301
 302   /* We use a state machine to describe the charmap description file
 303      format.  */
 304   state = 1;
 305   while (1)
 306     {
 307       /* What's on?  */
 308       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 309       enum token_t nowtok = now->tok;
 310       struct token *arg;
 311
 312       if (nowtok == tok_eof)
 313         break;
 314
 315       switch (state)
 316         {
 317         case 1:
 318           /* The beginning.  We expect the special declarations, EOL or
 319              `CHARMAP'.  */
 320           if (nowtok == tok_eol)
 321             /* Ignore empty lines.  */
 322             continue;
 323
 324           if (nowtok == tok_charmap)
 325             {
 326               from_name = NULL;
 327               to_name = NULL;
 328
 329               /* We have to set up the real work.  Fill in some
 330                  default values.  */
 331               if (result->mb_cur_max == 0)
 332                 result->mb_cur_max = 1;
 333               if (result->mb_cur_min == 0)
 334                 result->mb_cur_min = result->mb_cur_max;
 335               if (result->mb_cur_min > result->mb_cur_max)
 336                 {
 337                   record_error (0, 0, _("\
 338 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 339                                 cmfile->fname);
 340
 341                   result->mb_cur_min = result->mb_cur_max;
 342                 }
 343
 344               lr_ignore_rest (cmfile, 1);
 345
 346               state = 2;
 347               continue;
 348             }
 349
 350           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 351               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 352               && nowtok != tok_comment_char && nowtok != tok_g0esc
 353               && nowtok != tok_g1esc && nowtok != tok_g2esc
 354               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 355               && nowtok != tok_include)
 356             {
 357               lr_error (cmfile, _("syntax error in prolog: %s"),
 358                         _("invalid definition"));
 359
 360               lr_ignore_rest (cmfile, 0);
 361               continue;
 362             }
 363
 364           /* We know that we need an argument.  */
 365           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 366
 367           switch (nowtok)
 368             {
 369             case tok_code_set_name:
 370             case tok_repertoiremap:
 371               if (arg->tok != tok_ident && arg->tok != tok_string)
 372                 {
 373                 badarg:
 374                   lr_error (cmfile, _("syntax error in prolog: %s"),
 375                             _("bad argument"));
 376
 377                   lr_ignore_rest (cmfile, 0);
 378                   continue;
 379                 }
 380
 381               if (nowtok == tok_code_set_name)
 382                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 383                                                        arg->val.str.startmb,
 384                                                        arg->val.str.lenmb);
 385               else
 386                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 387                                                        arg->val.str.startmb,
 388                                                        arg->val.str.lenmb);
 389
 390               lr_ignore_rest (cmfile, 1);
 391               continue;
 392
 393             case tok_mb_cur_max:
 394             case tok_mb_cur_min:
 395               if (arg->tok != tok_number)
 396                 goto badarg;
 397
 398               if ((nowtok == tok_mb_cur_max
 399                        && result->mb_cur_max != 0)
 400                       || (nowtok == tok_mb_cur_max
 401                           && result->mb_cur_max != 0))
 402                 lr_error (cmfile, _("duplicate definition of <%s>"),
 403                           nowtok == tok_mb_cur_min
 404                           ? "mb_cur_min" : "mb_cur_max");
 405
 406               if (arg->val.num < 1)
 407                 {
 408                   lr_error (cmfile,
 409                             _("value for <%s> must be 1 or greater"),
 410                             nowtok == tok_mb_cur_min
 411                             ? "mb_cur_min" : "mb_cur_max");
 412
 413                   lr_ignore_rest (cmfile, 0);
 414                   continue;
 415                 }
 416               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 417                    && (int) arg->val.num < result->mb_cur_min)
 418                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 419                       && (int) arg->val.num > result->mb_cur_max))
 420                 {
 421                   lr_error (cmfile, _("\
 422 value of <%s> must be greater or equal than the value of <%s>"),
 423                             "mb_cur_max", "mb_cur_min");
 424
 425                   lr_ignore_rest (cmfile, 0);
 426                   continue;
 427                 }
 428
 429               if (nowtok == tok_mb_cur_max)
 430                 result->mb_cur_max = arg->val.num;
 431               else
 432                 result->mb_cur_min = arg->val.num;
 433
 434               lr_ignore_rest (cmfile, 1);
 435               continue;
 436
 437             case tok_escape_char:
 438             case tok_comment_char:
 439               if (arg->tok != tok_ident)
 440                 goto badarg;
 441
 442               if (arg->val.str.lenmb != 1)
 443                 {
 444                   lr_error (cmfile, _("\
 445 argument to <%s> must be a single character"),
 446                             nowtok == tok_escape_char ? "escape_char"
 447                                                       : "comment_char");
 448
 449                   lr_ignore_rest (cmfile, 0);
 450                   continue;
 451                 }
 452
 453               if (nowtok == tok_escape_char)
 454                 cmfile->escape_char = *arg->val.str.startmb;
 455               else
 456                 cmfile->comment_char = *arg->val.str.startmb;
 457
 458               lr_ignore_rest (cmfile, 1);
 459               continue;
 460
 461             case tok_g0esc:
 462             case tok_g1esc:
 463             case tok_g2esc:
 464             case tok_g3esc:
 465             case tok_escseq:
 466               lr_ignore_rest (cmfile, 0); /* XXX */
 467               continue;
 468
 469             case tok_include:
 470               lr_error (cmfile, _("\
 471 character sets with locking states are not supported"));
 472               exit (4);
 473
 474             default:
 475               /* Cannot happen.  */
 476               assert (! "Should not happen");
 477             }
 478           break;
 479
 480         case 2:
 481           /* We have seen `CHARMAP' and now are in the body.  Each line
 482              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 483           if (nowtok == tok_eol)
 484             /* Ignore empty lines.  */
 485             continue;
 486
 487           if (nowtok == tok_end)
 488             {
 489               expected_tok = tok_charmap;
 490               expected_str = "CHARMAP";
 491               state = 90;
 492               continue;
 493             }
 494
 495           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 496             {
 497               lr_error (cmfile, _("syntax error in %s definition: %s"),
 498                         "CHARMAP", _("no symbolic name given"));
 499
 500               lr_ignore_rest (cmfile, 0);
 501               continue;
 502             }
 503
 504           /* If the previous line was not completely correct free the
 505              used memory.  */
 506           if (from_name != NULL)
 507             obstack_free (&result->mem_pool, from_name);
 508
 509           if (nowtok == tok_bsymbol)
 510             from_name = (char *) obstack_copy0 (&result->mem_pool,
 511                                                 now->val.str.startmb,
 512                                                 now->val.str.lenmb);
 513           else
 514             {
 515               obstack_printf (&result->mem_pool, "U%08X",
 516                               cmfile->token.val.ucs4);
 517               obstack_1grow (&result->mem_pool, '\0');
 518               from_name = (char *) obstack_finish (&result->mem_pool);
 519             }
 520           to_name = NULL;
 521
 522           state = 3;
 523           continue;
 524
 525         case 3:
 526           /* We have two possibilities: We can see an ellipsis or an
 527              encoding value.  */
 528           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 529               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 530               || nowtok == tok_ellipsis2_2)
 531             {
 532               ellipsis = nowtok;
 533               if (nowtok == tok_ellipsis4_2)
 534                 {
 535                   step = 2;
 536                   nowtok = tok_ellipsis4;
 537                 }
 538               else if (nowtok == tok_ellipsis2_2)
 539                 {
 540                   step = 2;
 541                   nowtok = tok_ellipsis2;
 542                 }
 543               state = 4;
 544               continue;
 545             }
 546           /* FALLTHROUGH */
 547
 548         case 5:
 549           if (nowtok != tok_charcode)
 550             {
 551               lr_error (cmfile, _("syntax error in %s definition: %s"),
 552                         "CHARMAP", _("invalid encoding given"));
 553
 554               lr_ignore_rest (cmfile, 0);
 555
 556               state = 2;
 557               continue;
 558             }
 559
 560           if (now->val.charcode.nbytes < result->mb_cur_min)
 561             lr_error (cmfile, _("too few bytes in character encoding"));
 562           else if (now->val.charcode.nbytes > result->mb_cur_max)
 563             lr_error (cmfile, _("too many bytes in character encoding"));
 564           else
 565             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 566                               now->val.charcode.bytes, from_name, to_name,
 567                               ellipsis != tok_ellipsis2, step);
 568
 569           /* Ignore trailing comment silently.  */
 570           lr_ignore_rest (cmfile, 0);
 571
 572           from_name = NULL;
 573           to_name = NULL;
 574           ellipsis = tok_none;
 575           step = 1;
 576
 577           state = 2;
 578           continue;
 579
 580         case 4:
 581           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 582             {
 583               lr_error (cmfile, _("syntax error in %s definition: %s"),
 584                         "CHARMAP",
 585                         _("no symbolic name given for end of range"));
 586
 587               lr_ignore_rest (cmfile, 0);
 588               continue;
 589             }
 590
 591           /* Copy the to-name in a safe place.  */
 592           if (nowtok == tok_bsymbol)
 593             to_name = (char *) obstack_copy0 (&result->mem_pool,
 594                                               cmfile->token.val.str.startmb,
 595                                               cmfile->token.val.str.lenmb);
 596           else
 597             {
 598               obstack_printf (&result->mem_pool, "U%08X",
 599                               cmfile->token.val.ucs4);
 600               obstack_1grow (&result->mem_pool, '\0');
 601               to_name = (char *) obstack_finish (&result->mem_pool);
 602             }
 603
 604           state = 5;
 605           continue;
 606
 607         case 90:
 608           if (nowtok != expected_tok)
 609             lr_error (cmfile, _("\
 610 %1$s: definition does not end with `END %1$s'"), expected_str);
 611
 612           lr_ignore_rest (cmfile, nowtok == expected_tok);
 613           state = 91;
 614           continue;
 615
 616         case 91:
 617           /* Waiting for WIDTH... */
 618           if (nowtok == tok_eol)
 619             /* Ignore empty lines.  */
 620             continue;
 621
 622           if (nowtok == tok_width_default)
 623             {
 624               state = 92;
 625               continue;
 626             }
 627
 628           if (nowtok == tok_width)
 629             {
 630               lr_ignore_rest (cmfile, 1);
 631               state = 93;
 632               continue;
 633             }
 634
 635           if (nowtok == tok_width_variable)
 636             {
 637               lr_ignore_rest (cmfile, 1);
 638               state = 98;
 639               continue;
 640             }
 641
 642           lr_error (cmfile, _("\
 643 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 644
 645           lr_ignore_rest (cmfile, 0);
 646           continue;
 647
 648         case 92:
 649           if (nowtok != tok_number)
 650             lr_error (cmfile, _("value for %s must be an integer"),
 651                       "WIDTH_DEFAULT");
 652           else
 653             result->width_default = now->val.num;
 654
 655           lr_ignore_rest (cmfile, nowtok == tok_number);
 656
 657           state = 91;
 658           continue;
 659
 660         case 93:
 661           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 662              "%s...%s %d\n".  */
 663           if (nowtok == tok_eol)
 664             /* ignore empty lines.  */
 665             continue;
 666
 667           if (nowtok == tok_end)
 668             {
 669               expected_tok = tok_width;
 670               expected_str = "WIDTH";
 671               state = 90;
 672               continue;
 673             }
 674
 675           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 676             {
 677               lr_error (cmfile, _("syntax error in %s definition: %s"),
 678                         "WIDTH", _("no symbolic name given"));
 679
 680               lr_ignore_rest (cmfile, 0);
 681               continue;
 682             }
 683
 684           if (from_name != NULL)
 685             obstack_free (&result->mem_pool, from_name);
 686
 687           if (nowtok == tok_bsymbol)
 688             from_name = (char *) obstack_copy0 (&result->mem_pool,
 689                                                 now->val.str.startmb,
 690                                                 now->val.str.lenmb);
 691           else
 692             {
 693               obstack_printf (&result->mem_pool, "U%08X",
 694                               cmfile->token.val.ucs4);
 695               obstack_1grow (&result->mem_pool, '\0');
 696               from_name = (char *) obstack_finish (&result->mem_pool);
 697             }
 698
 699           to_name = NULL;
 700
 701           state = 94;
 702           continue;
 703
 704         case 94:
 705           if (nowtok == tok_ellipsis3)
 706             {
 707               state = 95;
 708               continue;
 709             }
 710
 711         case 96:
 712           if (nowtok != tok_number)
 713             lr_error (cmfile, _("value for %s must be an integer"),
 714                       "WIDTH");
 715           else
 716             {
 717               /* Store width for chars.  */
 718               new_width (cmfile, result, from_name, to_name, now->val.num);
 719
 720               from_name = NULL;
 721               to_name = NULL;
 722             }
 723
 724           lr_ignore_rest (cmfile, nowtok == tok_number);
 725
 726           state = 93;
 727           continue;
 728
 729         case 95:
 730           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 731             {
 732               lr_error (cmfile, _("syntax error in %s definition: %s"),
 733                         "WIDTH", _("no symbolic name given for end of range"));
 734
 735               lr_ignore_rest (cmfile, 0);
 736
 737               state = 93;
 738               continue;
 739             }
 740
 741           if (nowtok == tok_bsymbol)
 742             to_name = (char *) obstack_copy0 (&result->mem_pool,
 743                                               now->val.str.startmb,
 744                                               now->val.str.lenmb);
 745           else
 746             {
 747               obstack_printf (&result->mem_pool, "U%08X",
 748                               cmfile->token.val.ucs4);
 749               obstack_1grow (&result->mem_pool, '\0');
 750               to_name = (char *) obstack_finish (&result->mem_pool);
 751             }
 752
 753           state = 96;
 754           continue;
 755
 756         case 98:
 757           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 758              "%s\n" or "%s...%s\n".  */
 759           if (nowtok == tok_eol)
 760             /* ignore empty lines.  */
 761             continue;
 762
 763           if (nowtok == tok_end)
 764             {
 765               expected_tok = tok_width_variable;
 766               expected_str = "WIDTH_VARIABLE";
 767               state = 90;
 768               continue;
 769             }
 770
 771           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 772             {
 773               lr_error (cmfile, _("syntax error in %s definition: %s"),
 774                         "WIDTH_VARIABLE", _("no symbolic name given"));
 775
 776               lr_ignore_rest (cmfile, 0);
 777
 778               continue;
 779             }
 780
 781           if (from_name != NULL)
 782             obstack_free (&result->mem_pool, from_name);
 783
 784           if (nowtok == tok_bsymbol)
 785             from_name = (char *) obstack_copy0 (&result->mem_pool,
 786                                                 now->val.str.startmb,
 787                                                 now->val.str.lenmb);
 788           else
 789             {
 790               obstack_printf (&result->mem_pool, "U%08X",
 791                               cmfile->token.val.ucs4);
 792               obstack_1grow (&result->mem_pool, '\0');
 793               from_name = (char *) obstack_finish (&result->mem_pool);
 794             }
 795           to_name = NULL;
 796
 797           state = 99;
 798           continue;
 799
 800         case 99:
 801           if (nowtok == tok_ellipsis3)
 802             state = 100;
 803
 804           /* Store info.  */
 805           from_name = NULL;
 806
 807           /* Warn */
 808           state = 98;
 809           continue;
 810
 811         case 100:
 812           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 813             {
 814               lr_error (cmfile, _("syntax error in %s definition: %s"),
 815                         "WIDTH_VARIABLE",
 816                         _("no symbolic name given for end of range"));
 817               lr_ignore_rest (cmfile, 0);
 818               continue;
 819             }
 820
 821           if (nowtok == tok_bsymbol)
 822             to_name = (char *) obstack_copy0 (&result->mem_pool,
 823                                               now->val.str.startmb,
 824                                               now->val.str.lenmb);
 825           else
 826             {
 827               obstack_printf (&result->mem_pool, "U%08X",
 828                               cmfile->token.val.ucs4);
 829               obstack_1grow (&result->mem_pool, '\0');
 830               to_name = (char *) obstack_finish (&result->mem_pool);
 831             }
 832
 833           /* XXX Enter value into table.  */
 834
 835           lr_ignore_rest (cmfile, 1);
 836
 837           state = 98;
 838           continue;
 839
 840         default:
 841           record_error (5, 0, _("%s: error in state machine"),
 842                         __FILE__);
 843           /* NOTREACHED */
 844         }
 845       break;
 846     }
 847
 848   if (state != 91)
 849     record_error (0, 0, _("%s: premature end of file"),
 850                   cmfile->fname);
 851
 852   lr_close (cmfile);
 853
 854   return result;
 855 }
 856
 857
 858 static void
 859 new_width (struct linereader *cmfile, struct charmap_t *result,
 860            const char *from, const char *to, unsigned long int width)
 861 {
 862   struct charseq *from_val;
 863   struct charseq *to_val;
 864
 865   from_val = charmap_find_value (result, from, strlen (from));
 866   if (from_val == NULL)
 867     {
 868       lr_error (cmfile, _("unknown character `%s'"), from);
 869       return;
 870     }
 871
 872   if (to == NULL)
 873     to_val = from_val;
 874   else
 875     {
 876       to_val = charmap_find_value (result, to, strlen (to));
 877       if (to_val == NULL)
 878         {
 879           lr_error (cmfile, _("unknown character `%s'"), to);
 880           return;
 881         }
 882
 883       /* Make sure the number of bytes for the end points of the range
 884          is correct.  */
 885       if (from_val->nbytes != to_val->nbytes)
 886         {
 887           lr_error (cmfile, _("\
 888 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 889                     from_val->nbytes, to_val->nbytes);
 890           return;
 891         }
 892     }
 893
 894   if (result->nwidth_rules >= result->nwidth_rules_max)
 895     {
 896       size_t new_size = result->nwidth_rules + 32;
 897       struct width_rule *new_rules =
 898         (struct width_rule *) obstack_alloc (&result->mem_pool,
 899                                              (new_size
 900                                               * sizeof (struct width_rule)));
 901
 902       memcpy (new_rules, result->width_rules,
 903               result->nwidth_rules_max * sizeof (struct width_rule));
 904
 905       result->width_rules = new_rules;
 906       result->nwidth_rules_max = new_size;
 907     }
 908
 909   result->width_rules[result->nwidth_rules].from = from_val;
 910   result->width_rules[result->nwidth_rules].to = to_val;
 911   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 912   ++result->nwidth_rules;
 913 }
 914
 915
 916 struct charseq *
 917 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 918 {
 919   void *result;
 920
 921   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 922           < 0 ? NULL : (struct charseq *) result);
 923 }
 924
 925
 926 static void
 927 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 928                   size_t nbytes, unsigned char *bytes,
 929                   const char *from, const char *to,
 930                   int decimal_ellipsis, int step)
 931 {
 932   hash_table *ht = &cm->char_table;
 933   hash_table *bt = &cm->byte_table;
 934   struct obstack *ob = &cm->mem_pool;
 935   char *from_end;
 936   char *to_end;
 937   const char *cp;
 938   int prefix_len, len1, len2;
 939   unsigned int from_nr, to_nr, cnt;
 940   struct charseq *newp;
 941
 942   len1 = strlen (from);
 943
 944   if (to == NULL)
 945     {
 946       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 947       newp->nbytes = nbytes;
 948       memcpy (newp->bytes, bytes, nbytes);
 949       newp->name = from;
 950
 951       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 952       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 953         {
 954           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 955              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 956              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 957              this character and we don't have to consult the repertoire
 958              map.
 959
 960              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 961              and xxxxxxxx also give the code point in UCS4 but this must
 962              be in the private, i.e., unassigned, area.  This should be
 963              used for characters which do not (yet) have an equivalent
 964              in ISO 10646 and Unicode.  */
 965           char *endp;
 966
 967           errno = 0;
 968           newp->ucs4 = strtoul (from + 1, &endp, 16);
 969           if (endp - from != len1
 970               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 971               || newp->ucs4 >= 0x80000000)
 972             /* This wasn't successful.  Signal this name cannot be a
 973                correct UCS value.  */
 974             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 975         }
 976
 977       insert_entry (ht, from, len1, newp);
 978       insert_entry (bt, newp->bytes, nbytes, newp);
 979       /* Please note that it isn't a bug if a symbol is defined more
 980          than once.  All later definitions are simply discarded.  */
 981       return;
 982     }
 983
 984   /* We have a range: the names must have names with equal prefixes
 985      and an equal number of digits, where the second number is greater
 986      or equal than the first.  */
 987   len2 = strlen (to);
 988
 989   if (len1 != len2)
 990     {
 991     illegal_range:
 992       lr_error (lr, _("invalid names for character range"));
 993       return;
 994     }
 995
 996   cp = &from[len1 - 1];
 997   if (decimal_ellipsis)
 998     while (isdigit (*cp) && cp >= from)
 999       --cp;
1000   else
1001     while (isxdigit (*cp) && cp >= from)
1002       {
1003         if (!isdigit (*cp) && !isupper (*cp))
1004           lr_error (lr, _("\
1005 hexadecimal range format should use only capital characters"));
1006         --cp;
1007       }
1008
1009   prefix_len = (cp - from) + 1;
1010
1011   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1012     goto illegal_range;
1013
1014   errno = 0;
1015   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1016   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1017       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1018                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1019           && errno == ERANGE)
1020       || *to_end != '\0')
1021     {
1022       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1023       return;
1024     }
1025
1026   if (from_nr > to_nr)
1027     {
1028       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1029       return;
1030     }
1031
1032   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1033     {
1034       char *name_end;
1035       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1036                       prefix_len, from, len1 - prefix_len, cnt);
1037       obstack_1grow (ob, '\0');
1038       name_end = obstack_finish (ob);
1039
1040       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1041       newp->nbytes = nbytes;
1042       memcpy (newp->bytes, bytes, nbytes);
1043       newp->name = name_end;
1044
1045       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1046       if ((name_end[0] == 'U' || name_end[0] == 'P')
1047           && (len1 == 5 || len1 == 9))
1048         {
1049           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1050              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1051              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1052              this character and we don't have to consult the repertoire
1053              map.
1054
1055              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1056              and xxxxxxxx also give the code point in UCS4 but this must
1057              be in the private, i.e., unassigned, area.  This should be
1058              used for characters which do not (yet) have an equivalent
1059              in ISO 10646 and Unicode.  */
1060           char *endp;
1061
1062           errno = 0;
1063           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1064           if (endp - name_end != len1
1065               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1066               || newp->ucs4 >= 0x80000000)
1067             /* This wasn't successful.  Signal this name cannot be a
1068                correct UCS value.  */
1069             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1070         }
1071
1072       insert_entry (ht, name_end, len1, newp);
1073       insert_entry (bt, newp->bytes, nbytes, newp);
1074       /* Please note we don't examine the return value since it is no error
1075          if we have two definitions for a symbol.  */
1076
1077       /* Increment the value in the byte sequence.  */
1078       if (++bytes[nbytes - 1] == '\0')
1079         {
1080           int b = nbytes - 2;
1081
1082           do
1083             if (b < 0)
1084               {
1085                 lr_error (lr,
1086                           _("resulting bytes for range not representable."));
1087                 return;
1088               }
1089           while (++bytes[b--] == 0);
1090         }
1091     }
1092 }
1093
1094
1095 struct charseq *
1096 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1097                      size_t nbytes)
1098 {
1099   void *result;
1100
1101   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1102           < 0 ? NULL : (struct charseq *) result);
1103 }