locale/programs/charmap.c

   1 /* Copyright (C) 1996-2020 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published
   7    by the Free Software Foundation; version 2 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <libintl.h>
  25 #include <limits.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <stdint.h>
  30
  31 #include "localedef.h"
  32 #include "linereader.h"
  33 #include "charmap.h"
  34 #include "charmap-dir.h"
  35
  36 #include <assert.h>
  37
  38
  39 /* Define the lookup function.  */
  40 #include "charmap-kw.h"
  41
  42
  43 /* Prototypes for local functions.  */
  44 static struct charmap_t *parse_charmap (struct linereader *cmfile,
  45                                         int verbose, int be_quiet);
  46 static void new_width (struct linereader *cmfile, struct charmap_t *result,
  47                        const char *from, const char *to,
  48                        unsigned long int width);
  49 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
  50                               size_t nbytes, unsigned char *bytes,
  51                               const char *from, const char *to,
  52                               int decimal_ellipsis, int step);
  53
  54
  55 bool enc_not_ascii_compatible;
  56
  57
  58 #ifdef NEED_NULL_POINTER
  59 static const char *null_pointer;
  60 #endif
  61
  62 static struct linereader *
  63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  64 {
  65   FILE *fp;
  66
  67   fp = charmap_open (directory, name);
  68   if (fp == NULL)
  69     return NULL;
  70   else
  71     {
  72       size_t dlen = strlen (directory);
  73       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  74       size_t nlen = strlen (name);
  75       char *pathname;
  76       char *p;
  77
  78       pathname = alloca (dlen + add_slash + nlen + 1);
  79       p = stpcpy (pathname, directory);
  80       if (add_slash)
  81         *p++ = '/';
  82       stpcpy (p, name);
  83
  84       return lr_create (fp, pathname, hf);
  85     }
  86 }
  87
  88 struct charmap_t *
  89 charmap_read (const char *filename, int verbose, int error_not_found,
  90               int be_quiet, int use_default)
  91 {
  92   struct charmap_t *result = NULL;
  93
  94   if (filename != NULL)
  95     {
  96       struct linereader *cmfile;
  97
  98       /* First try the name as found in the parameter.  */
  99       cmfile = lr_open (filename, charmap_hash);
 100       if (cmfile == NULL)
 101         {
 102           /* No successful.  So start looking through the directories
 103              in the I18NPATH if this is a simple name.  */
 104           if (strchr (filename, '/') == NULL)
 105             {
 106               char *i18npath = getenv ("I18NPATH");
 107               if (i18npath != NULL && *i18npath != '\0')
 108                 {
 109                   const size_t pathlen = strlen (i18npath);
 110                   char i18npathbuf[pathlen + 1];
 111                   char path[pathlen + sizeof ("/charmaps")];
 112                   char *next;
 113                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 114
 115                   while (cmfile == NULL
 116                          && (next = strsep (&i18npath, ":")) != NULL)
 117                     {
 118                       stpcpy (stpcpy (path, next), "/charmaps");
 119                       cmfile = cmlr_open (path, filename, charmap_hash);
 120
 121                       if (cmfile == NULL)
 122                         /* Try without the "/charmaps" part.  */
 123                         cmfile = cmlr_open (next, filename, charmap_hash);
 124                     }
 125                 }
 126
 127               if (cmfile == NULL)
 128                 /* Try the default directory.  */
 129                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 130             }
 131         }
 132
 133       if (cmfile != NULL)
 134         result = parse_charmap (cmfile, verbose, be_quiet);
 135
 136       if (result == NULL && error_not_found)
 137         record_error (0, errno,
 138                       _("character map file `%s' not found"),
 139                       filename);
 140     }
 141
 142   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 143     {
 144       /* OK, one more try.  We also accept the names given to the
 145          character sets in the files.  Sometimes they differ from the
 146          file name.  */
 147       CHARMAP_DIR *dir;
 148
 149       dir = charmap_opendir (CHARMAP_PATH);
 150       if (dir != NULL)
 151         {
 152           const char *dirent;
 153
 154           while ((dirent = charmap_readdir (dir)) != NULL)
 155             {
 156               char **aliases;
 157               char **p;
 158               int found;
 159
 160               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 161               found = 0;
 162               for (p = aliases; *p; p++)
 163                 if (strcasecmp (*p, filename) == 0)
 164                   {
 165                     found = 1;
 166                     break;
 167                   }
 168               charmap_free_aliases (aliases);
 169
 170               if (found)
 171                 {
 172                   struct linereader *cmfile;
 173
 174                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 175                   if (cmfile != NULL)
 176                     result = parse_charmap (cmfile, verbose, be_quiet);
 177
 178                   break;
 179                 }
 180             }
 181
 182           charmap_closedir (dir);
 183         }
 184     }
 185
 186   if (result == NULL && DEFAULT_CHARMAP != NULL)
 187     {
 188       struct linereader *cmfile;
 189
 190       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 191       if (cmfile != NULL)
 192         result = parse_charmap (cmfile, verbose, be_quiet);
 193
 194       if (result == NULL)
 195         record_error (4, errno,
 196                       _("default character map file `%s' not found"),
 197                       DEFAULT_CHARMAP);
 198     }
 199
 200   if (result != NULL && result->code_set_name == NULL)
 201     /* The input file does not specify a code set name.  This
 202        shouldn't happen but we should cope with it.  */
 203     result->code_set_name = basename (filename);
 204
 205   /* Test of ASCII compatibility of locale encoding.
 206
 207      Verify that the encoding to be used in a locale is ASCII compatible,
 208      at least for the graphic characters, excluding the control characters,
 209      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 210
 211      ISO C 99 section 7.17.(2) (about wchar_t):
 212        the null character shall have the code value zero and each member of
 213        the basic character set shall have a code value equal to its value
 214        when used as the lone character in an integer character constant.
 215      ISO C 99 section 5.2.1.(3):
 216        Both the basic source and basic execution character sets shall have
 217        the following members: the 26 uppercase letters of the Latin alphabet
 218             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 219        the 26 lowercase letters of the Latin alphabet
 220             a b c d e f g h i j k l m n o p q r s t u v w x y z
 221        the 10 decimal digits
 222             0 1 2 3 4 5 6 7 8 9
 223        the following 29 graphic characters
 224             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 225        the space character, and control characters representing horizontal
 226        tab, vertical tab, and form feed.
 227
 228      Therefore, for all members of the "basic character set", the 'char' code
 229      must have the same value as the 'wchar_t' code, which in glibc is the
 230      same as the Unicode code, which for all of the enumerated characters
 231      is identical to the ASCII code. */
 232   if (result != NULL && use_default)
 233     {
 234       static const char basic_charset[] =
 235         {
 236           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 237           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 238           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 239           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 240           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 241           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 242           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 243           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 244         };
 245       int failed = 0;
 246       const char *p = basic_charset;
 247
 248       do
 249         {
 250           struct charseq *seq = charmap_find_symbol (result, p, 1);
 251
 252           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 253             failed = 1;
 254         }
 255       while (*p++ != '\0');
 256
 257       if (failed)
 258         {
 259           /* A user may disable the ASCII compatibility warning check,
 260              but we must remember that the encoding is not ASCII
 261              compatible, since it may have other implications.  Later
 262              we will set _NL_CTYPE_MAP_TO_NONASCII from this value.  */
 263           if (warn_ascii)
 264             record_warning (_(
 265 "character map `%s' is not ASCII compatible, locale not ISO C compliant "
 266 "[--no-warnings=ascii]"),
 267                             result->code_set_name);
 268           enc_not_ascii_compatible = true;
 269         }
 270     }
 271
 272   return result;
 273 }
 274
 275
 276 static struct charmap_t *
 277 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 278 {
 279   struct charmap_t *result;
 280   int state;
 281   enum token_t expected_tok = tok_error;
 282   const char *expected_str = NULL;
 283   char *from_name = NULL;
 284   char *to_name = NULL;
 285   enum token_t ellipsis = 0;
 286   int step = 1;
 287
 288   /* We don't want symbolic names in string to be translated.  */
 289   cmfile->translate_strings = 0;
 290
 291   /* Allocate room for result.  */
 292   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 293   memset (result, '\0', sizeof (struct charmap_t));
 294   /* The default DEFAULT_WIDTH is 1.  */
 295   result->width_default = 1;
 296
 297 #define obstack_chunk_alloc malloc
 298 #define obstack_chunk_free free
 299   obstack_init (&result->mem_pool);
 300
 301   if (init_hash (&result->char_table, 256)
 302       || init_hash (&result->byte_table, 256))
 303     {
 304       free (result);
 305       return NULL;
 306     }
 307
 308   /* We use a state machine to describe the charmap description file
 309      format.  */
 310   state = 1;
 311   while (1)
 312     {
 313       /* What's on?  */
 314       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 315       enum token_t nowtok = now->tok;
 316       struct token *arg;
 317
 318       if (nowtok == tok_eof)
 319         break;
 320
 321       switch (state)
 322         {
 323         case 1:
 324           /* The beginning.  We expect the special declarations, EOL or
 325              `CHARMAP'.  */
 326           if (nowtok == tok_eol)
 327             /* Ignore empty lines.  */
 328             continue;
 329
 330           if (nowtok == tok_charmap)
 331             {
 332               from_name = NULL;
 333               to_name = NULL;
 334
 335               /* We have to set up the real work.  Fill in some
 336                  default values.  */
 337               if (result->mb_cur_max == 0)
 338                 result->mb_cur_max = 1;
 339               if (result->mb_cur_min == 0)
 340                 result->mb_cur_min = result->mb_cur_max;
 341               if (result->mb_cur_min > result->mb_cur_max)
 342                 {
 343                   record_error (0, 0, _("\
 344 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 345                                 cmfile->fname);
 346
 347                   result->mb_cur_min = result->mb_cur_max;
 348                 }
 349
 350               lr_ignore_rest (cmfile, 1);
 351
 352               state = 2;
 353               continue;
 354             }
 355
 356           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 357               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 358               && nowtok != tok_comment_char && nowtok != tok_g0esc
 359               && nowtok != tok_g1esc && nowtok != tok_g2esc
 360               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 361               && nowtok != tok_include)
 362             {
 363               lr_error (cmfile, _("syntax error in prolog: %s"),
 364                         _("invalid definition"));
 365
 366               lr_ignore_rest (cmfile, 0);
 367               continue;
 368             }
 369
 370           /* We know that we need an argument.  */
 371           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 372
 373           switch (nowtok)
 374             {
 375             case tok_code_set_name:
 376             case tok_repertoiremap:
 377               if (arg->tok != tok_ident && arg->tok != tok_string)
 378                 {
 379                 badarg:
 380                   lr_error (cmfile, _("syntax error in prolog: %s"),
 381                             _("bad argument"));
 382
 383                   lr_ignore_rest (cmfile, 0);
 384                   continue;
 385                 }
 386
 387               if (nowtok == tok_code_set_name)
 388                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 389                                                        arg->val.str.startmb,
 390                                                        arg->val.str.lenmb);
 391               else
 392                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 393                                                        arg->val.str.startmb,
 394                                                        arg->val.str.lenmb);
 395
 396               lr_ignore_rest (cmfile, 1);
 397               continue;
 398
 399             case tok_mb_cur_max:
 400             case tok_mb_cur_min:
 401               if (arg->tok != tok_number)
 402                 goto badarg;
 403
 404               if ((nowtok == tok_mb_cur_max
 405                        && result->mb_cur_max != 0)
 406                       || (nowtok == tok_mb_cur_max
 407                           && result->mb_cur_max != 0))
 408                 lr_error (cmfile, _("duplicate definition of <%s>"),
 409                           nowtok == tok_mb_cur_min
 410                           ? "mb_cur_min" : "mb_cur_max");
 411
 412               if (arg->val.num < 1)
 413                 {
 414                   lr_error (cmfile,
 415                             _("value for <%s> must be 1 or greater"),
 416                             nowtok == tok_mb_cur_min
 417                             ? "mb_cur_min" : "mb_cur_max");
 418
 419                   lr_ignore_rest (cmfile, 0);
 420                   continue;
 421                 }
 422               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 423                    && (int) arg->val.num < result->mb_cur_min)
 424                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 425                       && (int) arg->val.num > result->mb_cur_max))
 426                 {
 427                   lr_error (cmfile, _("\
 428 value of <%s> must be greater or equal than the value of <%s>"),
 429                             "mb_cur_max", "mb_cur_min");
 430
 431                   lr_ignore_rest (cmfile, 0);
 432                   continue;
 433                 }
 434
 435               if (nowtok == tok_mb_cur_max)
 436                 result->mb_cur_max = arg->val.num;
 437               else
 438                 result->mb_cur_min = arg->val.num;
 439
 440               lr_ignore_rest (cmfile, 1);
 441               continue;
 442
 443             case tok_escape_char:
 444             case tok_comment_char:
 445               if (arg->tok != tok_ident)
 446                 goto badarg;
 447
 448               if (arg->val.str.lenmb != 1)
 449                 {
 450                   lr_error (cmfile, _("\
 451 argument to <%s> must be a single character"),
 452                             nowtok == tok_escape_char ? "escape_char"
 453                                                       : "comment_char");
 454
 455                   lr_ignore_rest (cmfile, 0);
 456                   continue;
 457                 }
 458
 459               if (nowtok == tok_escape_char)
 460                 cmfile->escape_char = *arg->val.str.startmb;
 461               else
 462                 cmfile->comment_char = *arg->val.str.startmb;
 463
 464               lr_ignore_rest (cmfile, 1);
 465               continue;
 466
 467             case tok_g0esc:
 468             case tok_g1esc:
 469             case tok_g2esc:
 470             case tok_g3esc:
 471             case tok_escseq:
 472               lr_ignore_rest (cmfile, 0); /* XXX */
 473               continue;
 474
 475             case tok_include:
 476               lr_error (cmfile, _("\
 477 character sets with locking states are not supported"));
 478               exit (4);
 479
 480             default:
 481               /* Cannot happen.  */
 482               assert (! "Should not happen");
 483             }
 484           break;
 485
 486         case 2:
 487           /* We have seen `CHARMAP' and now are in the body.  Each line
 488              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 489           if (nowtok == tok_eol)
 490             /* Ignore empty lines.  */
 491             continue;
 492
 493           if (nowtok == tok_end)
 494             {
 495               expected_tok = tok_charmap;
 496               expected_str = "CHARMAP";
 497               state = 90;
 498               continue;
 499             }
 500
 501           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 502             {
 503               lr_error (cmfile, _("syntax error in %s definition: %s"),
 504                         "CHARMAP", _("no symbolic name given"));
 505
 506               lr_ignore_rest (cmfile, 0);
 507               continue;
 508             }
 509
 510           /* If the previous line was not completely correct free the
 511              used memory.  */
 512           if (from_name != NULL)
 513             obstack_free (&result->mem_pool, from_name);
 514
 515           if (nowtok == tok_bsymbol)
 516             from_name = (char *) obstack_copy0 (&result->mem_pool,
 517                                                 now->val.str.startmb,
 518                                                 now->val.str.lenmb);
 519           else
 520             {
 521               obstack_printf (&result->mem_pool, "U%08X",
 522                               cmfile->token.val.ucs4);
 523               obstack_1grow (&result->mem_pool, '\0');
 524               from_name = (char *) obstack_finish (&result->mem_pool);
 525             }
 526           to_name = NULL;
 527
 528           state = 3;
 529           continue;
 530
 531         case 3:
 532           /* We have two possibilities: We can see an ellipsis or an
 533              encoding value.  */
 534           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 535               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 536               || nowtok == tok_ellipsis2_2)
 537             {
 538               ellipsis = nowtok;
 539               if (nowtok == tok_ellipsis4_2)
 540                 {
 541                   step = 2;
 542                   nowtok = tok_ellipsis4;
 543                 }
 544               else if (nowtok == tok_ellipsis2_2)
 545                 {
 546                   step = 2;
 547                   nowtok = tok_ellipsis2;
 548                 }
 549               state = 4;
 550               continue;
 551             }
 552           /* FALLTHROUGH */
 553
 554         case 5:
 555           if (nowtok != tok_charcode)
 556             {
 557               lr_error (cmfile, _("syntax error in %s definition: %s"),
 558                         "CHARMAP", _("invalid encoding given"));
 559
 560               lr_ignore_rest (cmfile, 0);
 561
 562               state = 2;
 563               continue;
 564             }
 565
 566           if (now->val.charcode.nbytes < result->mb_cur_min)
 567             lr_error (cmfile, _("too few bytes in character encoding"));
 568           else if (now->val.charcode.nbytes > result->mb_cur_max)
 569             lr_error (cmfile, _("too many bytes in character encoding"));
 570           else
 571             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 572                               now->val.charcode.bytes, from_name, to_name,
 573                               ellipsis != tok_ellipsis2, step);
 574
 575           /* Ignore trailing comment silently.  */
 576           lr_ignore_rest (cmfile, 0);
 577
 578           from_name = NULL;
 579           to_name = NULL;
 580           ellipsis = tok_none;
 581           step = 1;
 582
 583           state = 2;
 584           continue;
 585
 586         case 4:
 587           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 588             {
 589               lr_error (cmfile, _("syntax error in %s definition: %s"),
 590                         "CHARMAP",
 591                         _("no symbolic name given for end of range"));
 592
 593               lr_ignore_rest (cmfile, 0);
 594               continue;
 595             }
 596
 597           /* Copy the to-name in a safe place.  */
 598           if (nowtok == tok_bsymbol)
 599             to_name = (char *) obstack_copy0 (&result->mem_pool,
 600                                               cmfile->token.val.str.startmb,
 601                                               cmfile->token.val.str.lenmb);
 602           else
 603             {
 604               obstack_printf (&result->mem_pool, "U%08X",
 605                               cmfile->token.val.ucs4);
 606               obstack_1grow (&result->mem_pool, '\0');
 607               to_name = (char *) obstack_finish (&result->mem_pool);
 608             }
 609
 610           state = 5;
 611           continue;
 612
 613         case 90:
 614           if (nowtok != expected_tok)
 615             lr_error (cmfile, _("\
 616 %1$s: definition does not end with `END %1$s'"), expected_str);
 617
 618           lr_ignore_rest (cmfile, nowtok == expected_tok);
 619           state = 91;
 620           continue;
 621
 622         case 91:
 623           /* Waiting for WIDTH... */
 624           if (nowtok == tok_eol)
 625             /* Ignore empty lines.  */
 626             continue;
 627
 628           if (nowtok == tok_width_default)
 629             {
 630               state = 92;
 631               continue;
 632             }
 633
 634           if (nowtok == tok_width)
 635             {
 636               lr_ignore_rest (cmfile, 1);
 637               state = 93;
 638               continue;
 639             }
 640
 641           if (nowtok == tok_width_variable)
 642             {
 643               lr_ignore_rest (cmfile, 1);
 644               state = 98;
 645               continue;
 646             }
 647
 648           lr_error (cmfile, _("\
 649 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 650
 651           lr_ignore_rest (cmfile, 0);
 652           continue;
 653
 654         case 92:
 655           if (nowtok != tok_number)
 656             lr_error (cmfile, _("value for %s must be an integer"),
 657                       "WIDTH_DEFAULT");
 658           else
 659             result->width_default = now->val.num;
 660
 661           lr_ignore_rest (cmfile, nowtok == tok_number);
 662
 663           state = 91;
 664           continue;
 665
 666         case 93:
 667           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 668              "%s...%s %d\n".  */
 669           if (nowtok == tok_eol)
 670             /* ignore empty lines.  */
 671             continue;
 672
 673           if (nowtok == tok_end)
 674             {
 675               expected_tok = tok_width;
 676               expected_str = "WIDTH";
 677               state = 90;
 678               continue;
 679             }
 680
 681           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 682             {
 683               lr_error (cmfile, _("syntax error in %s definition: %s"),
 684                         "WIDTH", _("no symbolic name given"));
 685
 686               lr_ignore_rest (cmfile, 0);
 687               continue;
 688             }
 689
 690           if (from_name != NULL)
 691             obstack_free (&result->mem_pool, from_name);
 692
 693           if (nowtok == tok_bsymbol)
 694             from_name = (char *) obstack_copy0 (&result->mem_pool,
 695                                                 now->val.str.startmb,
 696                                                 now->val.str.lenmb);
 697           else
 698             {
 699               obstack_printf (&result->mem_pool, "U%08X",
 700                               cmfile->token.val.ucs4);
 701               obstack_1grow (&result->mem_pool, '\0');
 702               from_name = (char *) obstack_finish (&result->mem_pool);
 703             }
 704
 705           to_name = NULL;
 706
 707           state = 94;
 708           continue;
 709
 710         case 94:
 711           if (nowtok == tok_ellipsis3)
 712             {
 713               state = 95;
 714               continue;
 715             }
 716           /* Fall through.  */
 717
 718         case 96:
 719           if (nowtok != tok_number)
 720             lr_error (cmfile, _("value for %s must be an integer"),
 721                       "WIDTH");
 722           else
 723             {
 724               /* Store width for chars.  */
 725               new_width (cmfile, result, from_name, to_name, now->val.num);
 726
 727               from_name = NULL;
 728               to_name = NULL;
 729             }
 730
 731           lr_ignore_rest (cmfile, nowtok == tok_number);
 732
 733           state = 93;
 734           continue;
 735
 736         case 95:
 737           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 738             {
 739               lr_error (cmfile, _("syntax error in %s definition: %s"),
 740                         "WIDTH", _("no symbolic name given for end of range"));
 741
 742               lr_ignore_rest (cmfile, 0);
 743
 744               state = 93;
 745               continue;
 746             }
 747
 748           if (nowtok == tok_bsymbol)
 749             to_name = (char *) obstack_copy0 (&result->mem_pool,
 750                                               now->val.str.startmb,
 751                                               now->val.str.lenmb);
 752           else
 753             {
 754               obstack_printf (&result->mem_pool, "U%08X",
 755                               cmfile->token.val.ucs4);
 756               obstack_1grow (&result->mem_pool, '\0');
 757               to_name = (char *) obstack_finish (&result->mem_pool);
 758             }
 759
 760           state = 96;
 761           continue;
 762
 763         case 98:
 764           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 765              "%s\n" or "%s...%s\n".  */
 766           if (nowtok == tok_eol)
 767             /* ignore empty lines.  */
 768             continue;
 769
 770           if (nowtok == tok_end)
 771             {
 772               expected_tok = tok_width_variable;
 773               expected_str = "WIDTH_VARIABLE";
 774               state = 90;
 775               continue;
 776             }
 777
 778           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 779             {
 780               lr_error (cmfile, _("syntax error in %s definition: %s"),
 781                         "WIDTH_VARIABLE", _("no symbolic name given"));
 782
 783               lr_ignore_rest (cmfile, 0);
 784
 785               continue;
 786             }
 787
 788           if (from_name != NULL)
 789             obstack_free (&result->mem_pool, from_name);
 790
 791           if (nowtok == tok_bsymbol)
 792             from_name = (char *) obstack_copy0 (&result->mem_pool,
 793                                                 now->val.str.startmb,
 794                                                 now->val.str.lenmb);
 795           else
 796             {
 797               obstack_printf (&result->mem_pool, "U%08X",
 798                               cmfile->token.val.ucs4);
 799               obstack_1grow (&result->mem_pool, '\0');
 800               from_name = (char *) obstack_finish (&result->mem_pool);
 801             }
 802           to_name = NULL;
 803
 804           state = 99;
 805           continue;
 806
 807         case 99:
 808           if (nowtok == tok_ellipsis3)
 809             state = 100;
 810
 811           /* Store info.  */
 812           from_name = NULL;
 813
 814           /* Warn */
 815           state = 98;
 816           continue;
 817
 818         case 100:
 819           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 820             {
 821               lr_error (cmfile, _("syntax error in %s definition: %s"),
 822                         "WIDTH_VARIABLE",
 823                         _("no symbolic name given for end of range"));
 824               lr_ignore_rest (cmfile, 0);
 825               continue;
 826             }
 827
 828           if (nowtok == tok_bsymbol)
 829             to_name = (char *) obstack_copy0 (&result->mem_pool,
 830                                               now->val.str.startmb,
 831                                               now->val.str.lenmb);
 832           else
 833             {
 834               obstack_printf (&result->mem_pool, "U%08X",
 835                               cmfile->token.val.ucs4);
 836               obstack_1grow (&result->mem_pool, '\0');
 837               to_name = (char *) obstack_finish (&result->mem_pool);
 838             }
 839
 840           /* XXX Enter value into table.  */
 841
 842           lr_ignore_rest (cmfile, 1);
 843
 844           state = 98;
 845           continue;
 846
 847         default:
 848           record_error (5, 0, _("%s: error in state machine"),
 849                         __FILE__);
 850           /* NOTREACHED */
 851         }
 852       break;
 853     }
 854
 855   if (state != 91)
 856     record_error (0, 0, _("%s: premature end of file"),
 857                   cmfile->fname);
 858
 859   lr_close (cmfile);
 860
 861   return result;
 862 }
 863
 864
 865 static void
 866 new_width (struct linereader *cmfile, struct charmap_t *result,
 867            const char *from, const char *to, unsigned long int width)
 868 {
 869   struct charseq *from_val;
 870   struct charseq *to_val;
 871
 872   from_val = charmap_find_value (result, from, strlen (from));
 873   if (from_val == NULL)
 874     {
 875       lr_error (cmfile, _("unknown character `%s'"), from);
 876       return;
 877     }
 878
 879   if (to == NULL)
 880     to_val = from_val;
 881   else
 882     {
 883       to_val = charmap_find_value (result, to, strlen (to));
 884       if (to_val == NULL)
 885         {
 886           lr_error (cmfile, _("unknown character `%s'"), to);
 887           return;
 888         }
 889
 890       /* Make sure the number of bytes for the end points of the range
 891          is correct.  */
 892       if (from_val->nbytes != to_val->nbytes)
 893         {
 894           lr_error (cmfile, _("\
 895 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 896                     from_val->nbytes, to_val->nbytes);
 897           return;
 898         }
 899     }
 900
 901   if (result->nwidth_rules >= result->nwidth_rules_max)
 902     {
 903       size_t new_size = result->nwidth_rules + 32;
 904       struct width_rule *new_rules =
 905         (struct width_rule *) obstack_alloc (&result->mem_pool,
 906                                              (new_size
 907                                               * sizeof (struct width_rule)));
 908
 909       memcpy (new_rules, result->width_rules,
 910               result->nwidth_rules_max * sizeof (struct width_rule));
 911
 912       result->width_rules = new_rules;
 913       result->nwidth_rules_max = new_size;
 914     }
 915
 916   result->width_rules[result->nwidth_rules].from = from_val;
 917   result->width_rules[result->nwidth_rules].to = to_val;
 918   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 919   ++result->nwidth_rules;
 920 }
 921
 922
 923 struct charseq *
 924 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 925 {
 926   void *result;
 927
 928   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 929           < 0 ? NULL : (struct charseq *) result);
 930 }
 931
 932
 933 static void
 934 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 935                   size_t nbytes, unsigned char *bytes,
 936                   const char *from, const char *to,
 937                   int decimal_ellipsis, int step)
 938 {
 939   hash_table *ht = &cm->char_table;
 940   hash_table *bt = &cm->byte_table;
 941   struct obstack *ob = &cm->mem_pool;
 942   char *from_end;
 943   char *to_end;
 944   const char *cp;
 945   int prefix_len, len1, len2;
 946   unsigned int from_nr, to_nr, cnt;
 947   struct charseq *newp;
 948
 949   len1 = strlen (from);
 950
 951   if (to == NULL)
 952     {
 953       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 954       newp->nbytes = nbytes;
 955       memcpy (newp->bytes, bytes, nbytes);
 956       newp->name = from;
 957
 958       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 959       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 960         {
 961           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 962              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 963              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 964              this character and we don't have to consult the repertoire
 965              map.
 966
 967              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 968              and xxxxxxxx also give the code point in UCS4 but this must
 969              be in the private, i.e., unassigned, area.  This should be
 970              used for characters which do not (yet) have an equivalent
 971              in ISO 10646 and Unicode.  */
 972           char *endp;
 973
 974           errno = 0;
 975           newp->ucs4 = strtoul (from + 1, &endp, 16);
 976           if (endp - from != len1
 977               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 978               || newp->ucs4 >= 0x80000000)
 979             /* This wasn't successful.  Signal this name cannot be a
 980                correct UCS value.  */
 981             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 982         }
 983
 984       insert_entry (ht, from, len1, newp);
 985       insert_entry (bt, newp->bytes, nbytes, newp);
 986       /* Please note that it isn't a bug if a symbol is defined more
 987          than once.  All later definitions are simply discarded.  */
 988       return;
 989     }
 990
 991   /* We have a range: the names must have names with equal prefixes
 992      and an equal number of digits, where the second number is greater
 993      or equal than the first.  */
 994   len2 = strlen (to);
 995
 996   if (len1 != len2)
 997     {
 998     illegal_range:
 999       lr_error (lr, _("invalid names for character range"));
1000       return;
1001     }
1002
1003   cp = &from[len1 - 1];
1004   if (decimal_ellipsis)
1005     while (isdigit (*cp) && cp >= from)
1006       --cp;
1007   else
1008     while (isxdigit (*cp) && cp >= from)
1009       {
1010         if (!isdigit (*cp) && !isupper (*cp))
1011           lr_error (lr, _("\
1012 hexadecimal range format should use only capital characters"));
1013         --cp;
1014       }
1015
1016   prefix_len = (cp - from) + 1;
1017
1018   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1019     goto illegal_range;
1020
1021   errno = 0;
1022   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1023   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1024       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1025                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1026           && errno == ERANGE)
1027       || *to_end != '\0')
1028     {
1029       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1030       return;
1031     }
1032
1033   if (from_nr > to_nr)
1034     {
1035       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1036       return;
1037     }
1038
1039   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1040     {
1041       char *name_end;
1042       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1043                       prefix_len, from, len1 - prefix_len, cnt);
1044       obstack_1grow (ob, '\0');
1045       name_end = obstack_finish (ob);
1046
1047       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1048       newp->nbytes = nbytes;
1049       memcpy (newp->bytes, bytes, nbytes);
1050       newp->name = name_end;
1051
1052       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1053       if ((name_end[0] == 'U' || name_end[0] == 'P')
1054           && (len1 == 5 || len1 == 9))
1055         {
1056           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1057              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1058              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1059              this character and we don't have to consult the repertoire
1060              map.
1061
1062              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1063              and xxxxxxxx also give the code point in UCS4 but this must
1064              be in the private, i.e., unassigned, area.  This should be
1065              used for characters which do not (yet) have an equivalent
1066              in ISO 10646 and Unicode.  */
1067           char *endp;
1068
1069           errno = 0;
1070           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1071           if (endp - name_end != len1
1072               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1073               || newp->ucs4 >= 0x80000000)
1074             /* This wasn't successful.  Signal this name cannot be a
1075                correct UCS value.  */
1076             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1077         }
1078
1079       insert_entry (ht, name_end, len1, newp);
1080       insert_entry (bt, newp->bytes, nbytes, newp);
1081       /* Please note we don't examine the return value since it is no error
1082          if we have two definitions for a symbol.  */
1083
1084       /* Increment the value in the byte sequence.  */
1085       if (++bytes[nbytes - 1] == '\0')
1086         {
1087           int b = nbytes - 2;
1088
1089           do
1090             if (b < 0)
1091               {
1092                 lr_error (lr,
1093                           _("resulting bytes for range not representable."));
1094                 return;
1095               }
1096           while (++bytes[b--] == 0);
1097         }
1098     }
1099 }
1100
1101
1102 struct charseq *
1103 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1104                      size_t nbytes)
1105 {
1106   void *result;
1107
1108   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1109           < 0 ? NULL : (struct charseq *) result);
1110 }