locale/programs/charmap.c

   1 /* Copyright (C) 1996-2022 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published
   6    by the Free Software Foundation; version 2 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
  16
  17 #ifdef HAVE_CONFIG_H
  18 # include <config.h>
  19 #endif
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <libintl.h>
  24 #include <limits.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <stdint.h>
  29
  30 #include "localedef.h"
  31 #include "linereader.h"
  32 #include "charmap.h"
  33 #include "charmap-dir.h"
  34
  35 #include <assert.h>
  36
  37
  38 /* Define the lookup function.  */
  39 #include "charmap-kw.h"
  40
  41
  /* Forward declarations of the file-local helpers.  */

  /* Parser for a charmap file that has already been opened.  */
  static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                          int verbose,
                                          int be_quiet);

  /* Records a width rule for the symbol FROM or the range FROM...TO.  */
  static void new_width (struct linereader *cmfile,
                         struct charmap_t *result,
                         const char *from,
                         const char *to,
                         unsigned long int width);

  /* Enters one character, or a range of characters, into the charmap.  */
  static void charmap_new_char (struct linereader *lr,
                                struct charmap_t *cm,
                                size_t nbytes,
                                unsigned char *bytes,
                                const char *from,
                                const char *to,
                                int decimal_ellipsis,
                                int step);


  /* Set to true by charmap_read when the charmap it loaded fails the
     ASCII compatibility test.  */
  bool enc_not_ascii_compatible;


  #ifdef NEED_NULL_POINTER
  static const char *null_pointer;
  #endif
  60
  61 static struct linereader *
  62 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  63 {
  64   FILE *fp;
  65
  66   fp = charmap_open (directory, name);
  67   if (fp == NULL)
  68     return NULL;
  69   else
  70     {
  71       size_t dlen = strlen (directory);
  72       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  73       size_t nlen = strlen (name);
  74       char *pathname;
  75       char *p;
  76
  77       pathname = alloca (dlen + add_slash + nlen + 1);
  78       p = stpcpy (pathname, directory);
  79       if (add_slash)
  80         *p++ = '/';
  81       stpcpy (p, name);
  82
  83       return lr_create (fp, pathname, hf);
  84     }
  85 }
  86
  87 struct charmap_t *
  88 charmap_read (const char *filename, int verbose, int error_not_found,
  89               int be_quiet, int use_default)
  90 {
  91   struct charmap_t *result = NULL;
  92
  93   if (filename != NULL)
  94     {
  95       struct linereader *cmfile;
  96
  97       /* First try the name as found in the parameter.  */
  98       cmfile = lr_open (filename, charmap_hash);
  99       if (cmfile == NULL)
 100         {
 101           /* No successful.  So start looking through the directories
 102              in the I18NPATH if this is a simple name.  */
 103           if (strchr (filename, '/') == NULL)
 104             {
 105               char *i18npath = getenv ("I18NPATH");
 106               if (i18npath != NULL && *i18npath != '\0')
 107                 {
 108                   const size_t pathlen = strlen (i18npath);
 109                   char i18npathbuf[pathlen + 1];
 110                   char path[pathlen + sizeof ("/charmaps")];
 111                   char *next;
 112                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 113
 114                   while (cmfile == NULL
 115                          && (next = strsep (&i18npath, ":")) != NULL)
 116                     {
 117                       stpcpy (stpcpy (path, next), "/charmaps");
 118                       cmfile = cmlr_open (path, filename, charmap_hash);
 119
 120                       if (cmfile == NULL)
 121                         /* Try without the "/charmaps" part.  */
 122                         cmfile = cmlr_open (next, filename, charmap_hash);
 123                     }
 124                 }
 125
 126               if (cmfile == NULL)
 127                 /* Try the default directory.  */
 128                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 129             }
 130         }
 131
 132       if (cmfile != NULL)
 133         result = parse_charmap (cmfile, verbose, be_quiet);
 134
 135       if (result == NULL && error_not_found)
 136         record_error (0, errno,
 137                       _("character map file `%s' not found"),
 138                       filename);
 139     }
 140
 141   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 142     {
 143       /* OK, one more try.  We also accept the names given to the
 144          character sets in the files.  Sometimes they differ from the
 145          file name.  */
 146       CHARMAP_DIR *dir;
 147
 148       dir = charmap_opendir (CHARMAP_PATH);
 149       if (dir != NULL)
 150         {
 151           const char *dirent;
 152
 153           while ((dirent = charmap_readdir (dir)) != NULL)
 154             {
 155               char **aliases;
 156               char **p;
 157               int found;
 158
 159               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 160               found = 0;
 161               for (p = aliases; *p; p++)
 162                 if (strcasecmp (*p, filename) == 0)
 163                   {
 164                     found = 1;
 165                     break;
 166                   }
 167               charmap_free_aliases (aliases);
 168
 169               if (found)
 170                 {
 171                   struct linereader *cmfile;
 172
 173                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 174                   if (cmfile != NULL)
 175                     result = parse_charmap (cmfile, verbose, be_quiet);
 176
 177                   break;
 178                 }
 179             }
 180
 181           charmap_closedir (dir);
 182         }
 183     }
 184
 185   if (result == NULL && DEFAULT_CHARMAP != NULL)
 186     {
 187       struct linereader *cmfile;
 188
 189       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 190       if (cmfile != NULL)
 191         result = parse_charmap (cmfile, verbose, be_quiet);
 192
 193       if (result == NULL)
 194         record_error (4, errno,
 195                       _("default character map file `%s' not found"),
 196                       DEFAULT_CHARMAP);
 197     }
 198
 199   if (result != NULL && result->code_set_name == NULL)
 200     /* The input file does not specify a code set name.  This
 201        shouldn't happen but we should cope with it.  */
 202     result->code_set_name = basename (filename);
 203
 204   /* Test of ASCII compatibility of locale encoding.
 205
 206      Verify that the encoding to be used in a locale is ASCII compatible,
 207      at least for the graphic characters, excluding the control characters,
 208      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 209
 210      ISO C 99 section 7.17.(2) (about wchar_t):
 211        the null character shall have the code value zero and each member of
 212        the basic character set shall have a code value equal to its value
 213        when used as the lone character in an integer character constant.
 214      ISO C 99 section 5.2.1.(3):
 215        Both the basic source and basic execution character sets shall have
 216        the following members: the 26 uppercase letters of the Latin alphabet
 217             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 218        the 26 lowercase letters of the Latin alphabet
 219             a b c d e f g h i j k l m n o p q r s t u v w x y z
 220        the 10 decimal digits
 221             0 1 2 3 4 5 6 7 8 9
 222        the following 29 graphic characters
 223             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 224        the space character, and control characters representing horizontal
 225        tab, vertical tab, and form feed.
 226
 227      Therefore, for all members of the "basic character set", the 'char' code
 228      must have the same value as the 'wchar_t' code, which in glibc is the
 229      same as the Unicode code, which for all of the enumerated characters
 230      is identical to the ASCII code. */
 231   if (result != NULL && use_default)
 232     {
 233       static const char basic_charset[] =
 234         {
 235           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 236           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 237           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 238           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 239           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 240           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 241           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 242           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 243         };
 244       int failed = 0;
 245       const char *p = basic_charset;
 246
 247       do
 248         {
 249           struct charseq *seq = charmap_find_symbol (result, p, 1);
 250
 251           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 252             failed = 1;
 253         }
 254       while (*p++ != '\0');
 255
 256       if (failed)
 257         {
 258           /* A user may disable the ASCII compatibility warning check,
 259              but we must remember that the encoding is not ASCII
 260              compatible, since it may have other implications.  Later
 261              we will set _NL_CTYPE_MAP_TO_NONASCII from this value.  */
 262           if (warn_ascii)
 263             record_warning (_(
 264 "character map `%s' is not ASCII compatible, locale not ISO C compliant "
 265 "[--no-warnings=ascii]"),
 266                             result->code_set_name);
 267           enc_not_ascii_compatible = true;
 268         }
 269     }
 270
 271   return result;
 272 }
 273
 274
 275 static struct charmap_t *
 276 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 277 {
 278   struct charmap_t *result;
 279   int state;
 280   enum token_t expected_tok = tok_error;
 281   const char *expected_str = NULL;
 282   char *from_name = NULL;
 283   char *to_name = NULL;
 284   enum token_t ellipsis = 0;
 285   int step = 1;
 286
 287   /* We don't want symbolic names in string to be translated.  */
 288   cmfile->translate_strings = 0;
 289
 290   /* Allocate room for result.  */
 291   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 292   memset (result, '\0', sizeof (struct charmap_t));
 293   /* The default DEFAULT_WIDTH is 1.  */
 294   result->width_default = 1;
 295
 296 #define obstack_chunk_alloc malloc
 297 #define obstack_chunk_free free
 298   obstack_init (&result->mem_pool);
 299
 300   if (init_hash (&result->char_table, 256)
 301       || init_hash (&result->byte_table, 256))
 302     {
 303       free (result);
 304       return NULL;
 305     }
 306
 307   /* We use a state machine to describe the charmap description file
 308      format.  */
 309   state = 1;
 310   while (1)
 311     {
 312       /* What's on?  */
 313       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 314       enum token_t nowtok = now->tok;
 315       struct token *arg;
 316
 317       if (nowtok == tok_eof)
 318         break;
 319
 320       switch (state)
 321         {
 322         case 1:
 323           /* The beginning.  We expect the special declarations, EOL or
 324              `CHARMAP'.  */
 325           if (nowtok == tok_eol)
 326             /* Ignore empty lines.  */
 327             continue;
 328
 329           if (nowtok == tok_charmap)
 330             {
 331               from_name = NULL;
 332               to_name = NULL;
 333
 334               /* We have to set up the real work.  Fill in some
 335                  default values.  */
 336               if (result->mb_cur_max == 0)
 337                 result->mb_cur_max = 1;
 338               if (result->mb_cur_min == 0)
 339                 result->mb_cur_min = result->mb_cur_max;
 340               if (result->mb_cur_min > result->mb_cur_max)
 341                 {
 342                   record_error (0, 0, _("\
 343 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 344                                 cmfile->fname);
 345
 346                   result->mb_cur_min = result->mb_cur_max;
 347                 }
 348
 349               lr_ignore_rest (cmfile, 1);
 350
 351               state = 2;
 352               continue;
 353             }
 354
 355           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 356               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 357               && nowtok != tok_comment_char && nowtok != tok_g0esc
 358               && nowtok != tok_g1esc && nowtok != tok_g2esc
 359               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 360               && nowtok != tok_include)
 361             {
 362               lr_error (cmfile, _("syntax error in prolog: %s"),
 363                         _("invalid definition"));
 364
 365               lr_ignore_rest (cmfile, 0);
 366               continue;
 367             }
 368
 369           /* We know that we need an argument.  */
 370           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 371
 372           switch (nowtok)
 373             {
 374             case tok_code_set_name:
 375             case tok_repertoiremap:
 376               if (arg->tok != tok_ident && arg->tok != tok_string)
 377                 {
 378                 badarg:
 379                   lr_error (cmfile, _("syntax error in prolog: %s"),
 380                             _("bad argument"));
 381
 382                   lr_ignore_rest (cmfile, 0);
 383                   continue;
 384                 }
 385
 386               if (nowtok == tok_code_set_name)
 387                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 388                                                        arg->val.str.startmb,
 389                                                        arg->val.str.lenmb);
 390               else
 391                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 392                                                        arg->val.str.startmb,
 393                                                        arg->val.str.lenmb);
 394
 395               lr_ignore_rest (cmfile, 1);
 396               continue;
 397
 398             case tok_mb_cur_max:
 399             case tok_mb_cur_min:
 400               if (arg->tok != tok_number)
 401                 goto badarg;
 402
 403               if ((nowtok == tok_mb_cur_max
 404                        && result->mb_cur_max != 0)
 405                       || (nowtok == tok_mb_cur_max
 406                           && result->mb_cur_max != 0))
 407                 lr_error (cmfile, _("duplicate definition of <%s>"),
 408                           nowtok == tok_mb_cur_min
 409                           ? "mb_cur_min" : "mb_cur_max");
 410
 411               if (arg->val.num < 1)
 412                 {
 413                   lr_error (cmfile,
 414                             _("value for <%s> must be 1 or greater"),
 415                             nowtok == tok_mb_cur_min
 416                             ? "mb_cur_min" : "mb_cur_max");
 417
 418                   lr_ignore_rest (cmfile, 0);
 419                   continue;
 420                 }
 421               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 422                    && (int) arg->val.num < result->mb_cur_min)
 423                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 424                       && (int) arg->val.num > result->mb_cur_max))
 425                 {
 426                   lr_error (cmfile, _("\
 427 value of <%s> must be greater or equal than the value of <%s>"),
 428                             "mb_cur_max", "mb_cur_min");
 429
 430                   lr_ignore_rest (cmfile, 0);
 431                   continue;
 432                 }
 433
 434               if (nowtok == tok_mb_cur_max)
 435                 result->mb_cur_max = arg->val.num;
 436               else
 437                 result->mb_cur_min = arg->val.num;
 438
 439               lr_ignore_rest (cmfile, 1);
 440               continue;
 441
 442             case tok_escape_char:
 443             case tok_comment_char:
 444               if (arg->tok != tok_ident)
 445                 goto badarg;
 446
 447               if (arg->val.str.lenmb != 1)
 448                 {
 449                   lr_error (cmfile, _("\
 450 argument to <%s> must be a single character"),
 451                             nowtok == tok_escape_char ? "escape_char"
 452                                                       : "comment_char");
 453
 454                   lr_ignore_rest (cmfile, 0);
 455                   continue;
 456                 }
 457
 458               if (nowtok == tok_escape_char)
 459                 cmfile->escape_char = *arg->val.str.startmb;
 460               else
 461                 cmfile->comment_char = *arg->val.str.startmb;
 462
 463               lr_ignore_rest (cmfile, 1);
 464               continue;
 465
 466             case tok_g0esc:
 467             case tok_g1esc:
 468             case tok_g2esc:
 469             case tok_g3esc:
 470             case tok_escseq:
 471               lr_ignore_rest (cmfile, 0); /* XXX */
 472               continue;
 473
 474             case tok_include:
 475               lr_error (cmfile, _("\
 476 character sets with locking states are not supported"));
 477               exit (4);
 478
 479             default:
 480               /* Cannot happen.  */
 481               assert (! "Should not happen");
 482             }
 483           break;
 484
 485         case 2:
 486           /* We have seen `CHARMAP' and now are in the body.  Each line
 487              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 488           if (nowtok == tok_eol)
 489             /* Ignore empty lines.  */
 490             continue;
 491
 492           if (nowtok == tok_end)
 493             {
 494               expected_tok = tok_charmap;
 495               expected_str = "CHARMAP";
 496               state = 90;
 497               continue;
 498             }
 499
 500           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 501             {
 502               lr_error (cmfile, _("syntax error in %s definition: %s"),
 503                         "CHARMAP", _("no symbolic name given"));
 504
 505               lr_ignore_rest (cmfile, 0);
 506               continue;
 507             }
 508
 509           /* If the previous line was not completely correct free the
 510              used memory.  */
 511           if (from_name != NULL)
 512             obstack_free (&result->mem_pool, from_name);
 513
 514           if (nowtok == tok_bsymbol)
 515             from_name = (char *) obstack_copy0 (&result->mem_pool,
 516                                                 now->val.str.startmb,
 517                                                 now->val.str.lenmb);
 518           else
 519             {
 520               obstack_printf (&result->mem_pool, "U%08X",
 521                               cmfile->token.val.ucs4);
 522               obstack_1grow (&result->mem_pool, '\0');
 523               from_name = (char *) obstack_finish (&result->mem_pool);
 524             }
 525           to_name = NULL;
 526
 527           state = 3;
 528           continue;
 529
 530         case 3:
 531           /* We have two possibilities: We can see an ellipsis or an
 532              encoding value.  */
 533           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 534               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 535               || nowtok == tok_ellipsis2_2)
 536             {
 537               ellipsis = nowtok;
 538               if (nowtok == tok_ellipsis4_2)
 539                 {
 540                   step = 2;
 541                   nowtok = tok_ellipsis4;
 542                 }
 543               else if (nowtok == tok_ellipsis2_2)
 544                 {
 545                   step = 2;
 546                   nowtok = tok_ellipsis2;
 547                 }
 548               state = 4;
 549               continue;
 550             }
 551           /* FALLTHROUGH */
 552
 553         case 5:
 554           if (nowtok != tok_charcode)
 555             {
 556               lr_error (cmfile, _("syntax error in %s definition: %s"),
 557                         "CHARMAP", _("invalid encoding given"));
 558
 559               lr_ignore_rest (cmfile, 0);
 560
 561               state = 2;
 562               continue;
 563             }
 564
 565           if (now->val.charcode.nbytes < result->mb_cur_min)
 566             lr_error (cmfile, _("too few bytes in character encoding"));
 567           else if (now->val.charcode.nbytes > result->mb_cur_max)
 568             lr_error (cmfile, _("too many bytes in character encoding"));
 569           else
 570             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 571                               now->val.charcode.bytes, from_name, to_name,
 572                               ellipsis != tok_ellipsis2, step);
 573
 574           /* Ignore trailing comment silently.  */
 575           lr_ignore_rest (cmfile, 0);
 576
 577           from_name = NULL;
 578           to_name = NULL;
 579           ellipsis = tok_none;
 580           step = 1;
 581
 582           state = 2;
 583           continue;
 584
 585         case 4:
 586           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 587             {
 588               lr_error (cmfile, _("syntax error in %s definition: %s"),
 589                         "CHARMAP",
 590                         _("no symbolic name given for end of range"));
 591
 592               lr_ignore_rest (cmfile, 0);
 593               continue;
 594             }
 595
 596           /* Copy the to-name in a safe place.  */
 597           if (nowtok == tok_bsymbol)
 598             to_name = (char *) obstack_copy0 (&result->mem_pool,
 599                                               cmfile->token.val.str.startmb,
 600                                               cmfile->token.val.str.lenmb);
 601           else
 602             {
 603               obstack_printf (&result->mem_pool, "U%08X",
 604                               cmfile->token.val.ucs4);
 605               obstack_1grow (&result->mem_pool, '\0');
 606               to_name = (char *) obstack_finish (&result->mem_pool);
 607             }
 608
 609           state = 5;
 610           continue;
 611
 612         case 90:
 613           if (nowtok != expected_tok)
 614             lr_error (cmfile, _("\
 615 %1$s: definition does not end with `END %1$s'"), expected_str);
 616
 617           lr_ignore_rest (cmfile, nowtok == expected_tok);
 618           state = 91;
 619           continue;
 620
 621         case 91:
 622           /* Waiting for WIDTH... */
 623           if (nowtok == tok_eol)
 624             /* Ignore empty lines.  */
 625             continue;
 626
 627           if (nowtok == tok_width_default)
 628             {
 629               state = 92;
 630               continue;
 631             }
 632
 633           if (nowtok == tok_width)
 634             {
 635               lr_ignore_rest (cmfile, 1);
 636               state = 93;
 637               continue;
 638             }
 639
 640           if (nowtok == tok_width_variable)
 641             {
 642               lr_ignore_rest (cmfile, 1);
 643               state = 98;
 644               continue;
 645             }
 646
 647           lr_error (cmfile, _("\
 648 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 649
 650           lr_ignore_rest (cmfile, 0);
 651           continue;
 652
 653         case 92:
 654           if (nowtok != tok_number)
 655             lr_error (cmfile, _("value for %s must be an integer"),
 656                       "WIDTH_DEFAULT");
 657           else
 658             result->width_default = now->val.num;
 659
 660           lr_ignore_rest (cmfile, nowtok == tok_number);
 661
 662           state = 91;
 663           continue;
 664
 665         case 93:
 666           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 667              "%s...%s %d\n".  */
 668           if (nowtok == tok_eol)
 669             /* ignore empty lines.  */
 670             continue;
 671
 672           if (nowtok == tok_end)
 673             {
 674               expected_tok = tok_width;
 675               expected_str = "WIDTH";
 676               state = 90;
 677               continue;
 678             }
 679
 680           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 681             {
 682               lr_error (cmfile, _("syntax error in %s definition: %s"),
 683                         "WIDTH", _("no symbolic name given"));
 684
 685               lr_ignore_rest (cmfile, 0);
 686               continue;
 687             }
 688
 689           if (from_name != NULL)
 690             obstack_free (&result->mem_pool, from_name);
 691
 692           if (nowtok == tok_bsymbol)
 693             from_name = (char *) obstack_copy0 (&result->mem_pool,
 694                                                 now->val.str.startmb,
 695                                                 now->val.str.lenmb);
 696           else
 697             {
 698               obstack_printf (&result->mem_pool, "U%08X",
 699                               cmfile->token.val.ucs4);
 700               obstack_1grow (&result->mem_pool, '\0');
 701               from_name = (char *) obstack_finish (&result->mem_pool);
 702             }
 703
 704           to_name = NULL;
 705
 706           state = 94;
 707           continue;
 708
 709         case 94:
 710           if (nowtok == tok_ellipsis3)
 711             {
 712               state = 95;
 713               continue;
 714             }
 715           /* Fall through.  */
 716
 717         case 96:
 718           if (nowtok != tok_number)
 719             lr_error (cmfile, _("value for %s must be an integer"),
 720                       "WIDTH");
 721           else
 722             {
 723               /* Store width for chars.  */
 724               new_width (cmfile, result, from_name, to_name, now->val.num);
 725
 726               from_name = NULL;
 727               to_name = NULL;
 728             }
 729
 730           lr_ignore_rest (cmfile, nowtok == tok_number);
 731
 732           state = 93;
 733           continue;
 734
 735         case 95:
 736           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 737             {
 738               lr_error (cmfile, _("syntax error in %s definition: %s"),
 739                         "WIDTH", _("no symbolic name given for end of range"));
 740
 741               lr_ignore_rest (cmfile, 0);
 742
 743               state = 93;
 744               continue;
 745             }
 746
 747           if (nowtok == tok_bsymbol)
 748             to_name = (char *) obstack_copy0 (&result->mem_pool,
 749                                               now->val.str.startmb,
 750                                               now->val.str.lenmb);
 751           else
 752             {
 753               obstack_printf (&result->mem_pool, "U%08X",
 754                               cmfile->token.val.ucs4);
 755               obstack_1grow (&result->mem_pool, '\0');
 756               to_name = (char *) obstack_finish (&result->mem_pool);
 757             }
 758
 759           state = 96;
 760           continue;
 761
 762         case 98:
 763           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 764              "%s\n" or "%s...%s\n".  */
 765           if (nowtok == tok_eol)
 766             /* ignore empty lines.  */
 767             continue;
 768
 769           if (nowtok == tok_end)
 770             {
 771               expected_tok = tok_width_variable;
 772               expected_str = "WIDTH_VARIABLE";
 773               state = 90;
 774               continue;
 775             }
 776
 777           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 778             {
 779               lr_error (cmfile, _("syntax error in %s definition: %s"),
 780                         "WIDTH_VARIABLE", _("no symbolic name given"));
 781
 782               lr_ignore_rest (cmfile, 0);
 783
 784               continue;
 785             }
 786
 787           if (from_name != NULL)
 788             obstack_free (&result->mem_pool, from_name);
 789
 790           if (nowtok == tok_bsymbol)
 791             from_name = (char *) obstack_copy0 (&result->mem_pool,
 792                                                 now->val.str.startmb,
 793                                                 now->val.str.lenmb);
 794           else
 795             {
 796               obstack_printf (&result->mem_pool, "U%08X",
 797                               cmfile->token.val.ucs4);
 798               obstack_1grow (&result->mem_pool, '\0');
 799               from_name = (char *) obstack_finish (&result->mem_pool);
 800             }
 801           to_name = NULL;
 802
 803           state = 99;
 804           continue;
 805
 806         case 99:
 807           if (nowtok == tok_ellipsis3)
 808             state = 100;
 809
 810           /* Store info.  */
 811           from_name = NULL;
 812
 813           /* Warn */
 814           state = 98;
 815           continue;
 816
 817         case 100:
 818           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 819             {
 820               lr_error (cmfile, _("syntax error in %s definition: %s"),
 821                         "WIDTH_VARIABLE",
 822                         _("no symbolic name given for end of range"));
 823               lr_ignore_rest (cmfile, 0);
 824               continue;
 825             }
 826
 827           if (nowtok == tok_bsymbol)
 828             to_name = (char *) obstack_copy0 (&result->mem_pool,
 829                                               now->val.str.startmb,
 830                                               now->val.str.lenmb);
 831           else
 832             {
 833               obstack_printf (&result->mem_pool, "U%08X",
 834                               cmfile->token.val.ucs4);
 835               obstack_1grow (&result->mem_pool, '\0');
 836               to_name = (char *) obstack_finish (&result->mem_pool);
 837             }
 838
 839           /* XXX Enter value into table.  */
 840
 841           lr_ignore_rest (cmfile, 1);
 842
 843           state = 98;
 844           continue;
 845
 846         default:
 847           record_error (5, 0, _("%s: error in state machine"),
 848                         __FILE__);
 849           /* NOTREACHED */
 850         }
 851       break;
 852     }
 853
 854   if (state != 91)
 855     record_error (0, 0, _("%s: premature end of file"),
 856                   cmfile->fname);
 857
 858   lr_close (cmfile);
 859
 860   return result;
 861 }
 862
 863
 864 static void
 865 new_width (struct linereader *cmfile, struct charmap_t *result,
 866            const char *from, const char *to, unsigned long int width)
 867 {
 868   struct charseq *from_val;
 869   struct charseq *to_val;
 870
 871   from_val = charmap_find_value (result, from, strlen (from));
 872   if (from_val == NULL)
 873     {
 874       lr_error (cmfile, _("unknown character `%s'"), from);
 875       return;
 876     }
 877
 878   if (to == NULL)
 879     to_val = from_val;
 880   else
 881     {
 882       to_val = charmap_find_value (result, to, strlen (to));
 883       if (to_val == NULL)
 884         {
 885           lr_error (cmfile, _("unknown character `%s'"), to);
 886           return;
 887         }
 888
 889       /* Make sure the number of bytes for the end points of the range
 890          is correct.  */
 891       if (from_val->nbytes != to_val->nbytes)
 892         {
 893           lr_error (cmfile, _("\
 894 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 895                     from_val->nbytes, to_val->nbytes);
 896           return;
 897         }
 898     }
 899
 900   if (result->nwidth_rules >= result->nwidth_rules_max)
 901     {
 902       size_t new_size = result->nwidth_rules + 32;
 903       struct width_rule *new_rules =
 904         (struct width_rule *) obstack_alloc (&result->mem_pool,
 905                                              (new_size
 906                                               * sizeof (struct width_rule)));
 907
 908       memcpy (new_rules, result->width_rules,
 909               result->nwidth_rules_max * sizeof (struct width_rule));
 910
 911       result->width_rules = new_rules;
 912       result->nwidth_rules_max = new_size;
 913     }
 914
 915   result->width_rules[result->nwidth_rules].from = from_val;
 916   result->width_rules[result->nwidth_rules].to = to_val;
 917   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 918   ++result->nwidth_rules;
 919 }
 920
 921
 922 struct charseq *
 923 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 924 {
 925   void *result;
 926
 927   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 928           < 0 ? NULL : (struct charseq *) result);
 929 }
 930
 931
 932 static void
 933 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 934                   size_t nbytes, unsigned char *bytes,
 935                   const char *from, const char *to,
 936                   int decimal_ellipsis, int step)
 937 {
 938   hash_table *ht = &cm->char_table;
 939   hash_table *bt = &cm->byte_table;
 940   struct obstack *ob = &cm->mem_pool;
 941   char *from_end;
 942   char *to_end;
 943   const char *cp;
 944   int prefix_len, len1, len2;
 945   unsigned int from_nr, to_nr, cnt;
 946   struct charseq *newp;
 947
 948   len1 = strlen (from);
 949
 950   if (to == NULL)
 951     {
 952       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 953       newp->nbytes = nbytes;
 954       memcpy (newp->bytes, bytes, nbytes);
 955       newp->name = from;
 956
 957       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 958       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 959         {
 960           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 961              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 962              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 963              this character and we don't have to consult the repertoire
 964              map.
 965
 966              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 967              and xxxxxxxx also give the code point in UCS4 but this must
 968              be in the private, i.e., unassigned, area.  This should be
 969              used for characters which do not (yet) have an equivalent
 970              in ISO 10646 and Unicode.  */
 971           char *endp;
 972
 973           errno = 0;
 974           newp->ucs4 = strtoul (from + 1, &endp, 16);
 975           if (endp - from != len1
 976               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 977               || newp->ucs4 >= 0x80000000)
 978             /* This wasn't successful.  Signal this name cannot be a
 979                correct UCS value.  */
 980             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 981         }
 982
 983       insert_entry (ht, from, len1, newp);
 984       insert_entry (bt, newp->bytes, nbytes, newp);
 985       /* Please note that it isn't a bug if a symbol is defined more
 986          than once.  All later definitions are simply discarded.  */
 987       return;
 988     }
 989
 990   /* We have a range: the names must have names with equal prefixes
 991      and an equal number of digits, where the second number is greater
 992      or equal than the first.  */
 993   len2 = strlen (to);
 994
 995   if (len1 != len2)
 996     {
 997     illegal_range:
 998       lr_error (lr, _("invalid names for character range"));
 999       return;
1000     }
1001
1002   cp = &from[len1 - 1];
1003   if (decimal_ellipsis)
1004     while (isdigit (*cp) && cp >= from)
1005       --cp;
1006   else
1007     while (isxdigit (*cp) && cp >= from)
1008       {
1009         if (!isdigit (*cp) && !isupper (*cp))
1010           lr_error (lr, _("\
1011 hexadecimal range format should use only capital characters"));
1012         --cp;
1013       }
1014
1015   prefix_len = (cp - from) + 1;
1016
1017   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1018     goto illegal_range;
1019
1020   errno = 0;
1021   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1022   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1023       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1024                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1025           && errno == ERANGE)
1026       || *to_end != '\0')
1027     {
1028       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1029       return;
1030     }
1031
1032   if (from_nr > to_nr)
1033     {
1034       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1035       return;
1036     }
1037
1038   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1039     {
1040       char *name_end;
1041       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1042                       prefix_len, from, len1 - prefix_len, cnt);
1043       obstack_1grow (ob, '\0');
1044       name_end = obstack_finish (ob);
1045
1046       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1047       newp->nbytes = nbytes;
1048       memcpy (newp->bytes, bytes, nbytes);
1049       newp->name = name_end;
1050
1051       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1052       if ((name_end[0] == 'U' || name_end[0] == 'P')
1053           && (len1 == 5 || len1 == 9))
1054         {
1055           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1056              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1057              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1058              this character and we don't have to consult the repertoire
1059              map.
1060
1061              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1062              and xxxxxxxx also give the code point in UCS4 but this must
1063              be in the private, i.e., unassigned, area.  This should be
1064              used for characters which do not (yet) have an equivalent
1065              in ISO 10646 and Unicode.  */
1066           char *endp;
1067
1068           errno = 0;
1069           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1070           if (endp - name_end != len1
1071               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1072               || newp->ucs4 >= 0x80000000)
1073             /* This wasn't successful.  Signal this name cannot be a
1074                correct UCS value.  */
1075             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1076         }
1077
1078       insert_entry (ht, name_end, len1, newp);
1079       insert_entry (bt, newp->bytes, nbytes, newp);
1080       /* Please note we don't examine the return value since it is no error
1081          if we have two definitions for a symbol.  */
1082
1083       /* Increment the value in the byte sequence.  */
1084       if (++bytes[nbytes - 1] == '\0')
1085         {
1086           int b = nbytes - 2;
1087
1088           do
1089             if (b < 0)
1090               {
1091                 lr_error (lr,
1092                           _("resulting bytes for range not representable."));
1093                 return;
1094               }
1095           while (++bytes[b--] == 0);
1096         }
1097     }
1098 }
1099
1100
1101 struct charseq *
1102 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1103                      size_t nbytes)
1104 {
1105   void *result;
1106
1107   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1108           < 0 ? NULL : (struct charseq *) result);
1109 }