locale/programs/charmap.c

   1 /* Copyright (C) 1996-2014 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published
   7    by the Free Software Foundation; version 2 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <libintl.h>
  25 #include <limits.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <error.h>
  30 #include <stdint.h>
  31
  32 #include "localedef.h"
  33 #include "linereader.h"
  34 #include "charmap.h"
  35 #include "charmap-dir.h"
  36
  37 #include <assert.h>
  38
  39
  40 /* Define the lookup function.  */
  41 #include "charmap-kw.h"
  42
  43
/* Prototypes for local functions.  */

/* Runs the charmap state-machine parser on an already opened line
   reader and returns the resulting charmap.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
/* Records a width rule for the character FROM or the range FROM..TO.  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
/* Called by parse_charmap for every accepted CHARMAP body line; the
   definition is not part of this section of the file.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the loaded character map fails the
   ASCII compatibility test for the ISO C basic character set.  */
bool enc_not_ascii_compatible;


/* Only declared when NEED_NULL_POINTER is defined; no use is visible in
   this part of the file.  */
#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
  62
  63 static struct linereader *
  64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
  65 {
  66   FILE *fp;
  67
  68   fp = charmap_open (directory, name);
  69   if (fp == NULL)
  70     return NULL;
  71   else
  72     {
  73       size_t dlen = strlen (directory);
  74       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
  75       size_t nlen = strlen (name);
  76       char *pathname;
  77       char *p;
  78
  79       pathname = alloca (dlen + add_slash + nlen + 1);
  80       p = stpcpy (pathname, directory);
  81       if (add_slash)
  82         *p++ = '/';
  83       stpcpy (p, name);
  84
  85       return lr_create (fp, pathname, hf);
  86     }
  87 }
  88
  89 struct charmap_t *
  90 charmap_read (const char *filename, int verbose, int error_not_found,
  91               int be_quiet, int use_default)
  92 {
  93   struct charmap_t *result = NULL;
  94
  95   if (filename != NULL)
  96     {
  97       struct linereader *cmfile;
  98
  99       /* First try the name as found in the parameter.  */
 100       cmfile = lr_open (filename, charmap_hash);
 101       if (cmfile == NULL)
 102         {
 103           /* No successful.  So start looking through the directories
 104              in the I18NPATH if this is a simple name.  */
 105           if (strchr (filename, '/') == NULL)
 106             {
 107               char *i18npath = getenv ("I18NPATH");
 108               if (i18npath != NULL && *i18npath != '\0')
 109                 {
 110                   const size_t pathlen = strlen (i18npath);
 111                   char i18npathbuf[pathlen + 1];
 112                   char path[pathlen + sizeof ("/charmaps")];
 113                   char *next;
 114                   i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
 115
 116                   while (cmfile == NULL
 117                          && (next = strsep (&i18npath, ":")) != NULL)
 118                     {
 119                       stpcpy (stpcpy (path, next), "/charmaps");
 120                       cmfile = cmlr_open (path, filename, charmap_hash);
 121
 122                       if (cmfile == NULL)
 123                         /* Try without the "/charmaps" part.  */
 124                         cmfile = cmlr_open (next, filename, charmap_hash);
 125                     }
 126                 }
 127
 128               if (cmfile == NULL)
 129                 /* Try the default directory.  */
 130                 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
 131             }
 132         }
 133
 134       if (cmfile != NULL)
 135         result = parse_charmap (cmfile, verbose, be_quiet);
 136
 137       if (result == NULL && error_not_found)
 138         WITH_CUR_LOCALE (error (0, errno, _("\
 139 character map file `%s' not found"), filename));
 140     }
 141
 142   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
 143     {
 144       /* OK, one more try.  We also accept the names given to the
 145          character sets in the files.  Sometimes they differ from the
 146          file name.  */
 147       CHARMAP_DIR *dir;
 148
 149       dir = charmap_opendir (CHARMAP_PATH);
 150       if (dir != NULL)
 151         {
 152           const char *dirent;
 153
 154           while ((dirent = charmap_readdir (dir)) != NULL)
 155             {
 156               char **aliases;
 157               char **p;
 158               int found;
 159
 160               aliases = charmap_aliases (CHARMAP_PATH, dirent);
 161               found = 0;
 162               for (p = aliases; *p; p++)
 163                 if (strcasecmp (*p, filename) == 0)
 164                   {
 165                     found = 1;
 166                     break;
 167                   }
 168               charmap_free_aliases (aliases);
 169
 170               if (found)
 171                 {
 172                   struct linereader *cmfile;
 173
 174                   cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
 175                   if (cmfile != NULL)
 176                     result = parse_charmap (cmfile, verbose, be_quiet);
 177
 178                   break;
 179                 }
 180             }
 181
 182           charmap_closedir (dir);
 183         }
 184     }
 185
 186   if (result == NULL && DEFAULT_CHARMAP != NULL)
 187     {
 188       struct linereader *cmfile;
 189
 190       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
 191       if (cmfile != NULL)
 192         result = parse_charmap (cmfile, verbose, be_quiet);
 193
 194       if (result == NULL)
 195         WITH_CUR_LOCALE (error (4, errno, _("\
 196 default character map file `%s' not found"), DEFAULT_CHARMAP));
 197     }
 198
 199   if (result != NULL && result->code_set_name == NULL)
 200     /* The input file does not specify a code set name.  This
 201        shouldn't happen but we should cope with it.  */
 202     result->code_set_name = basename (filename);
 203
 204   /* Test of ASCII compatibility of locale encoding.
 205
 206      Verify that the encoding to be used in a locale is ASCII compatible,
 207      at least for the graphic characters, excluding the control characters,
 208      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
 209
 210      ISO C 99 section 7.17.(2) (about wchar_t):
 211        the null character shall have the code value zero and each member of
 212        the basic character set shall have a code value equal to its value
 213        when used as the lone character in an integer character constant.
 214      ISO C 99 section 5.2.1.(3):
 215        Both the basic source and basic execution character sets shall have
 216        the following members: the 26 uppercase letters of the Latin alphabet
 217             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 218        the 26 lowercase letters of the Latin alphabet
 219             a b c d e f g h i j k l m n o p q r s t u v w x y z
 220        the 10 decimal digits
 221             0 1 2 3 4 5 6 7 8 9
 222        the following 29 graphic characters
 223             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
 224        the space character, and control characters representing horizontal
 225        tab, vertical tab, and form feed.
 226
 227      Therefore, for all members of the "basic character set", the 'char' code
 228      must have the same value as the 'wchar_t' code, which in glibc is the
 229      same as the Unicode code, which for all of the enumerated characters
 230      is identical to the ASCII code. */
 231   if (result != NULL && use_default)
 232     {
 233       static const char basic_charset[] =
 234         {
 235           'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 236           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 237           'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 238           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 239           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 240           '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
 241           '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
 242           '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
 243         };
 244       int failed = 0;
 245       const char *p = basic_charset;
 246
 247       do
 248         {
 249           struct charseq *seq = charmap_find_symbol (result, p, 1);
 250
 251           if (seq == NULL || seq->ucs4 != (uint32_t) *p)
 252             failed = 1;
 253         }
 254       while (*p++ != '\0');
 255
 256       if (failed)
 257         {
 258           WITH_CUR_LOCALE (fprintf (stderr, _("\
 259 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
 260                                     result->code_set_name));
 261           enc_not_ascii_compatible = true;
 262         }
 263     }
 264
 265   return result;
 266 }
 267
 268
 269 static struct charmap_t *
 270 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 271 {
 272   struct charmap_t *result;
 273   int state;
 274   enum token_t expected_tok = tok_error;
 275   const char *expected_str = NULL;
 276   char *from_name = NULL;
 277   char *to_name = NULL;
 278   enum token_t ellipsis = 0;
 279   int step = 1;
 280
 281   /* We don't want symbolic names in string to be translated.  */
 282   cmfile->translate_strings = 0;
 283
 284   /* Allocate room for result.  */
 285   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
 286   memset (result, '\0', sizeof (struct charmap_t));
 287   /* The default DEFAULT_WIDTH is 1.  */
 288   result->width_default = 1;
 289
 290 #define obstack_chunk_alloc malloc
 291 #define obstack_chunk_free free
 292   obstack_init (&result->mem_pool);
 293
 294   if (init_hash (&result->char_table, 256)
 295       || init_hash (&result->byte_table, 256))
 296     {
 297       free (result);
 298       return NULL;
 299     }
 300
 301   /* We use a state machine to describe the charmap description file
 302      format.  */
 303   state = 1;
 304   while (1)
 305     {
 306       /* What's on?  */
 307       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
 308       enum token_t nowtok = now->tok;
 309       struct token *arg;
 310
 311       if (nowtok == tok_eof)
 312         break;
 313
 314       switch (state)
 315         {
 316         case 1:
 317           /* The beginning.  We expect the special declarations, EOL or
 318              `CHARMAP'.  */
 319           if (nowtok == tok_eol)
 320             /* Ignore empty lines.  */
 321             continue;
 322
 323           if (nowtok == tok_charmap)
 324             {
 325               from_name = NULL;
 326               to_name = NULL;
 327
 328               /* We have to set up the real work.  Fill in some
 329                  default values.  */
 330               if (result->mb_cur_max == 0)
 331                 result->mb_cur_max = 1;
 332               if (result->mb_cur_min == 0)
 333                 result->mb_cur_min = result->mb_cur_max;
 334               if (result->mb_cur_min > result->mb_cur_max)
 335                 {
 336                   if (!be_quiet)
 337                     WITH_CUR_LOCALE (error (0, 0, _("\
 338 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
 339                                             cmfile->fname));
 340
 341                   result->mb_cur_min = result->mb_cur_max;
 342                 }
 343
 344               lr_ignore_rest (cmfile, 1);
 345
 346               state = 2;
 347               continue;
 348             }
 349
 350           if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
 351               && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
 352               && nowtok != tok_comment_char && nowtok != tok_g0esc
 353               && nowtok != tok_g1esc && nowtok != tok_g2esc
 354               && nowtok != tok_g3esc && nowtok != tok_repertoiremap
 355               && nowtok != tok_include)
 356             {
 357               lr_error (cmfile, _("syntax error in prolog: %s"),
 358                         _("invalid definition"));
 359
 360               lr_ignore_rest (cmfile, 0);
 361               continue;
 362             }
 363
 364           /* We know that we need an argument.  */
 365           arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
 366
 367           switch (nowtok)
 368             {
 369             case tok_code_set_name:
 370             case tok_repertoiremap:
 371               if (arg->tok != tok_ident && arg->tok != tok_string)
 372                 {
 373                 badarg:
 374                   lr_error (cmfile, _("syntax error in prolog: %s"),
 375                             _("bad argument"));
 376
 377                   lr_ignore_rest (cmfile, 0);
 378                   continue;
 379                 }
 380
 381               if (nowtok == tok_code_set_name)
 382                 result->code_set_name = obstack_copy0 (&result->mem_pool,
 383                                                        arg->val.str.startmb,
 384                                                        arg->val.str.lenmb);
 385               else
 386                 result->repertoiremap = obstack_copy0 (&result->mem_pool,
 387                                                        arg->val.str.startmb,
 388                                                        arg->val.str.lenmb);
 389
 390               lr_ignore_rest (cmfile, 1);
 391               continue;
 392
 393             case tok_mb_cur_max:
 394             case tok_mb_cur_min:
 395               if (arg->tok != tok_number)
 396                 goto badarg;
 397
 398               if (verbose
 399                   && ((nowtok == tok_mb_cur_max
 400                        && result->mb_cur_max != 0)
 401                       || (nowtok == tok_mb_cur_max
 402                           && result->mb_cur_max != 0)))
 403                 lr_error (cmfile, _("duplicate definition of <%s>"),
 404                           nowtok == tok_mb_cur_min
 405                           ? "mb_cur_min" : "mb_cur_max");
 406
 407               if (arg->val.num < 1)
 408                 {
 409                   lr_error (cmfile,
 410                             _("value for <%s> must be 1 or greater"),
 411                             nowtok == tok_mb_cur_min
 412                             ? "mb_cur_min" : "mb_cur_max");
 413
 414                   lr_ignore_rest (cmfile, 0);
 415                   continue;
 416                 }
 417               if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
 418                    && (int) arg->val.num < result->mb_cur_min)
 419                   || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
 420                       && (int) arg->val.num > result->mb_cur_max))
 421                 {
 422                   lr_error (cmfile, _("\
 423 value of <%s> must be greater or equal than the value of <%s>"),
 424                             "mb_cur_max", "mb_cur_min");
 425
 426                   lr_ignore_rest (cmfile, 0);
 427                   continue;
 428                 }
 429
 430               if (nowtok == tok_mb_cur_max)
 431                 result->mb_cur_max = arg->val.num;
 432               else
 433                 result->mb_cur_min = arg->val.num;
 434
 435               lr_ignore_rest (cmfile, 1);
 436               continue;
 437
 438             case tok_escape_char:
 439             case tok_comment_char:
 440               if (arg->tok != tok_ident)
 441                 goto badarg;
 442
 443               if (arg->val.str.lenmb != 1)
 444                 {
 445                   lr_error (cmfile, _("\
 446 argument to <%s> must be a single character"),
 447                             nowtok == tok_escape_char ? "escape_char"
 448                                                       : "comment_char");
 449
 450                   lr_ignore_rest (cmfile, 0);
 451                   continue;
 452                 }
 453
 454               if (nowtok == tok_escape_char)
 455                 cmfile->escape_char = *arg->val.str.startmb;
 456               else
 457                 cmfile->comment_char = *arg->val.str.startmb;
 458
 459               lr_ignore_rest (cmfile, 1);
 460               continue;
 461
 462             case tok_g0esc:
 463             case tok_g1esc:
 464             case tok_g2esc:
 465             case tok_g3esc:
 466             case tok_escseq:
 467               lr_ignore_rest (cmfile, 0); /* XXX */
 468               continue;
 469
 470             case tok_include:
 471               lr_error (cmfile, _("\
 472 character sets with locking states are not supported"));
 473               exit (4);
 474
 475             default:
 476               /* Cannot happen.  */
 477               assert (! "Should not happen");
 478             }
 479           break;
 480
 481         case 2:
 482           /* We have seen `CHARMAP' and now are in the body.  Each line
 483              must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
 484           if (nowtok == tok_eol)
 485             /* Ignore empty lines.  */
 486             continue;
 487
 488           if (nowtok == tok_end)
 489             {
 490               expected_tok = tok_charmap;
 491               expected_str = "CHARMAP";
 492               state = 90;
 493               continue;
 494             }
 495
 496           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 497             {
 498               lr_error (cmfile, _("syntax error in %s definition: %s"),
 499                         "CHARMAP", _("no symbolic name given"));
 500
 501               lr_ignore_rest (cmfile, 0);
 502               continue;
 503             }
 504
 505           /* If the previous line was not completely correct free the
 506              used memory.  */
 507           if (from_name != NULL)
 508             obstack_free (&result->mem_pool, from_name);
 509
 510           if (nowtok == tok_bsymbol)
 511             from_name = (char *) obstack_copy0 (&result->mem_pool,
 512                                                 now->val.str.startmb,
 513                                                 now->val.str.lenmb);
 514           else
 515             {
 516               obstack_printf (&result->mem_pool, "U%08X",
 517                               cmfile->token.val.ucs4);
 518               obstack_1grow (&result->mem_pool, '\0');
 519               from_name = (char *) obstack_finish (&result->mem_pool);
 520             }
 521           to_name = NULL;
 522
 523           state = 3;
 524           continue;
 525
 526         case 3:
 527           /* We have two possibilities: We can see an ellipsis or an
 528              encoding value.  */
 529           if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
 530               || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
 531               || nowtok == tok_ellipsis2_2)
 532             {
 533               ellipsis = nowtok;
 534               if (nowtok == tok_ellipsis4_2)
 535                 {
 536                   step = 2;
 537                   nowtok = tok_ellipsis4;
 538                 }
 539               else if (nowtok == tok_ellipsis2_2)
 540                 {
 541                   step = 2;
 542                   nowtok = tok_ellipsis2;
 543                 }
 544               state = 4;
 545               continue;
 546             }
 547           /* FALLTHROUGH */
 548
 549         case 5:
 550           if (nowtok != tok_charcode)
 551             {
 552               lr_error (cmfile, _("syntax error in %s definition: %s"),
 553                         "CHARMAP", _("invalid encoding given"));
 554
 555               lr_ignore_rest (cmfile, 0);
 556
 557               state = 2;
 558               continue;
 559             }
 560
 561           if (now->val.charcode.nbytes < result->mb_cur_min)
 562             lr_error (cmfile, _("too few bytes in character encoding"));
 563           else if (now->val.charcode.nbytes > result->mb_cur_max)
 564             lr_error (cmfile, _("too many bytes in character encoding"));
 565           else
 566             charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 567                               now->val.charcode.bytes, from_name, to_name,
 568                               ellipsis != tok_ellipsis2, step);
 569
 570           /* Ignore trailing comment silently.  */
 571           lr_ignore_rest (cmfile, 0);
 572
 573           from_name = NULL;
 574           to_name = NULL;
 575           ellipsis = tok_none;
 576           step = 1;
 577
 578           state = 2;
 579           continue;
 580
 581         case 4:
 582           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 583             {
 584               lr_error (cmfile, _("syntax error in %s definition: %s"),
 585                         "CHARMAP",
 586                         _("no symbolic name given for end of range"));
 587
 588               lr_ignore_rest (cmfile, 0);
 589               continue;
 590             }
 591
 592           /* Copy the to-name in a safe place.  */
 593           if (nowtok == tok_bsymbol)
 594             to_name = (char *) obstack_copy0 (&result->mem_pool,
 595                                               cmfile->token.val.str.startmb,
 596                                               cmfile->token.val.str.lenmb);
 597           else
 598             {
 599               obstack_printf (&result->mem_pool, "U%08X",
 600                               cmfile->token.val.ucs4);
 601               obstack_1grow (&result->mem_pool, '\0');
 602               to_name = (char *) obstack_finish (&result->mem_pool);
 603             }
 604
 605           state = 5;
 606           continue;
 607
 608         case 90:
 609           if (nowtok != expected_tok)
 610             lr_error (cmfile, _("\
 611 %1$s: definition does not end with `END %1$s'"), expected_str);
 612
 613           lr_ignore_rest (cmfile, nowtok == expected_tok);
 614           state = 91;
 615           continue;
 616
 617         case 91:
 618           /* Waiting for WIDTH... */
 619           if (nowtok == tok_eol)
 620             /* Ignore empty lines.  */
 621             continue;
 622
 623           if (nowtok == tok_width_default)
 624             {
 625               state = 92;
 626               continue;
 627             }
 628
 629           if (nowtok == tok_width)
 630             {
 631               lr_ignore_rest (cmfile, 1);
 632               state = 93;
 633               continue;
 634             }
 635
 636           if (nowtok == tok_width_variable)
 637             {
 638               lr_ignore_rest (cmfile, 1);
 639               state = 98;
 640               continue;
 641             }
 642
 643           lr_error (cmfile, _("\
 644 only WIDTH definitions are allowed to follow the CHARMAP definition"));
 645
 646           lr_ignore_rest (cmfile, 0);
 647           continue;
 648
 649         case 92:
 650           if (nowtok != tok_number)
 651             lr_error (cmfile, _("value for %s must be an integer"),
 652                       "WIDTH_DEFAULT");
 653           else
 654             result->width_default = now->val.num;
 655
 656           lr_ignore_rest (cmfile, nowtok == tok_number);
 657
 658           state = 91;
 659           continue;
 660
 661         case 93:
 662           /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
 663              "%s...%s %d\n".  */
 664           if (nowtok == tok_eol)
 665             /* ignore empty lines.  */
 666             continue;
 667
 668           if (nowtok == tok_end)
 669             {
 670               expected_tok = tok_width;
 671               expected_str = "WIDTH";
 672               state = 90;
 673               continue;
 674             }
 675
 676           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 677             {
 678               lr_error (cmfile, _("syntax error in %s definition: %s"),
 679                         "WIDTH", _("no symbolic name given"));
 680
 681               lr_ignore_rest (cmfile, 0);
 682               continue;
 683             }
 684
 685           if (from_name != NULL)
 686             obstack_free (&result->mem_pool, from_name);
 687
 688           if (nowtok == tok_bsymbol)
 689             from_name = (char *) obstack_copy0 (&result->mem_pool,
 690                                                 now->val.str.startmb,
 691                                                 now->val.str.lenmb);
 692           else
 693             {
 694               obstack_printf (&result->mem_pool, "U%08X",
 695                               cmfile->token.val.ucs4);
 696               obstack_1grow (&result->mem_pool, '\0');
 697               from_name = (char *) obstack_finish (&result->mem_pool);
 698             }
 699
 700           to_name = NULL;
 701
 702           state = 94;
 703           continue;
 704
 705         case 94:
 706           if (nowtok == tok_ellipsis3)
 707             {
 708               state = 95;
 709               continue;
 710             }
 711
 712         case 96:
 713           if (nowtok != tok_number)
 714             lr_error (cmfile, _("value for %s must be an integer"),
 715                       "WIDTH");
 716           else
 717             {
 718               /* Store width for chars.  */
 719               new_width (cmfile, result, from_name, to_name, now->val.num);
 720
 721               from_name = NULL;
 722               to_name = NULL;
 723             }
 724
 725           lr_ignore_rest (cmfile, nowtok == tok_number);
 726
 727           state = 93;
 728           continue;
 729
 730         case 95:
 731           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 732             {
 733               lr_error (cmfile, _("syntax error in %s definition: %s"),
 734                         "WIDTH", _("no symbolic name given for end of range"));
 735
 736               lr_ignore_rest (cmfile, 0);
 737
 738               state = 93;
 739               continue;
 740             }
 741
 742           if (nowtok == tok_bsymbol)
 743             to_name = (char *) obstack_copy0 (&result->mem_pool,
 744                                               now->val.str.startmb,
 745                                               now->val.str.lenmb);
 746           else
 747             {
 748               obstack_printf (&result->mem_pool, "U%08X",
 749                               cmfile->token.val.ucs4);
 750               obstack_1grow (&result->mem_pool, '\0');
 751               to_name = (char *) obstack_finish (&result->mem_pool);
 752             }
 753
 754           state = 96;
 755           continue;
 756
 757         case 98:
 758           /* We now expect `END WIDTH_VARIABLE' or lines of the format
 759              "%s\n" or "%s...%s\n".  */
 760           if (nowtok == tok_eol)
 761             /* ignore empty lines.  */
 762             continue;
 763
 764           if (nowtok == tok_end)
 765             {
 766               expected_tok = tok_width_variable;
 767               expected_str = "WIDTH_VARIABLE";
 768               state = 90;
 769               continue;
 770             }
 771
 772           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 773             {
 774               lr_error (cmfile, _("syntax error in %s definition: %s"),
 775                         "WIDTH_VARIABLE", _("no symbolic name given"));
 776
 777               lr_ignore_rest (cmfile, 0);
 778
 779               continue;
 780             }
 781
 782           if (from_name != NULL)
 783             obstack_free (&result->mem_pool, from_name);
 784
 785           if (nowtok == tok_bsymbol)
 786             from_name = (char *) obstack_copy0 (&result->mem_pool,
 787                                                 now->val.str.startmb,
 788                                                 now->val.str.lenmb);
 789           else
 790             {
 791               obstack_printf (&result->mem_pool, "U%08X",
 792                               cmfile->token.val.ucs4);
 793               obstack_1grow (&result->mem_pool, '\0');
 794               from_name = (char *) obstack_finish (&result->mem_pool);
 795             }
 796           to_name = NULL;
 797
 798           state = 99;
 799           continue;
 800
 801         case 99:
 802           if (nowtok == tok_ellipsis3)
 803             state = 100;
 804
 805           /* Store info.  */
 806           from_name = NULL;
 807
 808           /* Warn */
 809           state = 98;
 810           continue;
 811
 812         case 100:
 813           if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
 814             {
 815               lr_error (cmfile, _("syntax error in %s definition: %s"),
 816                         "WIDTH_VARIABLE",
 817                         _("no symbolic name given for end of range"));
 818               lr_ignore_rest (cmfile, 0);
 819               continue;
 820             }
 821
 822           if (nowtok == tok_bsymbol)
 823             to_name = (char *) obstack_copy0 (&result->mem_pool,
 824                                               now->val.str.startmb,
 825                                               now->val.str.lenmb);
 826           else
 827             {
 828               obstack_printf (&result->mem_pool, "U%08X",
 829                               cmfile->token.val.ucs4);
 830               obstack_1grow (&result->mem_pool, '\0');
 831               to_name = (char *) obstack_finish (&result->mem_pool);
 832             }
 833
 834           /* XXX Enter value into table.  */
 835
 836           lr_ignore_rest (cmfile, 1);
 837
 838           state = 98;
 839           continue;
 840
 841         default:
 842           WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
 843                                   __FILE__));
 844           /* NOTREACHED */
 845         }
 846       break;
 847     }
 848
 849   if (state != 91 && !be_quiet)
 850     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
 851                             cmfile->fname));
 852
 853   lr_close (cmfile);
 854
 855   return result;
 856 }
 857
 858
 859 static void
 860 new_width (struct linereader *cmfile, struct charmap_t *result,
 861            const char *from, const char *to, unsigned long int width)
 862 {
 863   struct charseq *from_val;
 864   struct charseq *to_val;
 865
 866   from_val = charmap_find_value (result, from, strlen (from));
 867   if (from_val == NULL)
 868     {
 869       lr_error (cmfile, _("unknown character `%s'"), from);
 870       return;
 871     }
 872
 873   if (to == NULL)
 874     to_val = from_val;
 875   else
 876     {
 877       to_val = charmap_find_value (result, to, strlen (to));
 878       if (to_val == NULL)
 879         {
 880           lr_error (cmfile, _("unknown character `%s'"), to);
 881           return;
 882         }
 883
 884       /* Make sure the number of bytes for the end points of the range
 885          is correct.  */
 886       if (from_val->nbytes != to_val->nbytes)
 887         {
 888           lr_error (cmfile, _("\
 889 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
 890                     from_val->nbytes, to_val->nbytes);
 891           return;
 892         }
 893     }
 894
 895   if (result->nwidth_rules >= result->nwidth_rules_max)
 896     {
 897       size_t new_size = result->nwidth_rules + 32;
 898       struct width_rule *new_rules =
 899         (struct width_rule *) obstack_alloc (&result->mem_pool,
 900                                              (new_size
 901                                               * sizeof (struct width_rule)));
 902
 903       memcpy (new_rules, result->width_rules,
 904               result->nwidth_rules_max * sizeof (struct width_rule));
 905
 906       result->width_rules = new_rules;
 907       result->nwidth_rules_max = new_size;
 908     }
 909
 910   result->width_rules[result->nwidth_rules].from = from_val;
 911   result->width_rules[result->nwidth_rules].to = to_val;
 912   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
 913   ++result->nwidth_rules;
 914 }
 915
 916
 917 struct charseq *
 918 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 919 {
 920   void *result;
 921
 922   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
 923           < 0 ? NULL : (struct charseq *) result);
 924 }
 925
 926
 927 static void
 928 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 929                   size_t nbytes, unsigned char *bytes,
 930                   const char *from, const char *to,
 931                   int decimal_ellipsis, int step)
 932 {
 933   hash_table *ht = &cm->char_table;
 934   hash_table *bt = &cm->byte_table;
 935   struct obstack *ob = &cm->mem_pool;
 936   char *from_end;
 937   char *to_end;
 938   const char *cp;
 939   int prefix_len, len1, len2;
 940   unsigned int from_nr, to_nr, cnt;
 941   struct charseq *newp;
 942
 943   len1 = strlen (from);
 944
 945   if (to == NULL)
 946     {
 947       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
 948       newp->nbytes = nbytes;
 949       memcpy (newp->bytes, bytes, nbytes);
 950       newp->name = from;
 951
 952       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 953       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
 954         {
 955           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
 956              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
 957              we use the value of xxxx or xxxxxxxx as the UCS4 value of
 958              this character and we don't have to consult the repertoire
 959              map.
 960
 961              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
 962              and xxxxxxxx also give the code point in UCS4 but this must
 963              be in the private, i.e., unassigned, area.  This should be
 964              used for characters which do not (yet) have an equivalent
 965              in ISO 10646 and Unicode.  */
 966           char *endp;
 967
 968           errno = 0;
 969           newp->ucs4 = strtoul (from + 1, &endp, 16);
 970           if (endp - from != len1
 971               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
 972               || newp->ucs4 >= 0x80000000)
 973             /* This wasn't successful.  Signal this name cannot be a
 974                correct UCS value.  */
 975             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
 976         }
 977
 978       insert_entry (ht, from, len1, newp);
 979       insert_entry (bt, newp->bytes, nbytes, newp);
 980       /* Please note that it isn't a bug if a symbol is defined more
 981          than once.  All later definitions are simply discarded.  */
 982       return;
 983     }
 984
 985   /* We have a range: the names must have names with equal prefixes
 986      and an equal number of digits, where the second number is greater
 987      or equal than the first.  */
 988   len2 = strlen (to);
 989
 990   if (len1 != len2)
 991     {
 992     illegal_range:
 993       lr_error (lr, _("invalid names for character range"));
 994       return;
 995     }
 996
 997   cp = &from[len1 - 1];
 998   if (decimal_ellipsis)
 999     while (isdigit (*cp) && cp >= from)
1000       --cp;
1001   else
1002     while (isxdigit (*cp) && cp >= from)
1003       {
1004         if (!isdigit (*cp) && !isupper (*cp))
1005           lr_error (lr, _("\
1006 hexadecimal range format should use only capital characters"));
1007         --cp;
1008       }
1009
1010   prefix_len = (cp - from) + 1;
1011
1012   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1013     goto illegal_range;
1014
1015   errno = 0;
1016   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1017   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1018       || ((to_nr = strtoul (&to[prefix_len], &to_end,
1019                             decimal_ellipsis ? 10 : 16)) == UINT_MAX
1020           && errno == ERANGE)
1021       || *to_end != '\0')
1022     {
1023       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1024       return;
1025     }
1026
1027   if (from_nr > to_nr)
1028     {
1029       lr_error (lr, _("upper limit in range is smaller than lower limit"));
1030       return;
1031     }
1032
1033   for (cnt = from_nr; cnt <= to_nr; cnt += step)
1034     {
1035       char *name_end;
1036       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1037                       prefix_len, from, len1 - prefix_len, cnt);
1038       obstack_1grow (ob, '\0');
1039       name_end = obstack_finish (ob);
1040
1041       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1042       newp->nbytes = nbytes;
1043       memcpy (newp->bytes, bytes, nbytes);
1044       newp->name = name_end;
1045
1046       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1047       if ((name_end[0] == 'U' || name_end[0] == 'P')
1048           && (len1 == 5 || len1 == 9))
1049         {
1050           /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1051              xxxx and xxxxxxxx are hexadecimal numbers.  In this case
1052              we use the value of xxxx or xxxxxxxx as the UCS4 value of
1053              this character and we don't have to consult the repertoire
1054              map.
1055
1056              If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1057              and xxxxxxxx also give the code point in UCS4 but this must
1058              be in the private, i.e., unassigned, area.  This should be
1059              used for characters which do not (yet) have an equivalent
1060              in ISO 10646 and Unicode.  */
1061           char *endp;
1062
1063           errno = 0;
1064           newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1065           if (endp - name_end != len1
1066               || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1067               || newp->ucs4 >= 0x80000000)
1068             /* This wasn't successful.  Signal this name cannot be a
1069                correct UCS value.  */
1070             newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1071         }
1072
1073       insert_entry (ht, name_end, len1, newp);
1074       insert_entry (bt, newp->bytes, nbytes, newp);
1075       /* Please note we don't examine the return value since it is no error
1076          if we have two definitions for a symbol.  */
1077
1078       /* Increment the value in the byte sequence.  */
1079       if (++bytes[nbytes - 1] == '\0')
1080         {
1081           int b = nbytes - 2;
1082
1083           do
1084             if (b < 0)
1085               {
1086                 lr_error (lr,
1087                           _("resulting bytes for range not representable."));
1088                 return;
1089               }
1090           while (++bytes[b--] == 0);
1091         }
1092     }
1093 }
1094
1095
1096 struct charseq *
1097 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1098                      size_t nbytes)
1099 {
1100   void *result;
1101
1102   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1103           < 0 ? NULL : (struct charseq *) result);
1104 }