locale/programs/linereader.c

   1 /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "charmap.h"
  32 #include "error.h"
  33 #include "linereader.h"
  34 #include "localedef.h"
  35 #include "stringtrans.h"
  36
  37
  38 /* Prototypes for local functions.  */
  39 static struct token *get_toplvl_escape (struct linereader *lr);
  40 static struct token *get_symname (struct linereader *lr);
  41 static struct token *get_ident (struct linereader *lr);
  42 static struct token *get_string (struct linereader *lr,
  43                                  const struct charmap_t *charmap,
  44                                  const struct repertoire_t *repertoire);
  45
  46
  47 struct linereader *
  48 lr_open (const char *fname, kw_hash_fct_t hf)
  49 {
  50   FILE *fp;
  51   struct linereader *result;
  52   int n;
  53
  54   if (fname == NULL || strcmp (fname, "-") == 0
  55       || strcmp (fname, "/dev/stdin") == 0)
  56     fp = stdin;
  57   else
  58     {
  59       fp = fopen (fname, "r");
  60       if (fp == NULL)
  61         return NULL;
  62     }
  63
  64   result = (struct linereader *) xmalloc (sizeof (*result));
  65
  66   result->fp = fp;
  67   result->fname = xstrdup (fname ? : "<stdin>");
  68   result->buf = NULL;
  69   result->bufsize = 0;
  70   result->lineno = 1;
  71   result->idx = 0;
  72   result->comment_char = '#';
  73   result->escape_char = '\\';
  74   result->translate_strings = 1;
  75
  76   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  77   if (n < 0)
  78     {
  79       int save = errno;
  80       fclose (result->fp);
  81       free ((char *) result->fname);
  82       free (result);
  83       errno = save;
  84       return NULL;
  85     }
  86
  87   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
  88     n -= 2;
  89
  90   result->buf[n] = '\0';
  91   result->bufact = n;
  92   result->hash_fct = hf;
  93
  94   return result;
  95 }
  96
  97
  98 int
  99 lr_eof (struct linereader *lr)
 100 {
 101   return lr->bufact = 0;
 102 }
 103
 104
 105 void
 106 lr_close (struct linereader *lr)
 107 {
 108   fclose (lr->fp);
 109   free (lr->buf);
 110   free (lr);
 111 }
 112
 113
 114 int
 115 lr_next (struct linereader *lr)
 116 {
 117   int n;
 118
 119   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
 120   if (n < 0)
 121     return -1;
 122
 123   ++lr->lineno;
 124
 125   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
 126     {
 127 #if 0
 128       /* XXX Is this correct?  */
 129       /* An escaped newline character is substituted with a single <SP>.  */
 130       --n;
 131       lr->buf[n - 1] = ' ';
 132 #else
 133       n -= 2;
 134 #endif
 135     }
 136
 137   lr->buf[n] = '\0';
 138   lr->bufact = n;
 139   lr->idx = 0;
 140
 141   return 0;
 142 }
 143
 144
 145 /* Defined in error.c.  */
 146 /* This variable is incremented each time `error' is called.  */
 147 extern unsigned int error_message_count;
 148
 149 /* The calling program should define program_name and set it to the
 150    name of the executing program.  */
 151 extern char *program_name;
 152
 153
 154 struct token *
 155 lr_token (struct linereader *lr, const struct charmap_t *charmap,
 156           const struct repertoire_t *repertoire)
 157 {
 158   int ch;
 159
 160   while (1)
 161     {
 162       do
 163         {
 164           ch = lr_getc (lr);
 165
 166           if (ch == EOF)
 167             {
 168               lr->token.tok = tok_eof;
 169               return &lr->token;
 170             };
 171
 172           if (ch == '\n')
 173             {
 174               lr->token.tok = tok_eol;
 175               return &lr->token;
 176             }
 177         }
 178       while (isspace (ch));
 179
 180       if (ch == EOF)
 181         {
 182           lr->token.tok = tok_eof;
 183           return &lr->token;
 184         };
 185
 186       if (ch != lr->comment_char)
 187         break;
 188
 189       /* Ignore rest of line.  */
 190       lr_ignore_rest (lr, 0);
 191       lr->token.tok = tok_eol;
 192       return &lr->token;
 193     }
 194
 195   /* Match escape sequences.  */
 196   if (ch == lr->escape_char)
 197     return get_toplvl_escape (lr);
 198
 199   /* Match ellipsis.  */
 200   if (ch == '.')
 201     {
 202       if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
 203         {
 204           lr_getc (lr);
 205           lr_getc (lr);
 206           lr_getc (lr);
 207           lr->token.tok = tok_ellipsis4;
 208           return &lr->token;
 209         }
 210       if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
 211         {
 212           lr_getc (lr);
 213           lr_getc (lr);
 214           lr->token.tok = tok_ellipsis3;
 215           return &lr->token;
 216         }
 217       if (lr->buf[lr->idx] == '.')
 218         {
 219           lr_getc (lr);
 220           lr->token.tok = tok_ellipsis2;
 221           return &lr->token;
 222         }
 223     }
 224
 225   switch (ch)
 226     {
 227     case '<':
 228       return get_symname (lr);
 229
 230     case '0' ... '9':
 231       lr->token.tok = tok_number;
 232       lr->token.val.num = ch - '0';
 233
 234       while (isdigit (ch = lr_getc (lr)))
 235         {
 236           lr->token.val.num *= 10;
 237           lr->token.val.num += ch - '0';
 238         }
 239       if (isalpha (ch))
 240         lr_error (lr, _("garbage at end of number"));
 241       lr_ungetn (lr, 1);
 242
 243       return &lr->token;
 244
 245     case ';':
 246       lr->token.tok = tok_semicolon;
 247       return &lr->token;
 248
 249     case ',':
 250       lr->token.tok = tok_comma;
 251       return &lr->token;
 252
 253     case '(':
 254       lr->token.tok = tok_open_brace;
 255       return &lr->token;
 256
 257     case ')':
 258       lr->token.tok = tok_close_brace;
 259       return &lr->token;
 260
 261     case '"':
 262       return get_string (lr, charmap, repertoire);
 263
 264     case '-':
 265       ch = lr_getc (lr);
 266       if (ch == '1')
 267         {
 268           lr->token.tok = tok_minus1;
 269           return &lr->token;
 270         }
 271       lr_ungetn (lr, 2);
 272       break;
 273     }
 274
 275   return get_ident (lr);
 276 }
 277
 278
 279 static struct token *
 280 get_toplvl_escape (struct linereader *lr)
 281 {
 282   /* This is supposed to be a numeric value.  We return the
 283      numerical value and the number of bytes.  */
 284   size_t start_idx = lr->idx - 1;
 285   char *bytes = lr->token.val.charcode.bytes;
 286   int nbytes = 0;
 287   int ch;
 288
 289   do
 290     {
 291       unsigned int byte = 0;
 292       unsigned int base = 8;
 293
 294       ch = lr_getc (lr);
 295
 296       if (ch == 'd')
 297         {
 298           base = 10;
 299           ch = lr_getc (lr);
 300         }
 301       else if (ch == 'x')
 302         {
 303           base = 16;
 304           ch = lr_getc (lr);
 305         }
 306
 307       if ((base == 16 && !isxdigit (ch))
 308           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 309         {
 310         esc_error:
 311           lr->token.val.str.startmb = &lr->buf[start_idx];
 312
 313           while (ch != EOF && !isspace (ch))
 314             ch = lr_getc (lr);
 315           lr->token.val.str.lenmb = lr->idx - start_idx;
 316
 317           lr->token.tok = tok_error;
 318           return &lr->token;
 319         }
 320
 321       if (isdigit (ch))
 322         byte = ch - '0';
 323       else
 324         byte = tolower (ch) - 'a' + 10;
 325
 326       ch = lr_getc (lr);
 327       if ((base == 16 && !isxdigit (ch))
 328           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 329         goto esc_error;
 330
 331       byte *= base;
 332       if (isdigit (ch))
 333         byte += ch - '0';
 334       else
 335         byte += tolower (ch) - 'a' + 10;
 336
 337       ch = lr_getc (lr);
 338       if (base != 16 && isdigit (ch))
 339         {
 340           byte *= base;
 341           byte += ch - '0';
 342
 343           ch = lr_getc (lr);
 344         }
 345
 346       bytes[nbytes++] = byte;
 347     }
 348   while (ch == lr->escape_char && nbytes < 4);
 349
 350   if (!isspace (ch))
 351     lr_error (lr, _("garbage at end of character code specification"));
 352
 353   lr_ungetn (lr, 1);
 354
 355   lr->token.tok = tok_charcode;
 356   lr->token.val.charcode.nbytes = nbytes;
 357
 358   return &lr->token;
 359 }
 360
 361
 362 #define ADDC(ch) \
 363   do                                                                          \
 364     {                                                                         \
 365       if (bufact == bufmax)                                                   \
 366         {                                                                     \
 367           bufmax *= 2;                                                        \
 368           buf = xrealloc (buf, bufmax);                                       \
 369         }                                                                     \
 370       buf[bufact++] = (ch);                                                   \
 371     }                                                                         \
 372   while (0)
 373
 374
 375 #define ADDS(s, l) \
 376   do                                                                          \
 377     {                                                                         \
 378       size_t _l = (l);                                                        \
 379       if (bufact + _l > bufmax)                                               \
 380         {                                                                     \
 381           if (bufact < _l)                                                    \
 382             bufact = _l;                                                      \
 383           bufmax *= 2;                                                        \
 384           buf = xrealloc (buf, bufmax);                                       \
 385         }                                                                     \
 386       memcpy (&buf[bufact], s, _l);                                           \
 387       bufact += _l;                                                           \
 388     }                                                                         \
 389   while (0)
 390
 391
 392 #define ADDWC(ch) \
 393   do                                                                          \
 394     {                                                                         \
 395       if (buf2act == buf2max)                                                 \
 396         {                                                                     \
 397           buf2max *= 2;                                                       \
 398           buf2 = xrealloc (buf2, buf2max * 4);                                \
 399         }                                                                     \
 400       buf2[buf2act++] = (ch);                                                 \
 401     }                                                                         \
 402   while (0)
 403
 404
 405 static struct token *
 406 get_symname (struct linereader *lr)
 407 {
 408   /* Symbol in brackets.  We must distinguish three kinds:
 409      1. reserved words
 410      2. ISO 10646 position values
 411      3. all other.  */
 412   char *buf;
 413   size_t bufact = 0;
 414   size_t bufmax = 56;
 415   const struct keyword_t *kw;
 416   int ch;
 417
 418   buf = (char *) xmalloc (bufmax);
 419
 420   do
 421     {
 422       ch = lr_getc (lr);
 423       if (ch == lr->escape_char)
 424         {
 425           int c2 = lr_getc (lr);
 426           ADDC (c2);
 427
 428           if (c2 == '\n')
 429             ch = '\n';
 430         }
 431       else
 432         ADDC (ch);
 433     }
 434   while (ch != '>' && ch != '\n');
 435
 436   if (ch == '\n')
 437     lr_error (lr, _("unterminated symbolic name"));
 438
 439   /* Test for ISO 10646 position value.  */
 440   if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
 441     {
 442       char *cp = buf + 1;
 443       while (cp < &buf[bufact - 1] && isxdigit (*cp))
 444         ++cp;
 445
 446       if (cp == &buf[bufact - 1])
 447         {
 448           /* Yes, it is.  */
 449           lr->token.tok = tok_ucs4;
 450           lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
 451
 452           return &lr->token;
 453         }
 454     }
 455
 456   /* It is a symbolic name.  Test for reserved words.  */
 457   kw = lr->hash_fct (buf, bufact - 1);
 458
 459   if (kw != NULL && kw->symname_or_ident == 1)
 460     {
 461       lr->token.tok = kw->token;
 462       free (buf);
 463     }
 464   else
 465     {
 466       lr->token.tok = tok_bsymbol;
 467
 468       buf[bufact] = '\0';
 469       buf = xrealloc (buf, bufact + 1);
 470
 471       lr->token.val.str.startmb = buf;
 472       lr->token.val.str.lenmb = bufact - 1;
 473     }
 474
 475   return &lr->token;
 476 }
 477
 478
 479 static struct token *
 480 get_ident (struct linereader *lr)
 481 {
 482   char *buf;
 483   size_t bufact;
 484   size_t bufmax = 56;
 485   const struct keyword_t *kw;
 486   int ch;
 487
 488   buf = xmalloc (bufmax);
 489   bufact = 0;
 490
 491   ADDC (lr->buf[lr->idx - 1]);
 492
 493   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 494          && ch != '<' && ch != ',')
 495     {
 496       if (ch == lr->escape_char)
 497         {
 498           ch = lr_getc (lr);
 499           if (ch == '\n' || ch == EOF)
 500             {
 501               lr_error (lr, _("invalid escape sequence"));
 502               break;
 503             }
 504         }
 505       ADDC (ch);
 506     }
 507
 508   lr_ungetn (lr, 1);
 509
 510   kw = lr->hash_fct (buf, bufact);
 511
 512   if (kw != NULL && kw->symname_or_ident == 0)
 513     {
 514       lr->token.tok = kw->token;
 515       free (buf);
 516     }
 517   else
 518     {
 519       lr->token.tok = tok_ident;
 520
 521       buf[bufact] = '\0';
 522       buf = xrealloc (buf, bufact + 1);
 523
 524       lr->token.val.str.startmb = buf;
 525       lr->token.val.str.lenmb = bufact;
 526     }
 527
 528   return &lr->token;
 529 }
 530
 531
 532 static struct token *
 533 get_string (struct linereader *lr, const struct charmap_t *charmap,
 534             const struct repertoire_t *repertoire)
 535 {
 536   int return_widestr = lr->return_widestr;
 537   char *buf;
 538   wchar_t *buf2 = NULL;
 539   size_t bufact;
 540   size_t bufmax = 56;
 541
 542   /* We must return two different strings.  */
 543   buf = xmalloc (bufmax);
 544   bufact = 0;
 545
 546   /* We know it'll be a string.  */
 547   lr->token.tok = tok_string;
 548
 549   /* If we need not translate the strings (i.e., expand <...> parts)
 550      we can run a simple loop.  */
 551   if (!lr->translate_strings)
 552     {
 553       int ch;
 554
 555       buf2 = NULL;
 556       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 557         ADDC (ch);
 558
 559       /* Catch errors with trailing escape character.  */
 560       if (bufact > 0 && buf[bufact - 1] == lr->escape_char
 561           && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
 562         {
 563           lr_error (lr, _("illegal escape sequence at end of string"));
 564           --bufact;
 565         }
 566       else if (ch == '\n' || ch == EOF)
 567         lr_error (lr, _("unterminated string"));
 568
 569       ADDC ('\0');
 570     }
 571   else
 572     {
 573       int illegal_string = 0;
 574       size_t buf2act = 0;
 575       size_t buf2max = 56 * sizeof (uint32_t);
 576       int ch;
 577       int warned = 0;
 578
 579       /* We have to provide the wide character result as well.  */
 580       if (return_widestr)
 581         buf2 = xmalloc (buf2max);
 582
 583       /* Read until the end of the string (or end of the line or file).  */
 584       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 585         {
 586           size_t startidx;
 587           uint32_t wch;
 588           struct charseq *seq;
 589
 590           if (ch != '<')
 591             {
 592               /* The standards leave it up to the implementation to decide
 593                  what to do with character which stand for themself.  We
 594                  could jump through hoops to find out the value relative to
 595                  the charmap and the repertoire map, but instead we leave
 596                  it up to the locale definition author to write a better
 597                  definition.  We assume here that every character which
 598                  stands for itself is encoded using ISO 8859-1.  Using the
 599                  escape character is allowed.  */
 600               if (ch == lr->escape_char)
 601                 {
 602                   ch = lr_getc (lr);
 603                   if (ch == '\n' || ch == EOF)
 604                     break;
 605                 }
 606
 607               if (verbose && !warned)
 608                 {
 609                   lr_error (lr, _("\
 610 non-symbolic character value should not be used"));
 611                   warned = 1;
 612                 }
 613
 614               ADDC (ch);
 615               if (return_widestr)
 616                 ADDWC ((uint32_t) ch);
 617
 618               continue;
 619             }
 620
 621           /* Now we have to search for the end of the symbolic name, i.e.,
 622              the closing '>'.  */
 623           startidx = bufact;
 624           while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 625             {
 626               if (ch == lr->escape_char)
 627                 {
 628                   ch = lr_getc (lr);
 629                   if (ch == '\n' || ch == EOF)
 630                     break;
 631                 }
 632               ADDC (ch);
 633             }
 634           if (ch == '\n' || ch == EOF)
 635             /* Not a correct string.  */
 636             break;
 637           if (bufact == startidx)
 638             {
 639               /* <> is no correct name.  Ignore it and also signal an
 640                  error.  */
 641               illegal_string = 1;
 642               continue;
 643             }
 644
 645           /* It might be a Uxxxx symbol.  */
 646           if (buf[startidx] == 'U'
 647               && (bufact - startidx == 5 || bufact - startidx == 9))
 648             {
 649               char *cp = buf + startidx + 1;
 650               while (cp < &buf[bufact] && isxdigit (*cp))
 651                 ++cp;
 652
 653               if (cp == &buf[bufact])
 654                 {
 655                   const char *symbol = NULL;
 656
 657                   /* Yes, it is.  */
 658                   ADDC ('\0');
 659                   wch = strtoul (buf + startidx + 1, NULL, 16);
 660
 661                   /* Now forget about the name we just added.  */
 662                   bufact = startidx;
 663
 664                   if (return_widestr)
 665                     ADDWC (wch);
 666
 667                   /* Now determine from the repertoire the name of the
 668                      character and find it in the charmap.  */
 669                   if (repertoire != NULL)
 670                     symbol = repertoire_find_symbol (repertoire, wch);
 671
 672                   if (symbol == NULL)
 673                     {
 674                       /* We cannot generate a string since we cannot map
 675                          from the Unicode number to the character symbol.  */
 676                       lr_error (lr,
 677                                 _("character <U%0*X> not in repertoire map"),
 678                                 wch > 0xffff ? 8 : 4, wch);
 679
 680                       illegal_string = 1;
 681                     }
 682                   else
 683                     {
 684                       seq = charmap_find_value (charmap, symbol,
 685                                                 strlen (symbol));
 686
 687                       if (seq == NULL)
 688                         {
 689                           /* Not a known name.  */
 690                           lr_error (lr,
 691                                     _("symbol `%s' not in charmap"), symbol);
 692                           illegal_string = 1;
 693                         }
 694                       else
 695                         ADDS (seq->bytes, seq->nbytes);
 696                     }
 697
 698                   continue;
 699                 }
 700             }
 701
 702           if (return_widestr)
 703             {
 704               /* We now have the symbolic name in buf[startidx] to
 705                  buf[bufact-1].  Now find out the value for this
 706                  character in the repertoire map as well as in the
 707                  charmap (in this order).  */
 708               wch = repertoire_find_value (repertoire, &buf[startidx],
 709                                            bufact - startidx);
 710               if (wch == ILLEGAL_CHAR_VALUE)
 711                 {
 712                   /* This name is not in the repertoire map.  */
 713                   lr_error (lr, _("symbol `%.*s' not in repertoire map"),
 714                             bufact - startidx, &buf[startidx]);
 715                   illegal_string = 1;
 716                 }
 717               else
 718                 ADDWC (wch);
 719             }
 720
 721           /* Now the same for the multibyte representation.  */
 722           seq = charmap_find_value (charmap, &buf[startidx],
 723                                     bufact - startidx);
 724
 725           if (seq == NULL)
 726             {
 727               /* This name is not in the charmap.  */
 728               lr_error (lr, _("symbol `%.*s' not in charmap"),
 729                         bufact - startidx, &buf[startidx]);
 730               illegal_string = 1;
 731
 732               /* Now forget about the name we just added.  */
 733               bufact = startidx;
 734             }
 735           else
 736             {
 737               /* Now forget about the name we just added.  */
 738               bufact = startidx;
 739
 740               ADDS (seq->bytes, seq->nbytes);
 741             }
 742         }
 743
 744       if (ch == '\n' || ch == EOF)
 745         {
 746           lr_error (lr, _("unterminated string"));
 747           illegal_string = 1;
 748         }
 749
 750       if (illegal_string)
 751         {
 752           free (buf);
 753           if (buf2 != NULL)
 754             free (buf2);
 755           lr->token.val.str.startmb = NULL;
 756           lr->token.val.str.lenmb = 0;
 757
 758           return &lr->token;
 759         }
 760
 761       ADDC ('\0');
 762
 763       if (return_widestr)
 764         {
 765           ADDWC (0);
 766           lr->token.val.str.startwc = xrealloc (buf2,
 767                                                 buf2act * sizeof (uint32_t));
 768           lr->token.val.str.lenwc = buf2act;
 769         }
 770     }
 771
 772   lr->token.val.str.startmb = xrealloc (buf, bufact);
 773   lr->token.val.str.lenmb = bufact;
 774
 775   return &lr->token;
 776 }