locale/programs/linereader.c

   1 /* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "charmap.h"
  32 #include "error.h"
  33 #include "linereader.h"
  34 #include "localedef.h"
  35
  36
  37 /* Prototypes for local functions.  */
  38 static struct token *get_toplvl_escape (struct linereader *lr);
  39 static struct token *get_symname (struct linereader *lr);
  40 static struct token *get_ident (struct linereader *lr);
  41 static struct token *get_string (struct linereader *lr,
  42                                  const struct charmap_t *charmap,
  43                                  const struct repertoire_t *repertoire);
  44
  45
  46 struct linereader *
  47 lr_open (const char *fname, kw_hash_fct_t hf)
  48 {
  49   FILE *fp;
  50   struct linereader *result;
  51   int n;
  52
  53   if (fname == NULL || strcmp (fname, "-") == 0
  54       || strcmp (fname, "/dev/stdin") == 0)
  55     fp = stdin;
  56   else
  57     {
  58       fp = fopen (fname, "r");
  59       if (fp == NULL)
  60         return NULL;
  61     }
  62
  63   result = (struct linereader *) xmalloc (sizeof (*result));
  64
  65   result->fp = fp;
  66   result->fname = xstrdup (fname ? : "<stdin>");
  67   result->buf = NULL;
  68   result->bufsize = 0;
  69   result->lineno = 1;
  70   result->idx = 0;
  71   result->comment_char = '#';
  72   result->escape_char = '\\';
  73   result->translate_strings = 1;
  74
  75   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  76   if (n < 0)
  77     {
  78       int save = errno;
  79       fclose (result->fp);
  80       free ((char *) result->fname);
  81       free (result);
  82       errno = save;
  83       return NULL;
  84     }
  85
  86   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
  87     n -= 2;
  88
  89   result->buf[n] = '\0';
  90   result->bufact = n;
  91   result->hash_fct = hf;
  92
  93   return result;
  94 }
  95
  96
  97 int
  98 lr_eof (struct linereader *lr)
  99 {
 100   return lr->bufact = 0;
 101 }
 102
 103
 104 void
 105 lr_close (struct linereader *lr)
 106 {
 107   fclose (lr->fp);
 108   free (lr->buf);
 109   free (lr);
 110 }
 111
 112
 113 int
 114 lr_next (struct linereader *lr)
 115 {
 116   int n;
 117
 118   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
 119   if (n < 0)
 120     return -1;
 121
 122   ++lr->lineno;
 123
 124   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
 125     {
 126 #if 0
 127       /* XXX Is this correct?  */
 128       /* An escaped newline character is substituted with a single <SP>.  */
 129       --n;
 130       lr->buf[n - 1] = ' ';
 131 #else
 132       n -= 2;
 133 #endif
 134     }
 135
 136   lr->buf[n] = '\0';
 137   lr->bufact = n;
 138   lr->idx = 0;
 139
 140   return 0;
 141 }
 142
 143
 144 /* Defined in error.c.  */
 145 /* This variable is incremented each time `error' is called.  */
 146 extern unsigned int error_message_count;
 147
 148 /* The calling program should define program_name and set it to the
 149    name of the executing program.  */
 150 extern char *program_name;
 151
 152
 153 struct token *
 154 lr_token (struct linereader *lr, const struct charmap_t *charmap,
 155           const struct repertoire_t *repertoire)
 156 {
 157   int ch;
 158
 159   while (1)
 160     {
 161       do
 162         {
 163           ch = lr_getc (lr);
 164
 165           if (ch == EOF)
 166             {
 167               lr->token.tok = tok_eof;
 168               return &lr->token;
 169             };
 170
 171           if (ch == '\n')
 172             {
 173               lr->token.tok = tok_eol;
 174               return &lr->token;
 175             }
 176         }
 177       while (isspace (ch));
 178
 179       if (ch == EOF)
 180         {
 181           lr->token.tok = tok_eof;
 182           return &lr->token;
 183         };
 184
 185       if (ch != lr->comment_char)
 186         break;
 187
 188       /* Is there an newline at the end of the buffer?  */
 189       if (lr->buf[lr->bufact - 1] != '\n')
 190         {
 191           /* No.  Some people want this to mean that only the line in
 192              the file not the logical, concatenated line is ignored.
 193              Let's try this.  */
 194           lr->idx = lr->bufact;
 195           continue;
 196         }
 197
 198       /* Ignore rest of line.  */
 199       lr_ignore_rest (lr, 0);
 200       lr->token.tok = tok_eol;
 201       return &lr->token;
 202     }
 203
 204   /* Match escape sequences.  */
 205   if (ch == lr->escape_char)
 206     return get_toplvl_escape (lr);
 207
 208   /* Match ellipsis.  */
 209   if (ch == '.')
 210     {
 211       if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
 212         {
 213           int cnt;
 214           for (cnt = 0; cnt < 10; ++cnt)
 215             lr_getc (lr);
 216           lr->token.tok = tok_ellipsis4_2;
 217           return &lr->token;
 218         }
 219       if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
 220         {
 221           lr_getc (lr);
 222           lr_getc (lr);
 223           lr_getc (lr);
 224           lr->token.tok = tok_ellipsis4;
 225           return &lr->token;
 226         }
 227       if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
 228         {
 229           lr_getc (lr);
 230           lr_getc (lr);
 231           lr->token.tok = tok_ellipsis3;
 232           return &lr->token;
 233         }
 234       if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
 235         {
 236           int cnt;
 237           for (cnt = 0; cnt < 6; ++cnt)
 238             lr_getc (lr);
 239           lr->token.tok = tok_ellipsis2_2;
 240           return &lr->token;
 241         }
 242       if (lr->buf[lr->idx] == '.')
 243         {
 244           lr_getc (lr);
 245           lr->token.tok = tok_ellipsis2;
 246           return &lr->token;
 247         }
 248     }
 249
 250   switch (ch)
 251     {
 252     case '<':
 253       return get_symname (lr);
 254
 255     case '0' ... '9':
 256       lr->token.tok = tok_number;
 257       lr->token.val.num = ch - '0';
 258
 259       while (isdigit (ch = lr_getc (lr)))
 260         {
 261           lr->token.val.num *= 10;
 262           lr->token.val.num += ch - '0';
 263         }
 264       if (isalpha (ch))
 265         lr_error (lr, _("garbage at end of number"));
 266       lr_ungetn (lr, 1);
 267
 268       return &lr->token;
 269
 270     case ';':
 271       lr->token.tok = tok_semicolon;
 272       return &lr->token;
 273
 274     case ',':
 275       lr->token.tok = tok_comma;
 276       return &lr->token;
 277
 278     case '(':
 279       lr->token.tok = tok_open_brace;
 280       return &lr->token;
 281
 282     case ')':
 283       lr->token.tok = tok_close_brace;
 284       return &lr->token;
 285
 286     case '"':
 287       return get_string (lr, charmap, repertoire);
 288
 289     case '-':
 290       ch = lr_getc (lr);
 291       if (ch == '1')
 292         {
 293           lr->token.tok = tok_minus1;
 294           return &lr->token;
 295         }
 296       lr_ungetn (lr, 2);
 297       break;
 298     }
 299
 300   return get_ident (lr);
 301 }
 302
 303
 304 static struct token *
 305 get_toplvl_escape (struct linereader *lr)
 306 {
 307   /* This is supposed to be a numeric value.  We return the
 308      numerical value and the number of bytes.  */
 309   size_t start_idx = lr->idx - 1;
 310   char *bytes = lr->token.val.charcode.bytes;
 311   int nbytes = 0;
 312   int ch;
 313
 314   do
 315     {
 316       unsigned int byte = 0;
 317       unsigned int base = 8;
 318
 319       ch = lr_getc (lr);
 320
 321       if (ch == 'd')
 322         {
 323           base = 10;
 324           ch = lr_getc (lr);
 325         }
 326       else if (ch == 'x')
 327         {
 328           base = 16;
 329           ch = lr_getc (lr);
 330         }
 331
 332       if ((base == 16 && !isxdigit (ch))
 333           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 334         {
 335         esc_error:
 336           lr->token.val.str.startmb = &lr->buf[start_idx];
 337
 338           while (ch != EOF && !isspace (ch))
 339             ch = lr_getc (lr);
 340           lr->token.val.str.lenmb = lr->idx - start_idx;
 341
 342           lr->token.tok = tok_error;
 343           return &lr->token;
 344         }
 345
 346       if (isdigit (ch))
 347         byte = ch - '0';
 348       else
 349         byte = tolower (ch) - 'a' + 10;
 350
 351       ch = lr_getc (lr);
 352       if ((base == 16 && !isxdigit (ch))
 353           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 354         goto esc_error;
 355
 356       byte *= base;
 357       if (isdigit (ch))
 358         byte += ch - '0';
 359       else
 360         byte += tolower (ch) - 'a' + 10;
 361
 362       ch = lr_getc (lr);
 363       if (base != 16 && isdigit (ch))
 364         {
 365           byte *= base;
 366           byte += ch - '0';
 367
 368           ch = lr_getc (lr);
 369         }
 370
 371       bytes[nbytes++] = byte;
 372     }
 373   while (ch == lr->escape_char && nbytes < 4);
 374
 375   if (!isspace (ch))
 376     lr_error (lr, _("garbage at end of character code specification"));
 377
 378   lr_ungetn (lr, 1);
 379
 380   lr->token.tok = tok_charcode;
 381   lr->token.val.charcode.nbytes = nbytes;
 382
 383   return &lr->token;
 384 }
 385
 386
 387 #define ADDC(ch) \
 388   do                                                                          \
 389     {                                                                         \
 390       if (bufact == bufmax)                                                   \
 391         {                                                                     \
 392           bufmax *= 2;                                                        \
 393           buf = xrealloc (buf, bufmax);                                       \
 394         }                                                                     \
 395       buf[bufact++] = (ch);                                                   \
 396     }                                                                         \
 397   while (0)
 398
 399
 400 #define ADDS(s, l) \
 401   do                                                                          \
 402     {                                                                         \
 403       size_t _l = (l);                                                        \
 404       if (bufact + _l > bufmax)                                               \
 405         {                                                                     \
 406           if (bufact < _l)                                                    \
 407             bufact = _l;                                                      \
 408           bufmax *= 2;                                                        \
 409           buf = xrealloc (buf, bufmax);                                       \
 410         }                                                                     \
 411       memcpy (&buf[bufact], s, _l);                                           \
 412       bufact += _l;                                                           \
 413     }                                                                         \
 414   while (0)
 415
 416
 417 #define ADDWC(ch) \
 418   do                                                                          \
 419     {                                                                         \
 420       if (buf2act == buf2max)                                                 \
 421         {                                                                     \
 422           buf2max *= 2;                                                       \
 423           buf2 = xrealloc (buf2, buf2max * 4);                                \
 424         }                                                                     \
 425       buf2[buf2act++] = (ch);                                                 \
 426     }                                                                         \
 427   while (0)
 428
 429
 430 static struct token *
 431 get_symname (struct linereader *lr)
 432 {
 433   /* Symbol in brackets.  We must distinguish three kinds:
 434      1. reserved words
 435      2. ISO 10646 position values
 436      3. all other.  */
 437   char *buf;
 438   size_t bufact = 0;
 439   size_t bufmax = 56;
 440   const struct keyword_t *kw;
 441   int ch;
 442
 443   buf = (char *) xmalloc (bufmax);
 444
 445   do
 446     {
 447       ch = lr_getc (lr);
 448       if (ch == lr->escape_char)
 449         {
 450           int c2 = lr_getc (lr);
 451           ADDC (c2);
 452
 453           if (c2 == '\n')
 454             ch = '\n';
 455         }
 456       else
 457         ADDC (ch);
 458     }
 459   while (ch != '>' && ch != '\n');
 460
 461   if (ch == '\n')
 462     lr_error (lr, _("unterminated symbolic name"));
 463
 464   /* Test for ISO 10646 position value.  */
 465   if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
 466     {
 467       char *cp = buf + 1;
 468       while (cp < &buf[bufact - 1] && isxdigit (*cp))
 469         ++cp;
 470
 471       if (cp == &buf[bufact - 1])
 472         {
 473           /* Yes, it is.  */
 474           lr->token.tok = tok_ucs4;
 475           lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
 476
 477           return &lr->token;
 478         }
 479     }
 480
 481   /* It is a symbolic name.  Test for reserved words.  */
 482   kw = lr->hash_fct (buf, bufact - 1);
 483
 484   if (kw != NULL && kw->symname_or_ident == 1)
 485     {
 486       lr->token.tok = kw->token;
 487       free (buf);
 488     }
 489   else
 490     {
 491       lr->token.tok = tok_bsymbol;
 492
 493       buf[bufact] = '\0';
 494       buf = xrealloc (buf, bufact + 1);
 495
 496       lr->token.val.str.startmb = buf;
 497       lr->token.val.str.lenmb = bufact - 1;
 498     }
 499
 500   return &lr->token;
 501 }
 502
 503
 504 static struct token *
 505 get_ident (struct linereader *lr)
 506 {
 507   char *buf;
 508   size_t bufact;
 509   size_t bufmax = 56;
 510   const struct keyword_t *kw;
 511   int ch;
 512
 513   buf = xmalloc (bufmax);
 514   bufact = 0;
 515
 516   ADDC (lr->buf[lr->idx - 1]);
 517
 518   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 519          && ch != '<' && ch != ',')
 520     {
 521       if (ch == lr->escape_char)
 522         {
 523           ch = lr_getc (lr);
 524           if (ch == '\n' || ch == EOF)
 525             {
 526               lr_error (lr, _("invalid escape sequence"));
 527               break;
 528             }
 529         }
 530       ADDC (ch);
 531     }
 532
 533   lr_ungetn (lr, 1);
 534
 535   kw = lr->hash_fct (buf, bufact);
 536
 537   if (kw != NULL && kw->symname_or_ident == 0)
 538     {
 539       lr->token.tok = kw->token;
 540       free (buf);
 541     }
 542   else
 543     {
 544       lr->token.tok = tok_ident;
 545
 546       buf[bufact] = '\0';
 547       buf = xrealloc (buf, bufact + 1);
 548
 549       lr->token.val.str.startmb = buf;
 550       lr->token.val.str.lenmb = bufact;
 551     }
 552
 553   return &lr->token;
 554 }
 555
 556
 557 static struct token *
 558 get_string (struct linereader *lr, const struct charmap_t *charmap,
 559             const struct repertoire_t *repertoire)
 560 {
 561   int return_widestr = lr->return_widestr;
 562   char *buf;
 563   wchar_t *buf2 = NULL;
 564   size_t bufact;
 565   size_t bufmax = 56;
 566
 567   /* We must return two different strings.  */
 568   buf = xmalloc (bufmax);
 569   bufact = 0;
 570
 571   /* We know it'll be a string.  */
 572   lr->token.tok = tok_string;
 573
 574   /* If we need not translate the strings (i.e., expand <...> parts)
 575      we can run a simple loop.  */
 576   if (!lr->translate_strings)
 577     {
 578       int ch;
 579
 580       buf2 = NULL;
 581       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 582         ADDC (ch);
 583
 584       /* Catch errors with trailing escape character.  */
 585       if (bufact > 0 && buf[bufact - 1] == lr->escape_char
 586           && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
 587         {
 588           lr_error (lr, _("illegal escape sequence at end of string"));
 589           --bufact;
 590         }
 591       else if (ch == '\n' || ch == EOF)
 592         lr_error (lr, _("unterminated string"));
 593
 594       ADDC ('\0');
 595     }
 596   else
 597     {
 598       int illegal_string = 0;
 599       size_t buf2act = 0;
 600       size_t buf2max = 56 * sizeof (uint32_t);
 601       int ch;
 602       int warned = 0;
 603
 604       /* We have to provide the wide character result as well.  */
 605       if (return_widestr)
 606         buf2 = xmalloc (buf2max);
 607
 608       /* Read until the end of the string (or end of the line or file).  */
 609       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 610         {
 611           size_t startidx;
 612           uint32_t wch;
 613           struct charseq *seq;
 614
 615           if (ch != '<')
 616             {
 617               /* The standards leave it up to the implementation to decide
 618                  what to do with character which stand for themself.  We
 619                  could jump through hoops to find out the value relative to
 620                  the charmap and the repertoire map, but instead we leave
 621                  it up to the locale definition author to write a better
 622                  definition.  We assume here that every character which
 623                  stands for itself is encoded using ISO 8859-1.  Using the
 624                  escape character is allowed.  */
 625               if (ch == lr->escape_char)
 626                 {
 627                   ch = lr_getc (lr);
 628                   if (ch == '\n' || ch == EOF)
 629                     break;
 630                 }
 631
 632               if (verbose && !warned)
 633                 {
 634                   lr_error (lr, _("\
 635 non-symbolic character value should not be used"));
 636                   warned = 1;
 637                 }
 638
 639               ADDC (ch);
 640               if (return_widestr)
 641                 ADDWC ((uint32_t) ch);
 642
 643               continue;
 644             }
 645
 646           /* Now we have to search for the end of the symbolic name, i.e.,
 647              the closing '>'.  */
 648           startidx = bufact;
 649           while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 650             {
 651               if (ch == lr->escape_char)
 652                 {
 653                   ch = lr_getc (lr);
 654                   if (ch == '\n' || ch == EOF)
 655                     break;
 656                 }
 657               ADDC (ch);
 658             }
 659           if (ch == '\n' || ch == EOF)
 660             /* Not a correct string.  */
 661             break;
 662           if (bufact == startidx)
 663             {
 664               /* <> is no correct name.  Ignore it and also signal an
 665                  error.  */
 666               illegal_string = 1;
 667               continue;
 668             }
 669
 670           /* It might be a Uxxxx symbol.  */
 671           if (buf[startidx] == 'U'
 672               && (bufact - startidx == 5 || bufact - startidx == 9))
 673             {
 674               char *cp = buf + startidx + 1;
 675               while (cp < &buf[bufact] && isxdigit (*cp))
 676                 ++cp;
 677
 678               if (cp == &buf[bufact])
 679                 {
 680                   char utmp[10];
 681                   const char *symbol = NULL;
 682
 683                   /* Yes, it is.  */
 684                   ADDC ('\0');
 685                   wch = strtoul (buf + startidx + 1, NULL, 16);
 686
 687                   /* Now forget about the name we just added.  */
 688                   bufact = startidx;
 689
 690                   if (return_widestr)
 691                     ADDWC (wch);
 692
 693                   /* See whether the charmap contains the Uxxxxxxxx names.  */
 694                   snprintf (utmp, sizeof (utmp), "U%08X", wch);
 695                   seq = charmap_find_value (charmap, utmp, 9);
 696
 697                   if (seq == NULL)
 698                     {
 699                      /* No, this isn't the case.  Now determine from
 700                         the repertoire the name of the character and
 701                         find it in the charmap.  */
 702                       if (repertoire != NULL)
 703                         symbol = repertoire_find_symbol (repertoire, wch);
 704
 705                       if (symbol == NULL)
 706                         /* We cannot generate a string since we
 707                            cannot map from the Unicode number to the
 708                            character symbol.  */
 709                         illegal_string = 1;
 710                       else
 711                         {
 712                           seq = charmap_find_value (charmap, symbol,
 713                                                     strlen (symbol));
 714
 715                           if (seq == NULL)
 716                             /* Not a known name.  */
 717                             illegal_string = 1;
 718                         }
 719                     }
 720
 721                   if (seq != NULL)
 722                     ADDS (seq->bytes, seq->nbytes);
 723
 724                   continue;
 725                 }
 726             }
 727
 728           /* We now have the symbolic name in buf[startidx] to
 729              buf[bufact-1].  Now find out the value for this character
 730              in the charmap as well as in the repertoire map (in this
 731              order).  */
 732           seq = charmap_find_value (charmap, &buf[startidx],
 733                                     bufact - startidx);
 734
 735           if (seq == NULL)
 736             {
 737               /* This name is not in the charmap.  */
 738               lr_error (lr, _("symbol `%.*s' not in charmap"),
 739                         (int) (bufact - startidx), &buf[startidx]);
 740               illegal_string = 1;
 741             }
 742
 743           if (return_widestr)
 744             {
 745               /* Now the same for the multibyte representation.  */
 746               if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
 747                 wch = seq->ucs4;
 748               else
 749                 {
 750                   wch = repertoire_find_value (repertoire, &buf[startidx],
 751                                                bufact - startidx);
 752                   if (seq != NULL)
 753                     seq->ucs4 = wch;
 754                 }
 755
 756               if (wch == ILLEGAL_CHAR_VALUE)
 757                 {
 758                   /* This name is not in the repertoire map.  */
 759                   lr_error (lr, _("symbol `%.*s' not in repertoire map"),
 760                             (int) (bufact - startidx), &buf[startidx]);
 761                   illegal_string = 1;
 762                 }
 763               else
 764                 ADDWC (wch);
 765             }
 766
 767           /* Now forget about the name we just added.  */
 768           bufact = startidx;
 769
 770           /* And copy the bytes.  */
 771           if (seq != NULL)
 772             ADDS (seq->bytes, seq->nbytes);
 773         }
 774
 775       if (ch == '\n' || ch == EOF)
 776         {
 777           lr_error (lr, _("unterminated string"));
 778           illegal_string = 1;
 779         }
 780
 781       if (illegal_string)
 782         {
 783           free (buf);
 784           if (buf2 != NULL)
 785             free (buf2);
 786           lr->token.val.str.startmb = NULL;
 787           lr->token.val.str.lenmb = 0;
 788
 789           return &lr->token;
 790         }
 791
 792       ADDC ('\0');
 793
 794       if (return_widestr)
 795         {
 796           ADDWC (0);
 797           lr->token.val.str.startwc = xrealloc (buf2,
 798                                                 buf2act * sizeof (uint32_t));
 799           lr->token.val.str.lenwc = buf2act;
 800         }
 801     }
 802
 803   lr->token.val.str.startmb = xrealloc (buf, bufact);
 804   lr->token.val.str.lenmb = bufact;
 805
 806   return &lr->token;
 807 }