locale/programs/linereader.c

   1 /* Copyright (C) 1996 Free Software Foundation, Inc.
   2 This file is part of the GNU C Library.
   3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
   4
   5 The GNU C Library is free software; you can redistribute it and/or
   6 modify it under the terms of the GNU Library General Public License as
   7 published by the Free Software Foundation; either version 2 of the
   8 License, or (at your option) any later version.
   9
  10 The GNU C Library is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 Library General Public License for more details.
  14
  15 You should have received a copy of the GNU Library General Public
  16 License along with the GNU C Library; see the file COPYING.LIB.  If
  17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18 Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "error.h"
  32 #include "linereader.h"
  33 #include "charset.h"
  34 #include "stringtrans.h"
  35
  36
  37 void *xmalloc (size_t __n);
  38 void *xrealloc (void *__p, size_t __n);
  39 char *xstrdup (const char *__str);
  40
  41
  42 static struct token *get_toplvl_escape (struct linereader *lr);
  43 static struct token *get_symname (struct linereader *lr);
  44 static struct token *get_ident (struct linereader *lr);
  45 static struct token *get_string (struct linereader *lr,
  46                                  const struct charset_t *charset);
  47
  48
  49 struct linereader *
  50 lr_open (const char *fname, kw_hash_fct_t hf)
  51 {
  52   FILE *fp;
  53   struct linereader *result;
  54   int n;
  55
  56   if (fname == NULL || strcmp (fname, "-") == 0
  57       || strcmp (fname, "/dev/stdin") == 0)
  58     fp = stdin;
  59   else
  60     {
  61       fp = fopen (fname, "r");
  62       if (fp == NULL)
  63         return NULL;
  64     }
  65
  66   result = (struct linereader *) xmalloc (sizeof (*result));
  67
  68   result->fp = fp;
  69   result->fname = xstrdup (fname ? : "<stdin>");
  70   result->buf = NULL;
  71   result->bufsize = 0;
  72   result->lineno = 1;
  73   result->idx = 0;
  74   result->comment_char = '#';
  75   result->escape_char = '\\';
  76   result->translate_strings = 1;
  77
  78   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  79   if (n < 0)
  80     {
  81       int save = errno;
  82       fclose (result->fp);
  83       free (result->fname);
  84       free (result);
  85       errno = save;
  86       return NULL;
  87     }
  88
  89   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
  90     n -= 2;
  91
  92   result->buf[n] = '\0';
  93   result->bufact = n;
  94   result->hash_fct = hf;
  95
  96   return result;
  97 }
  98
  99
 100 int
 101 lr_eof (struct linereader *lr)
 102 {
 103   return lr->bufact = 0;
 104 }
 105
 106
 107 void
 108 lr_close (struct linereader *lr)
 109 {
 110   fclose (lr->fp);
 111   free (lr->fname);
 112   free (lr->buf);
 113   free (lr);
 114 }
 115
 116
 117 int
 118 lr_next (struct linereader *lr)
 119 {
 120   int n;
 121
 122   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
 123   if (n < 0)
 124     return -1;
 125
 126   ++lr->lineno;
 127
 128   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
 129     {
 130       /* An escaped newline character is substituted with a single <SP>.  */
 131       --n;
 132       lr->buf[n - 1] = ' ';
 133     }
 134
 135   lr->buf[n] = '\0';
 136   lr->bufact = n;
 137   lr->idx = 0;
 138
 139   return 0;
 140 }
 141
 142
 143 /* Defined in error.c.  */
 144 /* This variable is incremented each time `error' is called.  */
 145 extern unsigned int error_message_count;
 146
 147 /* The calling program should define program_name and set it to the
 148    name of the executing program.  */
 149 extern char *program_name;
 150
 151
 152 struct token *
 153 lr_token (struct linereader *lr, const struct charset_t *charset)
 154 {
 155   int ch;
 156
 157   while (1)
 158     {
 159       do
 160         {
 161           ch = lr_getc (lr);
 162
 163           if (ch == '\n')
 164             {
 165               lr->token.tok = tok_eol;
 166               return &lr->token;
 167             }
 168         }
 169       while (isspace (ch));
 170
 171       if (ch == EOF)
 172         {
 173           lr->token.tok = tok_eof;
 174           return &lr->token;
 175         };
 176
 177       if (ch != lr->comment_char)
 178         break;
 179
 180       /* Ignore rest of line.  */
 181       lr_ignore_rest (lr, 0);
 182       lr->token.tok = tok_eol;
 183       return &lr->token;
 184     }
 185
 186   /* Match escape sequences.  */
 187   if (ch == lr->escape_char)
 188     return get_toplvl_escape (lr);
 189
 190   /* Match ellipsis.  */
 191   if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
 192     {
 193       lr_getc (lr);
 194       lr_getc (lr);
 195       lr->token.tok = tok_ellipsis;
 196       return &lr->token;
 197     }
 198
 199   switch (ch)
 200     {
 201     case '<':
 202       return get_symname (lr);
 203
 204     case '0' ... '9':
 205       lr->token.tok = tok_number;
 206       lr->token.val.num = ch - '0';
 207
 208       while (isdigit (ch = lr_getc (lr)))
 209         {
 210           lr->token.val.num *= 10;
 211           lr->token.val.num += ch - '0';
 212         }
 213       if (isalpha (ch))
 214         lr_error (lr, _("garbage at end of digit"));
 215       lr_ungetn (lr, 1);
 216
 217       return &lr->token;
 218
 219     case ';':
 220       lr->token.tok = tok_semicolon;
 221       return &lr->token;
 222
 223     case ',':
 224       lr->token.tok = tok_comma;
 225       return &lr->token;
 226
 227     case '(':
 228       lr->token.tok = tok_open_brace;
 229       return &lr->token;
 230
 231     case ')':
 232       lr->token.tok = tok_close_brace;
 233       return &lr->token;
 234
 235     case '"':
 236       return get_string (lr, charset);
 237
 238     case '-':
 239       ch = lr_getc (lr);
 240       if (ch == '1')
 241         {
 242           lr->token.tok = tok_minus1;
 243           return &lr->token;
 244         }
 245       lr_ungetn (lr, 2);
 246       break;
 247     }
 248
 249   return get_ident (lr);
 250 }
 251
 252
 253 static struct token *
 254 get_toplvl_escape (struct linereader *lr)
 255 {
 256   /* This is supposed to be a numeric value.  We return the
 257      numerical value and the number of bytes.  */
 258   size_t start_idx = lr->idx - 1;
 259   unsigned int value = 0;
 260   int nbytes = 0;
 261   int ch;
 262
 263   do
 264     {
 265       unsigned int byte = 0;
 266       unsigned int base = 8;
 267
 268       ch = lr_getc (lr);
 269
 270       if (ch == 'd')
 271         {
 272           base = 10;
 273           ch = lr_getc (lr);
 274         }
 275       else if (ch == 'x')
 276         {
 277           base = 16;
 278           ch = lr_getc (lr);
 279         }
 280
 281       if ((base == 16 && !isxdigit (ch))
 282           || (base != 16 && (ch < '0' || ch >= '0' + base)))
 283         {
 284         esc_error:
 285           lr->token.val.str.start = &lr->buf[start_idx];
 286
 287           while (ch != EOF || !isspace (ch))
 288             ch = lr_getc (lr);
 289           lr->token.val.str.len = lr->idx - start_idx;
 290
 291           lr->token.tok = tok_error;
 292           return &lr->token;
 293         }
 294
 295       if (isdigit (ch))
 296         byte = ch - '0';
 297       else
 298         byte = tolower (ch) - 'a' + 10;
 299
 300       ch = lr_getc (lr);
 301       if ((base == 16 && !isxdigit (ch))
 302           || (base != 16 && (ch < '0' || ch >= '0' + base)))
 303         goto esc_error;
 304
 305       byte *= base;
 306       if (isdigit (ch))
 307         byte += ch - '0';
 308       else
 309         byte += tolower (ch) - 'a' + 10;
 310
 311       ch = lr_getc (lr);
 312       if (base != 16 && isdigit (ch))
 313         {
 314           byte *= base;
 315           base += ch - '0';
 316
 317           ch = lr_getc (lr);
 318         }
 319
 320       value *= 256;
 321       value += byte;
 322
 323       ++nbytes;
 324     }
 325   while (ch == lr->escape_char && nbytes < 4);
 326
 327   if (!isspace (ch))
 328     lr_error (lr, _("garbage at end of character code specification"));
 329
 330   lr_ungetn (lr, 1);
 331
 332   lr->token.tok = tok_charcode;
 333   lr->token.val.charcode.val = value;
 334   lr->token.val.charcode.nbytes = nbytes;
 335
 336   return &lr->token;
 337 }
 338
 339
 340 #define ADDC(ch)                                                            \
 341   do                                                                        \
 342     {                                                                       \
 343       if (bufact == bufmax)                                                 \
 344         {                                                                   \
 345           bufmax *= 2;                                                      \
 346           buf = xrealloc (buf, bufmax);                                     \
 347         }                                                                   \
 348       buf[bufact++] = (ch);                                                 \
 349     }                                                                       \
 350   while (0)
 351
 352
 353 static struct token *
 354 get_symname (struct linereader *lr)
 355 {
 356   /* Symbol in brackets.  We must distinguish three kinds:
 357      1. reserved words
 358      2. ISO 10646 position values
 359      3. all other.  */
 360   char *buf;
 361   size_t bufact = 0;
 362   size_t bufmax = 56;
 363   const struct keyword_t *kw;
 364   int ch;
 365
 366   buf = (char *) xmalloc (bufmax);
 367
 368   do
 369     {
 370       ch = lr_getc (lr);
 371       if (ch == lr->escape_char)
 372         {
 373           int c2 = lr_getc (lr);
 374           ADDC (c2);
 375
 376           if (c2 == '\n')
 377             ch = '\n';
 378         }
 379       else
 380         ADDC (ch);
 381     }
 382   while (ch != '>' && ch != '\n');
 383
 384   if (ch == '\n')
 385     lr_error (lr, _("unterminated symbolic name"));
 386
 387   /* Test for ISO 10646 position value.  */
 388   if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
 389     {
 390       char *cp = buf + 1;
 391       while (cp < &buf[bufact - 1] && isxdigit (*cp))
 392         ++cp;
 393
 394       if (cp == &buf[bufact - 1])
 395         {
 396           /* Yes, it is.  */
 397           lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
 398           lr->token.val.charcode.val = strtoul (buf, NULL, 16);
 399           lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
 400
 401           return &lr->token;
 402         }
 403     }
 404
 405   /* It is a symbolic name.  Test for reserved words.  */
 406   kw = lr->hash_fct (buf, bufact - 1);
 407
 408   if (kw != NULL && kw->symname_or_ident == 1)
 409     {
 410       lr->token.tok = kw->token;
 411       free (buf);
 412     }
 413   else
 414     {
 415       lr->token.tok = tok_bsymbol;
 416
 417       buf[bufact] = '\0';
 418       buf = xrealloc (buf, bufact + 1);
 419
 420       lr->token.val.str.start = buf;
 421       lr->token.val.str.len = bufact - 1;
 422     }
 423
 424   return &lr->token;
 425 }
 426
 427
 428 static struct token *
 429 get_ident (struct linereader *lr)
 430 {
 431   char *buf;
 432   size_t bufact;
 433   size_t bufmax = 56;
 434   const struct keyword_t *kw;
 435   int ch;
 436
 437   buf = xmalloc (bufmax);
 438   bufact = 0;
 439
 440   ADDC (lr->buf[lr->idx - 1]);
 441
 442   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 443          && ch != '<' && ch != ',')
 444     /* XXX Handle escape sequences?  */
 445     ADDC (ch);
 446
 447   lr_ungetn (lr, 1);
 448
 449   kw = lr->hash_fct (buf, bufact);
 450
 451   if (kw != NULL && kw->symname_or_ident == 0)
 452     {
 453       lr->token.tok = kw->token;
 454       free (buf);
 455     }
 456   else
 457     {
 458       lr->token.tok = tok_ident;
 459
 460       buf[bufact] = '\0';
 461       buf = xrealloc (buf, bufact + 1);
 462
 463       lr->token.val.str.start = buf;
 464       lr->token.val.str.len = bufact;
 465     }
 466
 467   return &lr->token;
 468 }
 469
 470
 471 static struct token *
 472 get_string (struct linereader *lr, const struct charset_t *charset)
 473 {
 474   int illegal_string = 0;
 475   char *buf, *cp;
 476   size_t bufact;
 477   size_t bufmax = 56;
 478   int ch;
 479
 480   buf = xmalloc (bufmax);
 481   bufact = 0;
 482
 483   while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 484     if (ch != '<' || charset == NULL)
 485       {
 486         if (ch == lr->escape_char)
 487           {
 488             ch = lr_getc (lr);
 489             if (ch == '\n' || ch == EOF)
 490               break;
 491           }
 492         ADDC (ch);
 493       }
 494     else
 495       {
 496         /* We have to get the value of the symbol.  */
 497         unsigned int value;
 498         size_t startidx = bufact;
 499
 500         if (!lr->translate_strings)
 501           ADDC ('<');
 502
 503         while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 504           {
 505             if (ch == lr->escape_char)
 506               {
 507                 ch = lr_getc (lr);
 508                 if (ch == '\n' || ch == EOF)
 509                   break;
 510               }
 511             ADDC (ch);
 512           }
 513
 514         if (ch == '\n' || ch == EOF)
 515           lr_error (lr, _("unterminated string"));
 516         else
 517           if (!lr->translate_strings)
 518             ADDC ('>');
 519
 520         if (lr->translate_strings)
 521           {
 522             value = charset_find_value (charset, &buf[startidx],
 523                                         bufact - startidx);
 524             if (value == ILLEGAL_CHAR_VALUE)
 525               illegal_string = 1;
 526             bufact = startidx;
 527
 528             if (bufmax - bufact < 8)
 529               {
 530                 bufmax *= 2;
 531                 buf = (char *) xrealloc (buf, bufmax);
 532               }
 533
 534             cp = &buf[bufact];
 535             if (encode_char (value, &cp))
 536               illegal_string = 1;
 537
 538             bufact = cp - buf;
 539           }
 540       }
 541
 542   /* Catch errors with trailing escape character.  */
 543   if (bufact > 0 && buf[bufact - 1] == lr->escape_char
 544       && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
 545     {
 546       lr_error (lr, _("illegal escape sequence at end of string"));
 547       --bufact;
 548     }
 549   else if (ch == '\n' || ch == EOF)
 550     lr_error (lr, _("unterminated string"));
 551
 552   /* Terminate string if necessary.  */
 553   if (lr->translate_strings)
 554     {
 555       cp = &buf[bufact];
 556       if (encode_char (0, &cp))
 557         illegal_string = 1;
 558
 559       bufact = cp - buf;
 560     }
 561   else
 562     ADDC ('\0');
 563
 564   lr->token.tok = tok_string;
 565
 566   if (illegal_string)
 567     {
 568       free (buf);
 569       lr->token.val.str.start = NULL;
 570       lr->token.val.str.len = 0;
 571     }
 572   else
 573     {
 574       buf = xrealloc (buf, bufact + 1);
 575
 576       lr->token.val.str.start = buf;
 577       lr->token.val.str.len = bufact;
 578     }
 579
 580   return &lr->token;
 581 }