Sun Jun 2 20:14:30 1996 Andreas Schwab <schwab@issan.informatik.uni-dortmund.de>
[glibc.git] / locale / programs / linereader.c
blob68508dff0c6fbad824d3b96e439c62d1458cce9d
1 /* Copyright (C) 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "error.h"
32 #include "linereader.h"
33 #include "charset.h"
34 #include "stringtrans.h"
37 void *xmalloc (size_t __n);
38 void *xrealloc (void *__p, size_t __n);
39 char *xstrdup (const char *__str);
42 static struct token *get_toplvl_escape (struct linereader *lr);
43 static struct token *get_symname (struct linereader *lr);
44 static struct token *get_ident (struct linereader *lr);
45 static struct token *get_string (struct linereader *lr,
46 const struct charset_t *charset);
49 struct linereader *
50 lr_open (const char *fname, kw_hash_fct_t hf)
52 FILE *fp;
53 struct linereader *result;
54 int n;
56 if (fname == NULL || strcmp (fname, "-") == 0
57 || strcmp (fname, "/dev/stdin") == 0)
58 fp = stdin;
59 else
61 fp = fopen (fname, "r");
62 if (fp == NULL)
63 return NULL;
66 result = (struct linereader *) xmalloc (sizeof (*result));
68 result->fp = fp;
69 result->fname = xstrdup (fname ? : "<stdin>");
70 result->buf = NULL;
71 result->bufsize = 0;
72 result->lineno = 1;
73 result->idx = 0;
74 result->comment_char = '#';
75 result->escape_char = '\\';
76 result->translate_strings = 1;
78 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
79 if (n < 0)
81 int save = errno;
82 fclose (result->fp);
83 free (result->fname);
84 free (result);
85 errno = save;
86 return NULL;
89 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
90 n -= 2;
92 result->buf[n] = '\0';
93 result->bufact = n;
94 result->hash_fct = hf;
96 return result;
101 lr_eof (struct linereader *lr)
103 return lr->bufact = 0;
107 void
108 lr_close (struct linereader *lr)
110 fclose (lr->fp);
111 free (lr->fname);
112 free (lr->buf);
113 free (lr);
118 lr_next (struct linereader *lr)
120 int n;
122 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
123 if (n < 0)
124 return -1;
126 ++lr->lineno;
128 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
130 /* An escaped newline character is substituted with a single <SP>. */
131 --n;
132 lr->buf[n - 1] = ' ';
135 lr->buf[n] = '\0';
136 lr->bufact = n;
137 lr->idx = 0;
139 return 0;
143 /* Defined in error.c. */
144 /* This variable is incremented each time `error' is called. */
145 extern unsigned int error_message_count;
147 /* The calling program should define program_name and set it to the
148 name of the executing program. */
149 extern char *program_name;
152 struct token *
153 lr_token (struct linereader *lr, const struct charset_t *charset)
155 int ch;
157 while (1)
161 ch = lr_getc (lr);
163 if (ch == '\n')
165 lr->token.tok = tok_eol;
166 return &lr->token;
169 while (isspace (ch));
171 if (ch == EOF)
173 lr->token.tok = tok_eof;
174 return &lr->token;
177 if (ch != lr->comment_char)
178 break;
180 /* Ignore rest of line. */
181 lr_ignore_rest (lr, 0);
182 lr->token.tok = tok_eol;
183 return &lr->token;
186 /* Match escape sequences. */
187 if (ch == lr->escape_char)
188 return get_toplvl_escape (lr);
190 /* Match ellipsis. */
191 if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
193 lr_getc (lr);
194 lr_getc (lr);
195 lr->token.tok = tok_ellipsis;
196 return &lr->token;
199 switch (ch)
201 case '<':
202 return get_symname (lr);
204 case '0' ... '9':
205 lr->token.tok = tok_number;
206 lr->token.val.num = ch - '0';
208 while (isdigit (ch = lr_getc (lr)))
210 lr->token.val.num *= 10;
211 lr->token.val.num += ch - '0';
213 if (isalpha (ch))
214 lr_error (lr, _("garbage at end of digit"));
215 lr_ungetn (lr, 1);
217 return &lr->token;
219 case ';':
220 lr->token.tok = tok_semicolon;
221 return &lr->token;
223 case ',':
224 lr->token.tok = tok_comma;
225 return &lr->token;
227 case '(':
228 lr->token.tok = tok_open_brace;
229 return &lr->token;
231 case ')':
232 lr->token.tok = tok_close_brace;
233 return &lr->token;
235 case '"':
236 return get_string (lr, charset);
238 case '-':
239 ch = lr_getc (lr);
240 if (ch == '1')
242 lr->token.tok = tok_minus1;
243 return &lr->token;
245 lr_ungetn (lr, 2);
246 break;
249 return get_ident (lr);
253 static struct token *
254 get_toplvl_escape (struct linereader *lr)
256 /* This is supposed to be a numeric value. We return the
257 numerical value and the number of bytes. */
258 size_t start_idx = lr->idx - 1;
259 unsigned int value = 0;
260 int nbytes = 0;
261 int ch;
265 unsigned int byte = 0;
266 unsigned int base = 8;
268 ch = lr_getc (lr);
270 if (ch == 'd')
272 base = 10;
273 ch = lr_getc (lr);
275 else if (ch == 'x')
277 base = 16;
278 ch = lr_getc (lr);
281 if ((base == 16 && !isxdigit (ch))
282 || (base != 16 && (ch < '0' || ch >= '0' + base)))
284 esc_error:
285 lr->token.val.str.start = &lr->buf[start_idx];
287 while (ch != EOF || !isspace (ch))
288 ch = lr_getc (lr);
289 lr->token.val.str.len = lr->idx - start_idx;
291 lr->token.tok = tok_error;
292 return &lr->token;
295 if (isdigit (ch))
296 byte = ch - '0';
297 else
298 byte = tolower (ch) - 'a' + 10;
300 ch = lr_getc (lr);
301 if ((base == 16 && !isxdigit (ch))
302 || (base != 16 && (ch < '0' || ch >= '0' + base)))
303 goto esc_error;
305 byte *= base;
306 if (isdigit (ch))
307 byte += ch - '0';
308 else
309 byte += tolower (ch) - 'a' + 10;
311 ch = lr_getc (lr);
312 if (base != 16 && isdigit (ch))
314 byte *= base;
315 base += ch - '0';
317 ch = lr_getc (lr);
320 value *= 256;
321 value += byte;
323 ++nbytes;
325 while (ch == lr->escape_char && nbytes < 4);
327 if (!isspace (ch))
328 lr_error (lr, _("garbage at end of character code specification"));
330 lr_ungetn (lr, 1);
332 lr->token.tok = tok_charcode;
333 lr->token.val.charcode.val = value;
334 lr->token.val.charcode.nbytes = nbytes;
336 return &lr->token;
/* Append the character CH to the growing buffer of the enclosing
   function.  The macro expects three locals to be in scope: `buf'
   (the storage), `bufact' (bytes used) and `bufmax' (bytes allocated).
   When the buffer is full its size is doubled before CH is stored.  */
#define ADDC(ch) \
  do									      \
    {									      \
      if (bufmax == bufact)						      \
	buf = xrealloc (buf, bufmax *= 2);				      \
      buf[bufact++] = (ch);						      \
    }									      \
  while (0)
353 static struct token *
354 get_symname (struct linereader *lr)
356 /* Symbol in brackets. We must distinguish three kinds:
357 1. reserved words
358 2. ISO 10646 position values
359 3. all other. */
360 char *buf;
361 size_t bufact = 0;
362 size_t bufmax = 56;
363 const struct keyword_t *kw;
364 int ch;
366 buf = (char *) xmalloc (bufmax);
370 ch = lr_getc (lr);
371 if (ch == lr->escape_char)
373 int c2 = lr_getc (lr);
374 ADDC (c2);
376 if (c2 == '\n')
377 ch = '\n';
379 else
380 ADDC (ch);
382 while (ch != '>' && ch != '\n');
384 if (ch == '\n')
385 lr_error (lr, _("unterminated symbolic name"));
387 /* Test for ISO 10646 position value. */
388 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
390 char *cp = buf + 1;
391 while (cp < &buf[bufact - 1] && isxdigit (*cp))
392 ++cp;
394 if (cp == &buf[bufact - 1])
396 /* Yes, it is. */
397 lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
398 lr->token.val.charcode.val = strtoul (buf, NULL, 16);
399 lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
401 return &lr->token;
405 /* It is a symbolic name. Test for reserved words. */
406 kw = lr->hash_fct (buf, bufact - 1);
408 if (kw != NULL && kw->symname_or_ident == 1)
410 lr->token.tok = kw->token;
411 free (buf);
413 else
415 lr->token.tok = tok_bsymbol;
417 buf[bufact] = '\0';
418 buf = xrealloc (buf, bufact + 1);
420 lr->token.val.str.start = buf;
421 lr->token.val.str.len = bufact - 1;
424 return &lr->token;
428 static struct token *
429 get_ident (struct linereader *lr)
431 char *buf;
432 size_t bufact;
433 size_t bufmax = 56;
434 const struct keyword_t *kw;
435 int ch;
437 buf = xmalloc (bufmax);
438 bufact = 0;
440 ADDC (lr->buf[lr->idx - 1]);
442 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
443 && ch != '<' && ch != ',')
444 /* XXX Handle escape sequences? */
445 ADDC (ch);
447 lr_ungetn (lr, 1);
449 kw = lr->hash_fct (buf, bufact);
451 if (kw != NULL && kw->symname_or_ident == 0)
453 lr->token.tok = kw->token;
454 free (buf);
456 else
458 lr->token.tok = tok_ident;
460 buf[bufact] = '\0';
461 buf = xrealloc (buf, bufact + 1);
463 lr->token.val.str.start = buf;
464 lr->token.val.str.len = bufact;
467 return &lr->token;
471 static struct token *
472 get_string (struct linereader *lr, const struct charset_t *charset)
474 int illegal_string = 0;
475 char *buf, *cp;
476 size_t bufact;
477 size_t bufmax = 56;
478 int ch;
480 buf = xmalloc (bufmax);
481 bufact = 0;
483 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
484 if (ch != '<' || charset == NULL)
486 if (ch == lr->escape_char)
488 ch = lr_getc (lr);
489 if (ch == '\n' || ch == EOF)
490 break;
492 ADDC (ch);
494 else
496 /* We have to get the value of the symbol. */
497 unsigned int value;
498 size_t startidx = bufact;
500 if (!lr->translate_strings)
501 ADDC ('<');
503 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
505 if (ch == lr->escape_char)
507 ch = lr_getc (lr);
508 if (ch == '\n' || ch == EOF)
509 break;
511 ADDC (ch);
514 if (ch == '\n' || ch == EOF)
515 lr_error (lr, _("unterminated string"));
516 else
517 if (!lr->translate_strings)
518 ADDC ('>');
520 if (lr->translate_strings)
522 value = charset_find_value (charset, &buf[startidx],
523 bufact - startidx);
524 if (value == ILLEGAL_CHAR_VALUE)
525 illegal_string = 1;
526 bufact = startidx;
528 if (bufmax - bufact < 8)
530 bufmax *= 2;
531 buf = (char *) xrealloc (buf, bufmax);
534 cp = &buf[bufact];
535 if (encode_char (value, &cp))
536 illegal_string = 1;
538 bufact = cp - buf;
542 /* Catch errors with trailing escape character. */
543 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
544 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
546 lr_error (lr, _("illegal escape sequence at end of string"));
547 --bufact;
549 else if (ch == '\n' || ch == EOF)
550 lr_error (lr, _("unterminated string"));
552 /* Terminate string if necessary. */
553 if (lr->translate_strings)
555 cp = &buf[bufact];
556 if (encode_char (0, &cp))
557 illegal_string = 1;
559 bufact = cp - buf;
561 else
562 ADDC ('\0');
564 lr->token.tok = tok_string;
566 if (illegal_string)
568 free (buf);
569 lr->token.val.str.start = NULL;
570 lr->token.val.str.len = 0;
572 else
574 buf = xrealloc (buf, bufact + 1);
576 lr->token.val.str.start = buf;
577 lr->token.val.str.len = bufact;
580 return &lr->token;