Update.
[glibc.git] / locale / programs / linereader.c
blob564173083e8ae0681dc1bfe29c5c5be231fc07fd
1 /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "localedef.h"
35 #include "stringtrans.h"
/* Forward declarations of the file-local token scanners that
   lr_token dispatches to.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 const struct repertoire_t *repertoire);
47 struct linereader *
48 lr_open (const char *fname, kw_hash_fct_t hf)
50 FILE *fp;
51 struct linereader *result;
52 int n;
54 if (fname == NULL || strcmp (fname, "-") == 0
55 || strcmp (fname, "/dev/stdin") == 0)
56 fp = stdin;
57 else
59 fp = fopen (fname, "r");
60 if (fp == NULL)
61 return NULL;
64 result = (struct linereader *) xmalloc (sizeof (*result));
66 result->fp = fp;
67 result->fname = xstrdup (fname ? : "<stdin>");
68 result->buf = NULL;
69 result->bufsize = 0;
70 result->lineno = 1;
71 result->idx = 0;
72 result->comment_char = '#';
73 result->escape_char = '\\';
74 result->translate_strings = 1;
76 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
77 if (n < 0)
79 int save = errno;
80 fclose (result->fp);
81 free ((char *) result->fname);
82 free (result);
83 errno = save;
84 return NULL;
87 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
88 n -= 2;
90 result->buf[n] = '\0';
91 result->bufact = n;
92 result->hash_fct = hf;
94 return result;
98 int
99 lr_eof (struct linereader *lr)
101 return lr->bufact = 0;
105 void
106 lr_close (struct linereader *lr)
108 fclose (lr->fp);
109 free (lr->buf);
110 free (lr);
115 lr_next (struct linereader *lr)
117 int n;
119 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
120 if (n < 0)
121 return -1;
123 ++lr->lineno;
125 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
127 #if 0
128 /* XXX Is this correct? */
129 /* An escaped newline character is substituted with a single <SP>. */
130 --n;
131 lr->buf[n - 1] = ' ';
132 #else
133 n -= 2;
134 #endif
137 lr->buf[n] = '\0';
138 lr->bufact = n;
139 lr->idx = 0;
141 return 0;
/* Defined in error.c: a counter that is incremented on every call
   of `error'.  */
extern unsigned int error_message_count;

/* Name of the executing program.  The calling program is expected to
   define this variable and set it.  */
extern char *program_name;
154 struct token *
155 lr_token (struct linereader *lr, const struct charmap_t *charmap,
156 const struct repertoire_t *repertoire)
158 int ch;
160 while (1)
164 ch = lr_getc (lr);
166 if (ch == EOF)
168 lr->token.tok = tok_eof;
169 return &lr->token;
172 if (ch == '\n')
174 lr->token.tok = tok_eol;
175 return &lr->token;
178 while (isspace (ch));
180 if (ch == EOF)
182 lr->token.tok = tok_eof;
183 return &lr->token;
186 if (ch != lr->comment_char)
187 break;
189 /* Ignore rest of line. */
190 lr_ignore_rest (lr, 0);
191 lr->token.tok = tok_eol;
192 return &lr->token;
195 /* Match escape sequences. */
196 if (ch == lr->escape_char)
197 return get_toplvl_escape (lr);
199 /* Match ellipsis. */
200 if (ch == '.')
202 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
204 lr_getc (lr);
205 lr_getc (lr);
206 lr_getc (lr);
207 lr->token.tok = tok_ellipsis4;
208 return &lr->token;
210 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
212 lr_getc (lr);
213 lr_getc (lr);
214 lr->token.tok = tok_ellipsis3;
215 return &lr->token;
217 if (lr->buf[lr->idx] == '.')
219 lr_getc (lr);
220 lr->token.tok = tok_ellipsis2;
221 return &lr->token;
225 switch (ch)
227 case '<':
228 return get_symname (lr);
230 case '0' ... '9':
231 lr->token.tok = tok_number;
232 lr->token.val.num = ch - '0';
234 while (isdigit (ch = lr_getc (lr)))
236 lr->token.val.num *= 10;
237 lr->token.val.num += ch - '0';
239 if (isalpha (ch))
240 lr_error (lr, _("garbage at end of number"));
241 lr_ungetn (lr, 1);
243 return &lr->token;
245 case ';':
246 lr->token.tok = tok_semicolon;
247 return &lr->token;
249 case ',':
250 lr->token.tok = tok_comma;
251 return &lr->token;
253 case '(':
254 lr->token.tok = tok_open_brace;
255 return &lr->token;
257 case ')':
258 lr->token.tok = tok_close_brace;
259 return &lr->token;
261 case '"':
262 return get_string (lr, charmap, repertoire);
264 case '-':
265 ch = lr_getc (lr);
266 if (ch == '1')
268 lr->token.tok = tok_minus1;
269 return &lr->token;
271 lr_ungetn (lr, 2);
272 break;
275 return get_ident (lr);
279 static struct token *
280 get_toplvl_escape (struct linereader *lr)
282 /* This is supposed to be a numeric value. We return the
283 numerical value and the number of bytes. */
284 size_t start_idx = lr->idx - 1;
285 char *bytes = lr->token.val.charcode.bytes;
286 int nbytes = 0;
287 int ch;
291 unsigned int byte = 0;
292 unsigned int base = 8;
294 ch = lr_getc (lr);
296 if (ch == 'd')
298 base = 10;
299 ch = lr_getc (lr);
301 else if (ch == 'x')
303 base = 16;
304 ch = lr_getc (lr);
307 if ((base == 16 && !isxdigit (ch))
308 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
310 esc_error:
311 lr->token.val.str.startmb = &lr->buf[start_idx];
313 while (ch != EOF && !isspace (ch))
314 ch = lr_getc (lr);
315 lr->token.val.str.lenmb = lr->idx - start_idx;
317 lr->token.tok = tok_error;
318 return &lr->token;
321 if (isdigit (ch))
322 byte = ch - '0';
323 else
324 byte = tolower (ch) - 'a' + 10;
326 ch = lr_getc (lr);
327 if ((base == 16 && !isxdigit (ch))
328 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
329 goto esc_error;
331 byte *= base;
332 if (isdigit (ch))
333 byte += ch - '0';
334 else
335 byte += tolower (ch) - 'a' + 10;
337 ch = lr_getc (lr);
338 if (base != 16 && isdigit (ch))
340 byte *= base;
341 byte += ch - '0';
343 ch = lr_getc (lr);
346 bytes[nbytes++] = byte;
348 while (ch == lr->escape_char && nbytes < 4);
350 if (!isspace (ch))
351 lr_error (lr, _("garbage at end of character code specification"));
353 lr_ungetn (lr, 1);
355 lr->token.tok = tok_charcode;
356 lr->token.val.charcode.nbytes = nbytes;
358 return &lr->token;
/* Append the single character CH to the growing buffer described by
   the caller's local variables `buf' (storage), `bufact' (bytes in
   use) and `bufmax' (capacity).  A full buffer is doubled first.  */
#define ADDC(ch) \
  do                                                                  \
    {                                                                 \
      if (bufmax == bufact)                                           \
        buf = xrealloc (buf, bufmax *= 2);                            \
      buf[bufact++] = (ch);                                           \
    }                                                                 \
  while (0)
/* Append the L bytes starting at S to the growing buffer described by
   the caller's local variables `buf' (storage), `bufact' (bytes in
   use) and `bufmax' (capacity, must be nonzero).  The capacity is
   doubled as often as needed for the new bytes to fit; only `bufmax'
   and `buf' change while growing, the write position `bufact' is
   advanced solely by the number of bytes copied.  */
#define ADDS(s, l) \
  do                                                                  \
    {                                                                 \
      size_t _l = (l);                                                \
      if (bufact + _l > bufmax)                                       \
        {                                                             \
          do                                                          \
            bufmax *= 2;                                              \
          while (bufact + _l > bufmax);                               \
          buf = xrealloc (buf, bufmax);                               \
        }                                                             \
      memcpy (&buf[bufact], (s), _l);                                 \
      bufact += _l;                                                   \
    }                                                                 \
  while (0)
/* Append the wide character CH to the growing buffer described by
   the caller's local variables `buf2' (storage), `buf2act' (elements
   in use) and `buf2max' (capacity in elements).  A full buffer is
   doubled first; four bytes are reserved per element.  */
#define ADDWC(ch) \
  do                                                                  \
    {                                                                 \
      if (buf2max == buf2act)                                         \
        buf2 = xrealloc (buf2, (buf2max *= 2) * 4);                   \
      buf2[buf2act++] = (ch);                                         \
    }                                                                 \
  while (0)
405 static struct token *
406 get_symname (struct linereader *lr)
408 /* Symbol in brackets. We must distinguish three kinds:
409 1. reserved words
410 2. ISO 10646 position values
411 3. all other. */
412 char *buf;
413 size_t bufact = 0;
414 size_t bufmax = 56;
415 const struct keyword_t *kw;
416 int ch;
418 buf = (char *) xmalloc (bufmax);
422 ch = lr_getc (lr);
423 if (ch == lr->escape_char)
425 int c2 = lr_getc (lr);
426 ADDC (c2);
428 if (c2 == '\n')
429 ch = '\n';
431 else
432 ADDC (ch);
434 while (ch != '>' && ch != '\n');
436 if (ch == '\n')
437 lr_error (lr, _("unterminated symbolic name"));
439 /* Test for ISO 10646 position value. */
440 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
442 char *cp = buf + 1;
443 while (cp < &buf[bufact - 1] && isxdigit (*cp))
444 ++cp;
446 if (cp == &buf[bufact - 1])
448 /* Yes, it is. */
449 lr->token.tok = tok_ucs4;
450 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
452 return &lr->token;
456 /* It is a symbolic name. Test for reserved words. */
457 kw = lr->hash_fct (buf, bufact - 1);
459 if (kw != NULL && kw->symname_or_ident == 1)
461 lr->token.tok = kw->token;
462 free (buf);
464 else
466 lr->token.tok = tok_bsymbol;
468 buf[bufact] = '\0';
469 buf = xrealloc (buf, bufact + 1);
471 lr->token.val.str.startmb = buf;
472 lr->token.val.str.lenmb = bufact - 1;
475 return &lr->token;
479 static struct token *
480 get_ident (struct linereader *lr)
482 char *buf;
483 size_t bufact;
484 size_t bufmax = 56;
485 const struct keyword_t *kw;
486 int ch;
488 buf = xmalloc (bufmax);
489 bufact = 0;
491 ADDC (lr->buf[lr->idx - 1]);
493 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
494 && ch != '<' && ch != ',')
496 if (ch == lr->escape_char)
498 ch = lr_getc (lr);
499 if (ch == '\n' || ch == EOF)
501 lr_error (lr, _("invalid escape sequence"));
502 break;
505 ADDC (ch);
508 lr_ungetn (lr, 1);
510 kw = lr->hash_fct (buf, bufact);
512 if (kw != NULL && kw->symname_or_ident == 0)
514 lr->token.tok = kw->token;
515 free (buf);
517 else
519 lr->token.tok = tok_ident;
521 buf[bufact] = '\0';
522 buf = xrealloc (buf, bufact + 1);
524 lr->token.val.str.startmb = buf;
525 lr->token.val.str.lenmb = bufact;
528 return &lr->token;
532 static struct token *
533 get_string (struct linereader *lr, const struct charmap_t *charmap,
534 const struct repertoire_t *repertoire)
536 int return_widestr = lr->return_widestr;
537 char *buf;
538 wchar_t *buf2 = NULL;
539 size_t bufact;
540 size_t bufmax = 56;
542 /* We must return two different strings. */
543 buf = xmalloc (bufmax);
544 bufact = 0;
546 /* We know it'll be a string. */
547 lr->token.tok = tok_string;
549 /* If we need not translate the strings (i.e., expand <...> parts)
550 we can run a simple loop. */
551 if (!lr->translate_strings)
553 int ch;
555 buf2 = NULL;
556 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
557 ADDC (ch);
559 /* Catch errors with trailing escape character. */
560 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
561 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
563 lr_error (lr, _("illegal escape sequence at end of string"));
564 --bufact;
566 else if (ch == '\n' || ch == EOF)
567 lr_error (lr, _("unterminated string"));
569 ADDC ('\0');
571 else
573 int illegal_string = 0;
574 size_t buf2act = 0;
575 size_t buf2max = 56 * sizeof (uint32_t);
576 int ch;
577 int warned = 0;
579 /* We have to provide the wide character result as well. */
580 if (return_widestr)
581 buf2 = xmalloc (buf2max);
583 /* Read until the end of the string (or end of the line or file). */
584 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
586 size_t startidx;
587 uint32_t wch;
588 struct charseq *seq;
590 if (ch != '<')
592 /* The standards leave it up to the implementation to decide
593 what to do with character which stand for themself. We
594 could jump through hoops to find out the value relative to
595 the charmap and the repertoire map, but instead we leave
596 it up to the locale definition author to write a better
597 definition. We assume here that every character which
598 stands for itself is encoded using ISO 8859-1. Using the
599 escape character is allowed. */
600 if (ch == lr->escape_char)
602 ch = lr_getc (lr);
603 if (ch == '\n' || ch == EOF)
604 break;
607 if (verbose && !warned)
609 lr_error (lr, _("\
610 non-symbolic character value should not be used"));
611 warned = 1;
614 ADDC (ch);
615 if (return_widestr)
616 ADDWC ((uint32_t) ch);
618 continue;
621 /* Now we have to search for the end of the symbolic name, i.e.,
622 the closing '>'. */
623 startidx = bufact;
624 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
626 if (ch == lr->escape_char)
628 ch = lr_getc (lr);
629 if (ch == '\n' || ch == EOF)
630 break;
632 ADDC (ch);
634 if (ch == '\n' || ch == EOF)
635 /* Not a correct string. */
636 break;
637 if (bufact == startidx)
639 /* <> is no correct name. Ignore it and also signal an
640 error. */
641 illegal_string = 1;
642 continue;
645 /* It might be a Uxxxx symbol. */
646 if (buf[startidx] == 'U'
647 && (bufact - startidx == 5 || bufact - startidx == 9))
649 char *cp = buf + startidx + 1;
650 while (cp < &buf[bufact] && isxdigit (*cp))
651 ++cp;
653 if (cp == &buf[bufact])
655 const char *symbol = NULL;
657 /* Yes, it is. */
658 ADDC ('\0');
659 wch = strtoul (buf + startidx + 1, NULL, 16);
661 /* Now forget about the name we just added. */
662 bufact = startidx;
664 if (return_widestr)
665 ADDWC (wch);
667 /* Now determine from the repertoire the name of the
668 character and find it in the charmap. */
669 if (repertoire != NULL)
670 symbol = repertoire_find_symbol (repertoire, wch);
672 if (symbol == NULL)
674 /* We cannot generate a string since we cannot map
675 from the Unicode number to the character symbol. */
676 lr_error (lr,
677 _("character <U%0*X> not in repertoire map"),
678 wch > 0xffff ? 8 : 4, wch);
680 illegal_string = 1;
682 else
684 seq = charmap_find_value (charmap, symbol,
685 strlen (symbol));
687 if (seq == NULL)
689 /* Not a known name. */
690 lr_error (lr,
691 _("symbol `%s' not in charmap"), symbol);
692 illegal_string = 1;
694 else
695 ADDS (seq->bytes, seq->nbytes);
698 continue;
702 if (return_widestr)
704 /* We now have the symbolic name in buf[startidx] to
705 buf[bufact-1]. Now find out the value for this
706 character in the repertoire map as well as in the
707 charmap (in this order). */
708 wch = repertoire_find_value (repertoire, &buf[startidx],
709 bufact - startidx);
710 if (wch == ILLEGAL_CHAR_VALUE)
712 /* This name is not in the repertoire map. */
713 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
714 bufact - startidx, &buf[startidx]);
715 illegal_string = 1;
717 else
718 ADDWC (wch);
721 /* Now the same for the multibyte representation. */
722 seq = charmap_find_value (charmap, &buf[startidx],
723 bufact - startidx);
725 if (seq == NULL)
727 /* This name is not in the charmap. */
728 lr_error (lr, _("symbol `%.*s' not in charmap"),
729 bufact - startidx, &buf[startidx]);
730 illegal_string = 1;
732 /* Now forget about the name we just added. */
733 bufact = startidx;
735 else
737 /* Now forget about the name we just added. */
738 bufact = startidx;
740 ADDS (seq->bytes, seq->nbytes);
744 if (ch == '\n' || ch == EOF)
746 lr_error (lr, _("unterminated string"));
747 illegal_string = 1;
750 if (illegal_string)
752 free (buf);
753 if (buf2 != NULL)
754 free (buf2);
755 lr->token.val.str.startmb = NULL;
756 lr->token.val.str.lenmb = 0;
758 return &lr->token;
761 ADDC ('\0');
763 if (return_widestr)
765 ADDWC (0);
766 lr->token.val.str.startwc = xrealloc (buf2,
767 buf2act * sizeof (uint32_t));
768 lr->token.val.str.lenwc = buf2act;
772 lr->token.val.str.startmb = xrealloc (buf, bufact);
773 lr->token.val.str.lenmb = bufact;
775 return &lr->token;