Update.
[glibc.git] / locale / programs / linereader.c
blobac843bcd2f3b07261b7ad1c617993c5420862779
1 /* Copyright (C) 1996,1997,1998,1999,2000,2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
35 /* Prototypes for a few program-wide used functions. */
36 extern void *xmalloc (size_t __n);
37 extern void *xrealloc (void *__p, size_t __n);
38 extern char *xstrdup (const char *__str);
41 /* Prototypes for local functions. */
42 static struct token *get_toplvl_escape (struct linereader *lr);
43 static struct token *get_symname (struct linereader *lr);
44 static struct token *get_ident (struct linereader *lr);
45 static struct token *get_string (struct linereader *lr,
46 const struct charmap_t *charmap,
47 const struct repertoire_t *repertoire,
48 int verbose);
51 struct linereader *
52 lr_open (const char *fname, kw_hash_fct_t hf)
54 FILE *fp;
56 if (fname == NULL || strcmp (fname, "-") == 0
57 || strcmp (fname, "/dev/stdin") == 0)
58 return lr_create (stdin, "<stdin>", hf);
59 else
61 fp = fopen (fname, "r");
62 if (fp == NULL)
63 return NULL;
64 return lr_create (fp, fname, hf);
68 struct linereader *
69 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
71 struct linereader *result;
72 int n;
74 result = (struct linereader *) xmalloc (sizeof (*result));
76 result->fp = fp;
77 result->fname = xstrdup (fname);
78 result->buf = NULL;
79 result->bufsize = 0;
80 result->lineno = 1;
81 result->idx = 0;
82 result->comment_char = '#';
83 result->escape_char = '\\';
84 result->translate_strings = 1;
86 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
87 if (n < 0)
89 int save = errno;
90 fclose (result->fp);
91 free ((char *) result->fname);
92 free (result);
93 errno = save;
94 return NULL;
97 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
98 n -= 2;
100 result->buf[n] = '\0';
101 result->bufact = n;
102 result->hash_fct = hf;
104 return result;
109 lr_eof (struct linereader *lr)
111 return lr->bufact = 0;
115 void
116 lr_close (struct linereader *lr)
118 fclose (lr->fp);
119 free (lr->buf);
120 free (lr);
125 lr_next (struct linereader *lr)
127 int n;
129 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
130 if (n < 0)
131 return -1;
133 ++lr->lineno;
135 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
137 #if 0
138 /* XXX Is this correct? */
139 /* An escaped newline character is substituted with a single <SP>. */
140 --n;
141 lr->buf[n - 1] = ' ';
142 #else
143 n -= 2;
144 #endif
147 lr->buf[n] = '\0';
148 lr->bufact = n;
149 lr->idx = 0;
151 return 0;
155 /* Defined in error.c. */
156 /* This variable is incremented each time `error' is called. */
157 extern unsigned int error_message_count;
159 /* The calling program should define program_name and set it to the
160 name of the executing program. */
161 extern char *program_name;
164 struct token *
165 lr_token (struct linereader *lr, const struct charmap_t *charmap,
166 const struct repertoire_t *repertoire, int verbose)
168 int ch;
170 while (1)
174 ch = lr_getc (lr);
176 if (ch == EOF)
178 lr->token.tok = tok_eof;
179 return &lr->token;
182 if (ch == '\n')
184 lr->token.tok = tok_eol;
185 return &lr->token;
188 while (isspace (ch));
190 if (ch == EOF)
192 lr->token.tok = tok_eof;
193 return &lr->token;
196 if (ch != lr->comment_char)
197 break;
199 /* Is there an newline at the end of the buffer? */
200 if (lr->buf[lr->bufact - 1] != '\n')
202 /* No. Some people want this to mean that only the line in
203 the file not the logical, concatenated line is ignored.
204 Let's try this. */
205 lr->idx = lr->bufact;
206 continue;
209 /* Ignore rest of line. */
210 lr_ignore_rest (lr, 0);
211 lr->token.tok = tok_eol;
212 return &lr->token;
215 /* Match escape sequences. */
216 if (ch == lr->escape_char)
217 return get_toplvl_escape (lr);
219 /* Match ellipsis. */
220 if (ch == '.')
222 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
224 int cnt;
225 for (cnt = 0; cnt < 10; ++cnt)
226 lr_getc (lr);
227 lr->token.tok = tok_ellipsis4_2;
228 return &lr->token;
230 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
232 lr_getc (lr);
233 lr_getc (lr);
234 lr_getc (lr);
235 lr->token.tok = tok_ellipsis4;
236 return &lr->token;
238 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
240 lr_getc (lr);
241 lr_getc (lr);
242 lr->token.tok = tok_ellipsis3;
243 return &lr->token;
245 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
247 int cnt;
248 for (cnt = 0; cnt < 6; ++cnt)
249 lr_getc (lr);
250 lr->token.tok = tok_ellipsis2_2;
251 return &lr->token;
253 if (lr->buf[lr->idx] == '.')
255 lr_getc (lr);
256 lr->token.tok = tok_ellipsis2;
257 return &lr->token;
261 switch (ch)
263 case '<':
264 return get_symname (lr);
266 case '0' ... '9':
267 lr->token.tok = tok_number;
268 lr->token.val.num = ch - '0';
270 while (isdigit (ch = lr_getc (lr)))
272 lr->token.val.num *= 10;
273 lr->token.val.num += ch - '0';
275 if (isalpha (ch))
276 lr_error (lr, _("garbage at end of number"));
277 lr_ungetn (lr, 1);
279 return &lr->token;
281 case ';':
282 lr->token.tok = tok_semicolon;
283 return &lr->token;
285 case ',':
286 lr->token.tok = tok_comma;
287 return &lr->token;
289 case '(':
290 lr->token.tok = tok_open_brace;
291 return &lr->token;
293 case ')':
294 lr->token.tok = tok_close_brace;
295 return &lr->token;
297 case '"':
298 return get_string (lr, charmap, repertoire, verbose);
300 case '-':
301 ch = lr_getc (lr);
302 if (ch == '1')
304 lr->token.tok = tok_minus1;
305 return &lr->token;
307 lr_ungetn (lr, 2);
308 break;
311 return get_ident (lr);
315 static struct token *
316 get_toplvl_escape (struct linereader *lr)
318 /* This is supposed to be a numeric value. We return the
319 numerical value and the number of bytes. */
320 size_t start_idx = lr->idx - 1;
321 char *bytes = lr->token.val.charcode.bytes;
322 int nbytes = 0;
323 int ch;
327 unsigned int byte = 0;
328 unsigned int base = 8;
330 ch = lr_getc (lr);
332 if (ch == 'd')
334 base = 10;
335 ch = lr_getc (lr);
337 else if (ch == 'x')
339 base = 16;
340 ch = lr_getc (lr);
343 if ((base == 16 && !isxdigit (ch))
344 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
346 esc_error:
347 lr->token.val.str.startmb = &lr->buf[start_idx];
349 while (ch != EOF && !isspace (ch))
350 ch = lr_getc (lr);
351 lr->token.val.str.lenmb = lr->idx - start_idx;
353 lr->token.tok = tok_error;
354 return &lr->token;
357 if (isdigit (ch))
358 byte = ch - '0';
359 else
360 byte = tolower (ch) - 'a' + 10;
362 ch = lr_getc (lr);
363 if ((base == 16 && !isxdigit (ch))
364 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
365 goto esc_error;
367 byte *= base;
368 if (isdigit (ch))
369 byte += ch - '0';
370 else
371 byte += tolower (ch) - 'a' + 10;
373 ch = lr_getc (lr);
374 if (base != 16 && isdigit (ch))
376 byte *= base;
377 byte += ch - '0';
379 ch = lr_getc (lr);
382 bytes[nbytes++] = byte;
384 while (ch == lr->escape_char
385 && nbytes < sizeof (lr->token.val.charcode.bytes));
387 if (!isspace (ch))
388 lr_error (lr, _("garbage at end of character code specification"));
390 lr_ungetn (lr, 1);
392 lr->token.tok = tok_charcode;
393 lr->token.val.charcode.nbytes = nbytes;
395 return &lr->token;
/* Append the single byte CH to the growing byte buffer.  The macro works
   on the caller's local variables `buf' (the buffer), `bufact' (bytes in
   use) and `bufmax' (bytes allocated); the allocation is doubled with
   xrealloc when the buffer is full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        {                                                                     \
          bufmax += bufmax;                                                   \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)
/* Append the L bytes starting at S to the growing byte buffer.  The macro
   works on the caller's local variables `buf' (the buffer), `bufact'
   (bytes in use) and `bufmax' (bytes allocated, must be nonzero).

   If the new bytes do not fit, the allocation is doubled until they do.
   Only `bufmax' is changed while growing; `bufact' is the write position
   and must only advance by the number of bytes actually copied.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          do                                                                  \
            bufmax *= 2;                                                      \
          while (bufact + _l > bufmax);                                       \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)
/* Append the wide character CH to the growing wide character buffer.  The
   macro works on the caller's local variables `buf2' (the buffer),
   `buf2act' (elements in use) and `buf2max' (elements allocated); each
   element is accounted for with four bytes.  The allocation is doubled
   with xrealloc when the buffer is full.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max += buf2max;                                                 \
          buf2 = xrealloc (buf2, buf2max * 4);                                \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
442 static struct token *
443 get_symname (struct linereader *lr)
445 /* Symbol in brackets. We must distinguish three kinds:
446 1. reserved words
447 2. ISO 10646 position values
448 3. all other. */
449 char *buf;
450 size_t bufact = 0;
451 size_t bufmax = 56;
452 const struct keyword_t *kw;
453 int ch;
455 buf = (char *) xmalloc (bufmax);
459 ch = lr_getc (lr);
460 if (ch == lr->escape_char)
462 int c2 = lr_getc (lr);
463 ADDC (c2);
465 if (c2 == '\n')
466 ch = '\n';
468 else
469 ADDC (ch);
471 while (ch != '>' && ch != '\n');
473 if (ch == '\n')
474 lr_error (lr, _("unterminated symbolic name"));
476 /* Test for ISO 10646 position value. */
477 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
479 char *cp = buf + 1;
480 while (cp < &buf[bufact - 1] && isxdigit (*cp))
481 ++cp;
483 if (cp == &buf[bufact - 1])
485 /* Yes, it is. */
486 lr->token.tok = tok_ucs4;
487 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
489 return &lr->token;
493 /* It is a symbolic name. Test for reserved words. */
494 kw = lr->hash_fct (buf, bufact - 1);
496 if (kw != NULL && kw->symname_or_ident == 1)
498 lr->token.tok = kw->token;
499 free (buf);
501 else
503 lr->token.tok = tok_bsymbol;
505 buf[bufact] = '\0';
506 buf = xrealloc (buf, bufact + 1);
508 lr->token.val.str.startmb = buf;
509 lr->token.val.str.lenmb = bufact - 1;
512 return &lr->token;
516 static struct token *
517 get_ident (struct linereader *lr)
519 char *buf;
520 size_t bufact;
521 size_t bufmax = 56;
522 const struct keyword_t *kw;
523 int ch;
525 buf = xmalloc (bufmax);
526 bufact = 0;
528 ADDC (lr->buf[lr->idx - 1]);
530 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
531 && ch != '<' && ch != ',' && ch != EOF)
533 if (ch == lr->escape_char)
535 ch = lr_getc (lr);
536 if (ch == '\n' || ch == EOF)
538 lr_error (lr, _("invalid escape sequence"));
539 break;
542 ADDC (ch);
545 lr_ungetc (lr, ch);
547 kw = lr->hash_fct (buf, bufact);
549 if (kw != NULL && kw->symname_or_ident == 0)
551 lr->token.tok = kw->token;
552 free (buf);
554 else
556 lr->token.tok = tok_ident;
558 buf[bufact] = '\0';
559 buf = xrealloc (buf, bufact + 1);
561 lr->token.val.str.startmb = buf;
562 lr->token.val.str.lenmb = bufact;
565 return &lr->token;
569 static struct token *
570 get_string (struct linereader *lr, const struct charmap_t *charmap,
571 const struct repertoire_t *repertoire, int verbose)
573 int return_widestr = lr->return_widestr;
574 char *buf;
575 wchar_t *buf2 = NULL;
576 size_t bufact;
577 size_t bufmax = 56;
579 /* We must return two different strings. */
580 buf = xmalloc (bufmax);
581 bufact = 0;
583 /* We know it'll be a string. */
584 lr->token.tok = tok_string;
586 /* If we need not translate the strings (i.e., expand <...> parts)
587 we can run a simple loop. */
588 if (!lr->translate_strings)
590 int ch;
592 buf2 = NULL;
593 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
594 ADDC (ch);
596 /* Catch errors with trailing escape character. */
597 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
598 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
600 lr_error (lr, _("illegal escape sequence at end of string"));
601 --bufact;
603 else if (ch == '\n' || ch == EOF)
604 lr_error (lr, _("unterminated string"));
606 ADDC ('\0');
608 else
610 int illegal_string = 0;
611 size_t buf2act = 0;
612 size_t buf2max = 56 * sizeof (uint32_t);
613 int ch;
614 int warned = 0;
616 /* We have to provide the wide character result as well. */
617 if (return_widestr)
618 buf2 = xmalloc (buf2max);
620 /* Read until the end of the string (or end of the line or file). */
621 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
623 size_t startidx;
624 uint32_t wch;
625 struct charseq *seq;
627 if (ch != '<')
629 /* The standards leave it up to the implementation to decide
630 what to do with character which stand for themself. We
631 could jump through hoops to find out the value relative to
632 the charmap and the repertoire map, but instead we leave
633 it up to the locale definition author to write a better
634 definition. We assume here that every character which
635 stands for itself is encoded using ISO 8859-1. Using the
636 escape character is allowed. */
637 if (ch == lr->escape_char)
639 ch = lr_getc (lr);
640 if (ch == '\n' || ch == EOF)
641 break;
644 if (verbose && !warned)
646 lr_error (lr, _("\
647 non-symbolic character value should not be used"));
648 warned = 1;
651 ADDC (ch);
652 if (return_widestr)
653 ADDWC ((uint32_t) ch);
655 continue;
658 /* Now we have to search for the end of the symbolic name, i.e.,
659 the closing '>'. */
660 startidx = bufact;
661 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
663 if (ch == lr->escape_char)
665 ch = lr_getc (lr);
666 if (ch == '\n' || ch == EOF)
667 break;
669 ADDC (ch);
671 if (ch == '\n' || ch == EOF)
672 /* Not a correct string. */
673 break;
674 if (bufact == startidx)
676 /* <> is no correct name. Ignore it and also signal an
677 error. */
678 illegal_string = 1;
679 continue;
682 /* It might be a Uxxxx symbol. */
683 if (buf[startidx] == 'U'
684 && (bufact - startidx == 5 || bufact - startidx == 9))
686 char *cp = buf + startidx + 1;
687 while (cp < &buf[bufact] && isxdigit (*cp))
688 ++cp;
690 if (cp == &buf[bufact])
692 char utmp[10];
693 const char *symbol = NULL;
695 /* Yes, it is. */
696 ADDC ('\0');
697 wch = strtoul (buf + startidx + 1, NULL, 16);
699 /* Now forget about the name we just added. */
700 bufact = startidx;
702 if (return_widestr)
703 ADDWC (wch);
705 /* See whether the charmap contains the Uxxxxxxxx names. */
706 snprintf (utmp, sizeof (utmp), "U%08X", wch);
707 seq = charmap_find_value (charmap, utmp, 9);
709 if (seq == NULL)
711 /* No, this isn't the case. Now determine from
712 the repertoire the name of the character and
713 find it in the charmap. */
714 if (repertoire != NULL)
715 symbol = repertoire_find_symbol (repertoire, wch);
717 if (symbol == NULL)
718 /* We cannot generate a string since we
719 cannot map from the Unicode number to the
720 character symbol. */
721 illegal_string = 1;
722 else
724 seq = charmap_find_value (charmap, symbol,
725 strlen (symbol));
727 if (seq == NULL)
728 /* Not a known name. */
729 illegal_string = 1;
733 if (seq != NULL)
734 ADDS (seq->bytes, seq->nbytes);
736 continue;
740 /* We now have the symbolic name in buf[startidx] to
741 buf[bufact-1]. Now find out the value for this character
742 in the charmap as well as in the repertoire map (in this
743 order). */
744 seq = charmap_find_value (charmap, &buf[startidx],
745 bufact - startidx);
747 if (seq == NULL)
749 /* This name is not in the charmap. */
750 lr_error (lr, _("symbol `%.*s' not in charmap"),
751 (int) (bufact - startidx), &buf[startidx]);
752 illegal_string = 1;
755 if (return_widestr)
757 /* Now the same for the multibyte representation. */
758 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
759 wch = seq->ucs4;
760 else
762 wch = repertoire_find_value (repertoire, &buf[startidx],
763 bufact - startidx);
764 if (seq != NULL)
765 seq->ucs4 = wch;
768 if (wch == ILLEGAL_CHAR_VALUE)
770 /* This name is not in the repertoire map. */
771 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
772 (int) (bufact - startidx), &buf[startidx]);
773 illegal_string = 1;
775 else
776 ADDWC (wch);
779 /* Now forget about the name we just added. */
780 bufact = startidx;
782 /* And copy the bytes. */
783 if (seq != NULL)
784 ADDS (seq->bytes, seq->nbytes);
787 if (ch == '\n' || ch == EOF)
789 lr_error (lr, _("unterminated string"));
790 illegal_string = 1;
793 if (illegal_string)
795 free (buf);
796 if (buf2 != NULL)
797 free (buf2);
798 lr->token.val.str.startmb = NULL;
799 lr->token.val.str.lenmb = 0;
800 lr->token.val.str.startwc = NULL;
801 lr->token.val.str.lenwc = 0;
803 return &lr->token;
806 ADDC ('\0');
808 if (return_widestr)
810 ADDWC (0);
811 lr->token.val.str.startwc = xrealloc (buf2,
812 buf2act * sizeof (uint32_t));
813 lr->token.val.str.lenwc = buf2act;
817 lr->token.val.str.startmb = xrealloc (buf, bufact);
818 lr->token.val.str.lenmb = bufact;
820 return &lr->token;