Update.
[glibc.git] / locale / programs / linereader.c
blobf569d1b9b862ff28b985bf5427904df6f13b0108
/* Copyright (C) 1996-2001, 2002 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <assert.h>
25 #include <ctype.h>
26 #include <errno.h>
27 #include <libintl.h>
28 #include <stdarg.h>
29 #include <stdlib.h>
30 #include <string.h>
32 #include "localedef.h"
33 #include "charmap.h"
34 #include "error.h"
35 #include "linereader.h"
36 #include "locfile.h"
/* Forward declarations of the file-local token readers.  Each one is
   entered from lr_token after the first character of the token has
   already been consumed.  */
static struct token *get_toplvl_escape (struct linereader *lr);

static struct token *get_symname (struct linereader *lr);

static struct token *get_ident (struct linereader *lr);

static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
49 struct linereader *
50 lr_open (const char *fname, kw_hash_fct_t hf)
52 FILE *fp;
54 if (fname == NULL || strcmp (fname, "-") == 0
55 || strcmp (fname, "/dev/stdin") == 0)
56 return lr_create (stdin, "<stdin>", hf);
57 else
59 fp = fopen (fname, "r");
60 if (fp == NULL)
61 return NULL;
62 return lr_create (fp, fname, hf);
66 struct linereader *
67 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
69 struct linereader *result;
70 int n;
72 result = (struct linereader *) xmalloc (sizeof (*result));
74 result->fp = fp;
75 result->fname = xstrdup (fname);
76 result->buf = NULL;
77 result->bufsize = 0;
78 result->lineno = 1;
79 result->idx = 0;
80 result->comment_char = '#';
81 result->escape_char = '\\';
82 result->translate_strings = 1;
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
87 int save = errno;
88 fclose (result->fp);
89 free ((char *) result->fname);
90 free (result);
91 errno = save;
92 return NULL;
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
102 return result;
107 lr_eof (struct linereader *lr)
109 return lr->bufact = 0;
113 void
114 lr_close (struct linereader *lr)
116 fclose (lr->fp);
117 free (lr->buf);
118 free (lr);
123 lr_next (struct linereader *lr)
125 int n;
127 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
128 if (n < 0)
129 return -1;
131 ++lr->lineno;
133 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
135 #if 0
136 /* XXX Is this correct? */
137 /* An escaped newline character is substituted with a single <SP>. */
138 --n;
139 lr->buf[n - 1] = ' ';
140 #else
141 n -= 2;
142 #endif
145 lr->buf[n] = '\0';
146 lr->bufact = n;
147 lr->idx = 0;
149 return 0;
/* Counter defined in error.c; it is incremented on every call of
   `error'.  */
extern unsigned int error_message_count;

/* Name of the running program.  The calling program is expected to
   define this variable and to set it.  */
extern char *program_name;
162 struct token *
163 lr_token (struct linereader *lr, const struct charmap_t *charmap,
164 struct localedef_t *locale, const struct repertoire_t *repertoire,
165 int verbose)
167 int ch;
169 while (1)
173 ch = lr_getc (lr);
175 if (ch == EOF)
177 lr->token.tok = tok_eof;
178 return &lr->token;
181 if (ch == '\n')
183 lr->token.tok = tok_eol;
184 return &lr->token;
187 while (isspace (ch));
189 if (ch == EOF)
191 lr->token.tok = tok_eof;
192 return &lr->token;
195 if (ch != lr->comment_char)
196 break;
198 /* Is there an newline at the end of the buffer? */
199 if (lr->buf[lr->bufact - 1] != '\n')
201 /* No. Some people want this to mean that only the line in
202 the file not the logical, concatenated line is ignored.
203 Let's try this. */
204 lr->idx = lr->bufact;
205 continue;
208 /* Ignore rest of line. */
209 lr_ignore_rest (lr, 0);
210 lr->token.tok = tok_eol;
211 return &lr->token;
214 /* Match escape sequences. */
215 if (ch == lr->escape_char)
216 return get_toplvl_escape (lr);
218 /* Match ellipsis. */
219 if (ch == '.')
221 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
223 int cnt;
224 for (cnt = 0; cnt < 10; ++cnt)
225 lr_getc (lr);
226 lr->token.tok = tok_ellipsis4_2;
227 return &lr->token;
229 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
231 lr_getc (lr);
232 lr_getc (lr);
233 lr_getc (lr);
234 lr->token.tok = tok_ellipsis4;
235 return &lr->token;
237 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
239 lr_getc (lr);
240 lr_getc (lr);
241 lr->token.tok = tok_ellipsis3;
242 return &lr->token;
244 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
246 int cnt;
247 for (cnt = 0; cnt < 6; ++cnt)
248 lr_getc (lr);
249 lr->token.tok = tok_ellipsis2_2;
250 return &lr->token;
252 if (lr->buf[lr->idx] == '.')
254 lr_getc (lr);
255 lr->token.tok = tok_ellipsis2;
256 return &lr->token;
260 switch (ch)
262 case '<':
263 return get_symname (lr);
265 case '0' ... '9':
266 lr->token.tok = tok_number;
267 lr->token.val.num = ch - '0';
269 while (isdigit (ch = lr_getc (lr)))
271 lr->token.val.num *= 10;
272 lr->token.val.num += ch - '0';
274 if (isalpha (ch))
275 lr_error (lr, _("garbage at end of number"));
276 lr_ungetn (lr, 1);
278 return &lr->token;
280 case ';':
281 lr->token.tok = tok_semicolon;
282 return &lr->token;
284 case ',':
285 lr->token.tok = tok_comma;
286 return &lr->token;
288 case '(':
289 lr->token.tok = tok_open_brace;
290 return &lr->token;
292 case ')':
293 lr->token.tok = tok_close_brace;
294 return &lr->token;
296 case '"':
297 return get_string (lr, charmap, locale, repertoire, verbose);
299 case '-':
300 ch = lr_getc (lr);
301 if (ch == '1')
303 lr->token.tok = tok_minus1;
304 return &lr->token;
306 lr_ungetn (lr, 2);
307 break;
310 return get_ident (lr);
314 static struct token *
315 get_toplvl_escape (struct linereader *lr)
317 /* This is supposed to be a numeric value. We return the
318 numerical value and the number of bytes. */
319 size_t start_idx = lr->idx - 1;
320 char *bytes = lr->token.val.charcode.bytes;
321 int nbytes = 0;
322 int ch;
326 unsigned int byte = 0;
327 unsigned int base = 8;
329 ch = lr_getc (lr);
331 if (ch == 'd')
333 base = 10;
334 ch = lr_getc (lr);
336 else if (ch == 'x')
338 base = 16;
339 ch = lr_getc (lr);
342 if ((base == 16 && !isxdigit (ch))
343 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
345 esc_error:
346 lr->token.val.str.startmb = &lr->buf[start_idx];
348 while (ch != EOF && !isspace (ch))
349 ch = lr_getc (lr);
350 lr->token.val.str.lenmb = lr->idx - start_idx;
352 lr->token.tok = tok_error;
353 return &lr->token;
356 if (isdigit (ch))
357 byte = ch - '0';
358 else
359 byte = tolower (ch) - 'a' + 10;
361 ch = lr_getc (lr);
362 if ((base == 16 && !isxdigit (ch))
363 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
364 goto esc_error;
366 byte *= base;
367 if (isdigit (ch))
368 byte += ch - '0';
369 else
370 byte += tolower (ch) - 'a' + 10;
372 ch = lr_getc (lr);
373 if (base != 16 && isdigit (ch))
375 byte *= base;
376 byte += ch - '0';
378 ch = lr_getc (lr);
381 bytes[nbytes++] = byte;
383 while (ch == lr->escape_char
384 && nbytes < sizeof (lr->token.val.charcode.bytes));
386 if (!isspace (ch))
387 lr_error (lr, _("garbage at end of character code specification"));
389 lr_ungetn (lr, 1);
391 lr->token.tok = tok_charcode;
392 lr->token.val.charcode.nbytes = nbytes;
394 return &lr->token;
/* Helpers for building up strings in the functions below.  They work
   on local variables of the function using them:

     buf,  bufact,  bufmax    byte buffer, bytes used, bytes allocated
     buf2, buf2act, buf2max   wide buffer, elements used, elements
                              allocated

   The buffers grow by doubling, so bufmax and buf2max must not be
   zero.  */

/* Append the single byte CH to buf.  */
#define ADDC(ch) \
  do \
    { \
      if (bufact == bufmax) \
        { \
          bufmax *= 2; \
          buf = xrealloc (buf, bufmax); \
        } \
      buf[bufact++] = (ch); \
    } \
  while (0)


/* Append the L bytes starting at S to buf.  If they do not fit, the
   capacity (not the write position) is raised to at least L before it
   is doubled; the result is always large enough for bufact + L
   bytes.  */
#define ADDS(s, l) \
  do \
    { \
      size_t _l = (l); \
      if (bufact + _l > bufmax) \
        { \
          if (bufmax < _l) \
            bufmax = _l; \
          bufmax *= 2; \
          buf = xrealloc (buf, bufmax); \
        } \
      memcpy (&buf[bufact], s, _l); \
      bufact += _l; \
    } \
  while (0)


/* Append the wide character CH to buf2.  buf2max counts elements; each
   element is allocated as 4 bytes.  */
#define ADDWC(ch) \
  do \
    { \
      if (buf2act == buf2max) \
        { \
          buf2max *= 2; \
          buf2 = xrealloc (buf2, buf2max * 4); \
        } \
      buf2[buf2act++] = (ch); \
    } \
  while (0)
441 static struct token *
442 get_symname (struct linereader *lr)
444 /* Symbol in brackets. We must distinguish three kinds:
445 1. reserved words
446 2. ISO 10646 position values
447 3. all other. */
448 char *buf;
449 size_t bufact = 0;
450 size_t bufmax = 56;
451 const struct keyword_t *kw;
452 int ch;
454 buf = (char *) xmalloc (bufmax);
458 ch = lr_getc (lr);
459 if (ch == lr->escape_char)
461 int c2 = lr_getc (lr);
462 ADDC (c2);
464 if (c2 == '\n')
465 ch = '\n';
467 else
468 ADDC (ch);
470 while (ch != '>' && ch != '\n');
472 if (ch == '\n')
473 lr_error (lr, _("unterminated symbolic name"));
475 /* Test for ISO 10646 position value. */
476 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
478 char *cp = buf + 1;
479 while (cp < &buf[bufact - 1] && isxdigit (*cp))
480 ++cp;
482 if (cp == &buf[bufact - 1])
484 /* Yes, it is. */
485 lr->token.tok = tok_ucs4;
486 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
488 return &lr->token;
492 /* It is a symbolic name. Test for reserved words. */
493 kw = lr->hash_fct (buf, bufact - 1);
495 if (kw != NULL && kw->symname_or_ident == 1)
497 lr->token.tok = kw->token;
498 free (buf);
500 else
502 lr->token.tok = tok_bsymbol;
504 buf[bufact] = '\0';
505 buf = xrealloc (buf, bufact + 1);
507 lr->token.val.str.startmb = buf;
508 lr->token.val.str.lenmb = bufact - 1;
511 return &lr->token;
515 static struct token *
516 get_ident (struct linereader *lr)
518 char *buf;
519 size_t bufact;
520 size_t bufmax = 56;
521 const struct keyword_t *kw;
522 int ch;
524 buf = xmalloc (bufmax);
525 bufact = 0;
527 ADDC (lr->buf[lr->idx - 1]);
529 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
530 && ch != '<' && ch != ',' && ch != EOF)
532 if (ch == lr->escape_char)
534 ch = lr_getc (lr);
535 if (ch == '\n' || ch == EOF)
537 lr_error (lr, _("invalid escape sequence"));
538 break;
541 ADDC (ch);
544 lr_ungetc (lr, ch);
546 kw = lr->hash_fct (buf, bufact);
548 if (kw != NULL && kw->symname_or_ident == 0)
550 lr->token.tok = kw->token;
551 free (buf);
553 else
555 lr->token.tok = tok_ident;
557 buf[bufact] = '\0';
558 buf = xrealloc (buf, bufact + 1);
560 lr->token.val.str.startmb = buf;
561 lr->token.val.str.lenmb = bufact;
564 return &lr->token;
568 static struct token *
569 get_string (struct linereader *lr, const struct charmap_t *charmap,
570 struct localedef_t *locale, const struct repertoire_t *repertoire,
571 int verbose)
573 int return_widestr = lr->return_widestr;
574 char *buf;
575 wchar_t *buf2 = NULL;
576 size_t bufact;
577 size_t bufmax = 56;
579 /* We must return two different strings. */
580 buf = xmalloc (bufmax);
581 bufact = 0;
583 /* We know it'll be a string. */
584 lr->token.tok = tok_string;
586 /* If we need not translate the strings (i.e., expand <...> parts)
587 we can run a simple loop. */
588 if (!lr->translate_strings)
590 int ch;
592 buf2 = NULL;
593 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
594 ADDC (ch);
596 /* Catch errors with trailing escape character. */
597 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
598 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
600 lr_error (lr, _("illegal escape sequence at end of string"));
601 --bufact;
603 else if (ch == '\n' || ch == EOF)
604 lr_error (lr, _("unterminated string"));
606 ADDC ('\0');
608 else
610 int illegal_string = 0;
611 size_t buf2act = 0;
612 size_t buf2max = 56 * sizeof (uint32_t);
613 int ch;
614 int warned = 0;
616 /* We have to provide the wide character result as well. */
617 if (return_widestr)
618 buf2 = xmalloc (buf2max);
620 /* Read until the end of the string (or end of the line or file). */
621 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
623 size_t startidx;
624 uint32_t wch;
625 struct charseq *seq;
627 if (ch != '<')
629 /* The standards leave it up to the implementation to decide
630 what to do with character which stand for themself. We
631 could jump through hoops to find out the value relative to
632 the charmap and the repertoire map, but instead we leave
633 it up to the locale definition author to write a better
634 definition. We assume here that every character which
635 stands for itself is encoded using ISO 8859-1. Using the
636 escape character is allowed. */
637 if (ch == lr->escape_char)
639 ch = lr_getc (lr);
640 if (ch == '\n' || ch == EOF)
641 break;
644 if (verbose && !warned)
646 lr_error (lr, _("\
647 non-symbolic character value should not be used"));
648 warned = 1;
651 ADDC (ch);
652 if (return_widestr)
653 ADDWC ((uint32_t) ch);
655 continue;
658 /* Now we have to search for the end of the symbolic name, i.e.,
659 the closing '>'. */
660 startidx = bufact;
661 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
663 if (ch == lr->escape_char)
665 ch = lr_getc (lr);
666 if (ch == '\n' || ch == EOF)
667 break;
669 ADDC (ch);
671 if (ch == '\n' || ch == EOF)
672 /* Not a correct string. */
673 break;
674 if (bufact == startidx)
676 /* <> is no correct name. Ignore it and also signal an
677 error. */
678 illegal_string = 1;
679 continue;
682 /* It might be a Uxxxx symbol. */
683 if (buf[startidx] == 'U'
684 && (bufact - startidx == 5 || bufact - startidx == 9))
686 char *cp = buf + startidx + 1;
687 while (cp < &buf[bufact] && isxdigit (*cp))
688 ++cp;
690 if (cp == &buf[bufact])
692 char utmp[10];
694 /* Yes, it is. */
695 ADDC ('\0');
696 wch = strtoul (buf + startidx + 1, NULL, 16);
698 /* Now forget about the name we just added. */
699 bufact = startidx;
701 if (return_widestr)
702 ADDWC (wch);
704 /* See whether the charmap contains the Uxxxxxxxx names. */
705 snprintf (utmp, sizeof (utmp), "U%08X", wch);
706 seq = charmap_find_value (charmap, utmp, 9);
708 if (seq == NULL)
710 /* No, this isn't the case. Now determine from
711 the repertoire the name of the character and
712 find it in the charmap. */
713 if (repertoire != NULL)
715 const char *symbol;
717 symbol = repertoire_find_symbol (repertoire, wch);
719 if (symbol != NULL)
720 seq = charmap_find_value (charmap, symbol,
721 strlen (symbol));
724 if (seq == NULL)
726 #ifndef NO_TRANSLITERATION
727 /* Transliterate if possible. */
728 if (locale != NULL)
730 uint32_t *translit;
732 if ((locale->avail & CTYPE_LOCALE) == 0)
734 /* Load the CTYPE data now. */
735 int old_needed = locale->needed;
737 locale->needed = 0;
738 locale = load_locale (CTYPE_LOCALE,
739 locale->name,
740 locale->repertoire_name,
741 charmap, locale);
742 locale->needed = old_needed;
745 if ((locale->avail & CTYPE_LOCALE) != 0
746 && ((translit = find_translit (locale,
747 charmap, wch))
748 != NULL))
749 /* The CTYPE data contains a matching
750 transliteration. */
752 int i;
754 for (i = 0; translit[i] != 0; ++i)
756 char utmp[10];
758 snprintf (utmp, sizeof (utmp), "U%08X",
759 translit[i]);
760 seq = charmap_find_value (charmap, utmp,
762 assert (seq != NULL);
763 ADDS (seq->bytes, seq->nbytes);
766 continue;
769 #endif /* NO_TRANSLITERATION */
771 /* Not a known name. */
772 illegal_string = 1;
776 if (seq != NULL)
777 ADDS (seq->bytes, seq->nbytes);
779 continue;
783 /* We now have the symbolic name in buf[startidx] to
784 buf[bufact-1]. Now find out the value for this character
785 in the charmap as well as in the repertoire map (in this
786 order). */
787 seq = charmap_find_value (charmap, &buf[startidx],
788 bufact - startidx);
790 if (seq == NULL)
792 /* This name is not in the charmap. */
793 lr_error (lr, _("symbol `%.*s' not in charmap"),
794 (int) (bufact - startidx), &buf[startidx]);
795 illegal_string = 1;
798 if (return_widestr)
800 /* Now the same for the multibyte representation. */
801 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
802 wch = seq->ucs4;
803 else
805 wch = repertoire_find_value (repertoire, &buf[startidx],
806 bufact - startidx);
807 if (seq != NULL)
808 seq->ucs4 = wch;
811 if (wch == ILLEGAL_CHAR_VALUE)
813 /* This name is not in the repertoire map. */
814 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
815 (int) (bufact - startidx), &buf[startidx]);
816 illegal_string = 1;
818 else
819 ADDWC (wch);
822 /* Now forget about the name we just added. */
823 bufact = startidx;
825 /* And copy the bytes. */
826 if (seq != NULL)
827 ADDS (seq->bytes, seq->nbytes);
830 if (ch == '\n' || ch == EOF)
832 lr_error (lr, _("unterminated string"));
833 illegal_string = 1;
836 if (illegal_string)
838 free (buf);
839 if (buf2 != NULL)
840 free (buf2);
841 lr->token.val.str.startmb = NULL;
842 lr->token.val.str.lenmb = 0;
843 lr->token.val.str.startwc = NULL;
844 lr->token.val.str.lenwc = 0;
846 return &lr->token;
849 ADDC ('\0');
851 if (return_widestr)
853 ADDWC (0);
854 lr->token.val.str.startwc = xrealloc (buf2,
855 buf2act * sizeof (uint32_t));
856 lr->token.val.str.lenwc = buf2act;
860 lr->token.val.str.startmb = xrealloc (buf, bufact);
861 lr->token.val.str.lenmb = bufact;
863 return &lr->token;