Update.
[glibc.git] / locale / programs / linereader.c
blob233799a87de6ae9f01f9e4b988ffe649292cd04b
1 /* Copyright (C) 1996-2001, 2002, 2003, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <assert.h>
25 #include <ctype.h>
26 #include <errno.h>
27 #include <libintl.h>
28 #include <stdarg.h>
29 #include <stdlib.h>
30 #include <string.h>
32 #include "localedef.h"
33 #include "charmap.h"
34 #include "error.h"
35 #include "linereader.h"
36 #include "locfile.h"
/* Forward declarations of the file-local tokenizer helpers.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
49 struct linereader *
50 lr_open (const char *fname, kw_hash_fct_t hf)
52 FILE *fp;
54 if (fname == NULL || strcmp (fname, "-") == 0
55 || strcmp (fname, "/dev/stdin") == 0)
56 return lr_create (stdin, "<stdin>", hf);
57 else
59 fp = fopen (fname, "rm");
60 if (fp == NULL)
61 return NULL;
62 return lr_create (fp, fname, hf);
66 struct linereader *
67 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
69 struct linereader *result;
70 int n;
72 result = (struct linereader *) xmalloc (sizeof (*result));
74 result->fp = fp;
75 result->fname = xstrdup (fname);
76 result->buf = NULL;
77 result->bufsize = 0;
78 result->lineno = 1;
79 result->idx = 0;
80 result->comment_char = '#';
81 result->escape_char = '\\';
82 result->translate_strings = 1;
83 result->return_widestr = 0;
85 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
86 if (n < 0)
88 int save = errno;
89 fclose (result->fp);
90 free ((char *) result->fname);
91 free (result);
92 errno = save;
93 return NULL;
96 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
97 n -= 2;
99 result->buf[n] = '\0';
100 result->bufact = n;
101 result->hash_fct = hf;
103 return result;
108 lr_eof (struct linereader *lr)
110 return lr->bufact = 0;
114 void
115 lr_ignore_rest (struct linereader *lr, int verbose)
117 if (verbose)
119 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
120 && lr->buf[lr->idx] != lr->comment_char)
121 if (lr->buf[lr->idx] == '\0')
123 if (lr_next (lr) < 0)
124 return;
126 else
127 ++lr->idx;
129 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
130 && lr->buf[lr->idx] != lr->comment_char)
131 lr_error (lr, _("trailing garbage at end of line"));
134 /* Ignore continued line. */
135 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
136 if (lr_next (lr) < 0)
137 break;
139 lr->idx = lr->bufact;
143 void
144 lr_close (struct linereader *lr)
146 fclose (lr->fp);
147 free (lr->buf);
148 free (lr);
153 lr_next (struct linereader *lr)
155 int n;
157 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
158 if (n < 0)
159 return -1;
161 ++lr->lineno;
163 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
165 #if 0
166 /* XXX Is this correct? */
167 /* An escaped newline character is substituted with a single <SP>. */
168 --n;
169 lr->buf[n - 1] = ' ';
170 #else
171 n -= 2;
172 #endif
175 lr->buf[n] = '\0';
176 lr->bufact = n;
177 lr->idx = 0;
179 return 0;
/* Defined in error.c: incremented each time `error' is called.  */
extern unsigned int error_message_count;

/* Name of the executing program; the calling program should define
   this variable and set it.  */
extern char *program_name;
192 struct token *
193 lr_token (struct linereader *lr, const struct charmap_t *charmap,
194 struct localedef_t *locale, const struct repertoire_t *repertoire,
195 int verbose)
197 int ch;
199 while (1)
203 ch = lr_getc (lr);
205 if (ch == EOF)
207 lr->token.tok = tok_eof;
208 return &lr->token;
211 if (ch == '\n')
213 lr->token.tok = tok_eol;
214 return &lr->token;
217 while (isspace (ch));
219 if (ch == EOF)
221 lr->token.tok = tok_eof;
222 return &lr->token;
225 if (ch != lr->comment_char)
226 break;
228 /* Is there an newline at the end of the buffer? */
229 if (lr->buf[lr->bufact - 1] != '\n')
231 /* No. Some people want this to mean that only the line in
232 the file not the logical, concatenated line is ignored.
233 Let's try this. */
234 lr->idx = lr->bufact;
235 continue;
238 /* Ignore rest of line. */
239 lr_ignore_rest (lr, 0);
240 lr->token.tok = tok_eol;
241 return &lr->token;
244 /* Match escape sequences. */
245 if (ch == lr->escape_char)
246 return get_toplvl_escape (lr);
248 /* Match ellipsis. */
249 if (ch == '.')
251 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
253 int cnt;
254 for (cnt = 0; cnt < 10; ++cnt)
255 lr_getc (lr);
256 lr->token.tok = tok_ellipsis4_2;
257 return &lr->token;
259 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
261 lr_getc (lr);
262 lr_getc (lr);
263 lr_getc (lr);
264 lr->token.tok = tok_ellipsis4;
265 return &lr->token;
267 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
269 lr_getc (lr);
270 lr_getc (lr);
271 lr->token.tok = tok_ellipsis3;
272 return &lr->token;
274 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
276 int cnt;
277 for (cnt = 0; cnt < 6; ++cnt)
278 lr_getc (lr);
279 lr->token.tok = tok_ellipsis2_2;
280 return &lr->token;
282 if (lr->buf[lr->idx] == '.')
284 lr_getc (lr);
285 lr->token.tok = tok_ellipsis2;
286 return &lr->token;
290 switch (ch)
292 case '<':
293 return get_symname (lr);
295 case '0' ... '9':
296 lr->token.tok = tok_number;
297 lr->token.val.num = ch - '0';
299 while (isdigit (ch = lr_getc (lr)))
301 lr->token.val.num *= 10;
302 lr->token.val.num += ch - '0';
304 if (isalpha (ch))
305 lr_error (lr, _("garbage at end of number"));
306 lr_ungetn (lr, 1);
308 return &lr->token;
310 case ';':
311 lr->token.tok = tok_semicolon;
312 return &lr->token;
314 case ',':
315 lr->token.tok = tok_comma;
316 return &lr->token;
318 case '(':
319 lr->token.tok = tok_open_brace;
320 return &lr->token;
322 case ')':
323 lr->token.tok = tok_close_brace;
324 return &lr->token;
326 case '"':
327 return get_string (lr, charmap, locale, repertoire, verbose);
329 case '-':
330 ch = lr_getc (lr);
331 if (ch == '1')
333 lr->token.tok = tok_minus1;
334 return &lr->token;
336 lr_ungetn (lr, 2);
337 break;
340 return get_ident (lr);
344 static struct token *
345 get_toplvl_escape (struct linereader *lr)
347 /* This is supposed to be a numeric value. We return the
348 numerical value and the number of bytes. */
349 size_t start_idx = lr->idx - 1;
350 char *bytes = lr->token.val.charcode.bytes;
351 int nbytes = 0;
352 int ch;
356 unsigned int byte = 0;
357 unsigned int base = 8;
359 ch = lr_getc (lr);
361 if (ch == 'd')
363 base = 10;
364 ch = lr_getc (lr);
366 else if (ch == 'x')
368 base = 16;
369 ch = lr_getc (lr);
372 if ((base == 16 && !isxdigit (ch))
373 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
375 esc_error:
376 lr->token.val.str.startmb = &lr->buf[start_idx];
378 while (ch != EOF && !isspace (ch))
379 ch = lr_getc (lr);
380 lr->token.val.str.lenmb = lr->idx - start_idx;
382 lr->token.tok = tok_error;
383 return &lr->token;
386 if (isdigit (ch))
387 byte = ch - '0';
388 else
389 byte = tolower (ch) - 'a' + 10;
391 ch = lr_getc (lr);
392 if ((base == 16 && !isxdigit (ch))
393 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
394 goto esc_error;
396 byte *= base;
397 if (isdigit (ch))
398 byte += ch - '0';
399 else
400 byte += tolower (ch) - 'a' + 10;
402 ch = lr_getc (lr);
403 if (base != 16 && isdigit (ch))
405 byte *= base;
406 byte += ch - '0';
408 ch = lr_getc (lr);
411 bytes[nbytes++] = byte;
413 while (ch == lr->escape_char
414 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
416 if (!isspace (ch))
417 lr_error (lr, _("garbage at end of character code specification"));
419 lr_ungetn (lr, 1);
421 lr->token.tok = tok_charcode;
422 lr->token.val.charcode.nbytes = nbytes;
424 return &lr->token;
/* Append the byte CH to the growing buffer of the enclosing function.
   The macro uses that function's locals `buf' (storage), `bufact'
   (bytes in use) and `bufmax' (capacity); when the buffer is full the
   capacity is doubled with xrealloc before CH is stored.  */
#define ADDC(ch) \
  do \
    { \
      if (bufact == bufmax) \
        { \
          bufmax *= 2; \
          buf = xrealloc (buf, bufmax); \
        } \
      buf[bufact++] = (ch); \
    } \
  while (0)
/* Append the L bytes starting at S to the growing buffer of the
   enclosing function.  The macro uses that function's locals `buf'
   (storage), `bufact' (bytes in use) and `bufmax' (capacity).

   If the bytes do not fit, the capacity is doubled, or raised to
   exactly the required size when doubling is not enough, and the
   buffer is reallocated with xrealloc.  The fill level only ever
   advances by L, so no gap is left in front of the copied bytes.  */
#define ADDS(s, l) \
  do \
    { \
      size_t _l = (l); \
      size_t _need = bufact + _l; \
      if (_need > bufmax) \
        { \
          bufmax *= 2; \
          if (bufmax < _need) \
            bufmax = _need; \
          buf = xrealloc (buf, bufmax); \
        } \
      memcpy (&buf[bufact], (s), _l); \
      bufact += _l; \
    } \
  while (0)
/* Append the wide character CH to the growing wide-character buffer of
   the enclosing function.  The macro uses that function's locals
   `buf2' (storage), `buf2act' (elements in use) and `buf2max'
   (capacity, counted in elements).  When the buffer is full the
   capacity is doubled; the byte size passed to xrealloc is derived
   from the element type of `buf2'.  */
#define ADDWC(ch) \
  do \
    { \
      if (buf2act == buf2max) \
        { \
          buf2max *= 2; \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2)); \
        } \
      buf2[buf2act++] = (ch); \
    } \
  while (0)
471 static struct token *
472 get_symname (struct linereader *lr)
474 /* Symbol in brackets. We must distinguish three kinds:
475 1. reserved words
476 2. ISO 10646 position values
477 3. all other. */
478 char *buf;
479 size_t bufact = 0;
480 size_t bufmax = 56;
481 const struct keyword_t *kw;
482 int ch;
484 buf = (char *) xmalloc (bufmax);
488 ch = lr_getc (lr);
489 if (ch == lr->escape_char)
491 int c2 = lr_getc (lr);
492 ADDC (c2);
494 if (c2 == '\n')
495 ch = '\n';
497 else
498 ADDC (ch);
500 while (ch != '>' && ch != '\n');
502 if (ch == '\n')
503 lr_error (lr, _("unterminated symbolic name"));
505 /* Test for ISO 10646 position value. */
506 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
508 char *cp = buf + 1;
509 while (cp < &buf[bufact - 1] && isxdigit (*cp))
510 ++cp;
512 if (cp == &buf[bufact - 1])
514 /* Yes, it is. */
515 lr->token.tok = tok_ucs4;
516 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
518 return &lr->token;
522 /* It is a symbolic name. Test for reserved words. */
523 kw = lr->hash_fct (buf, bufact - 1);
525 if (kw != NULL && kw->symname_or_ident == 1)
527 lr->token.tok = kw->token;
528 free (buf);
530 else
532 lr->token.tok = tok_bsymbol;
534 buf[bufact] = '\0';
535 buf = xrealloc (buf, bufact + 1);
537 lr->token.val.str.startmb = buf;
538 lr->token.val.str.lenmb = bufact - 1;
541 return &lr->token;
545 static struct token *
546 get_ident (struct linereader *lr)
548 char *buf;
549 size_t bufact;
550 size_t bufmax = 56;
551 const struct keyword_t *kw;
552 int ch;
554 buf = xmalloc (bufmax);
555 bufact = 0;
557 ADDC (lr->buf[lr->idx - 1]);
559 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
560 && ch != '<' && ch != ',' && ch != EOF)
562 if (ch == lr->escape_char)
564 ch = lr_getc (lr);
565 if (ch == '\n' || ch == EOF)
567 lr_error (lr, _("invalid escape sequence"));
568 break;
571 ADDC (ch);
574 lr_ungetc (lr, ch);
576 kw = lr->hash_fct (buf, bufact);
578 if (kw != NULL && kw->symname_or_ident == 0)
580 lr->token.tok = kw->token;
581 free (buf);
583 else
585 lr->token.tok = tok_ident;
587 buf[bufact] = '\0';
588 buf = xrealloc (buf, bufact + 1);
590 lr->token.val.str.startmb = buf;
591 lr->token.val.str.lenmb = bufact;
594 return &lr->token;
598 static struct token *
599 get_string (struct linereader *lr, const struct charmap_t *charmap,
600 struct localedef_t *locale, const struct repertoire_t *repertoire,
601 int verbose)
603 int return_widestr = lr->return_widestr;
604 char *buf;
605 wchar_t *buf2 = NULL;
606 size_t bufact;
607 size_t bufmax = 56;
609 /* We must return two different strings. */
610 buf = xmalloc (bufmax);
611 bufact = 0;
613 /* We know it'll be a string. */
614 lr->token.tok = tok_string;
616 /* If we need not translate the strings (i.e., expand <...> parts)
617 we can run a simple loop. */
618 if (!lr->translate_strings)
620 int ch;
622 buf2 = NULL;
623 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
624 ADDC (ch);
626 /* Catch errors with trailing escape character. */
627 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
628 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
630 lr_error (lr, _("illegal escape sequence at end of string"));
631 --bufact;
633 else if (ch == '\n' || ch == EOF)
634 lr_error (lr, _("unterminated string"));
636 ADDC ('\0');
638 else
640 int illegal_string = 0;
641 size_t buf2act = 0;
642 size_t buf2max = 56 * sizeof (uint32_t);
643 int ch;
644 int warned = 0;
646 /* We have to provide the wide character result as well. */
647 if (return_widestr)
648 buf2 = xmalloc (buf2max);
650 /* Read until the end of the string (or end of the line or file). */
651 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
653 size_t startidx;
654 uint32_t wch;
655 struct charseq *seq;
657 if (ch != '<')
659 /* The standards leave it up to the implementation to decide
660 what to do with character which stand for themself. We
661 could jump through hoops to find out the value relative to
662 the charmap and the repertoire map, but instead we leave
663 it up to the locale definition author to write a better
664 definition. We assume here that every character which
665 stands for itself is encoded using ISO 8859-1. Using the
666 escape character is allowed. */
667 if (ch == lr->escape_char)
669 ch = lr_getc (lr);
670 if (ch == '\n' || ch == EOF)
671 break;
674 if (verbose && !warned)
676 lr_error (lr, _("\
677 non-symbolic character value should not be used"));
678 warned = 1;
681 ADDC (ch);
682 if (return_widestr)
683 ADDWC ((uint32_t) ch);
685 continue;
688 /* Now we have to search for the end of the symbolic name, i.e.,
689 the closing '>'. */
690 startidx = bufact;
691 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
693 if (ch == lr->escape_char)
695 ch = lr_getc (lr);
696 if (ch == '\n' || ch == EOF)
697 break;
699 ADDC (ch);
701 if (ch == '\n' || ch == EOF)
702 /* Not a correct string. */
703 break;
704 if (bufact == startidx)
706 /* <> is no correct name. Ignore it and also signal an
707 error. */
708 illegal_string = 1;
709 continue;
712 /* It might be a Uxxxx symbol. */
713 if (buf[startidx] == 'U'
714 && (bufact - startidx == 5 || bufact - startidx == 9))
716 char *cp = buf + startidx + 1;
717 while (cp < &buf[bufact] && isxdigit (*cp))
718 ++cp;
720 if (cp == &buf[bufact])
722 char utmp[10];
724 /* Yes, it is. */
725 ADDC ('\0');
726 wch = strtoul (buf + startidx + 1, NULL, 16);
728 /* Now forget about the name we just added. */
729 bufact = startidx;
731 if (return_widestr)
732 ADDWC (wch);
734 /* See whether the charmap contains the Uxxxxxxxx names. */
735 snprintf (utmp, sizeof (utmp), "U%08X", wch);
736 seq = charmap_find_value (charmap, utmp, 9);
738 if (seq == NULL)
740 /* No, this isn't the case. Now determine from
741 the repertoire the name of the character and
742 find it in the charmap. */
743 if (repertoire != NULL)
745 const char *symbol;
747 symbol = repertoire_find_symbol (repertoire, wch);
749 if (symbol != NULL)
750 seq = charmap_find_value (charmap, symbol,
751 strlen (symbol));
754 if (seq == NULL)
756 #ifndef NO_TRANSLITERATION
757 /* Transliterate if possible. */
758 if (locale != NULL)
760 uint32_t *translit;
762 if ((locale->avail & CTYPE_LOCALE) == 0)
764 /* Load the CTYPE data now. */
765 int old_needed = locale->needed;
767 locale->needed = 0;
768 locale = load_locale (LC_CTYPE,
769 locale->name,
770 locale->repertoire_name,
771 charmap, locale);
772 locale->needed = old_needed;
775 if ((locale->avail & CTYPE_LOCALE) != 0
776 && ((translit = find_translit (locale,
777 charmap, wch))
778 != NULL))
779 /* The CTYPE data contains a matching
780 transliteration. */
782 int i;
784 for (i = 0; translit[i] != 0; ++i)
786 char utmp[10];
788 snprintf (utmp, sizeof (utmp), "U%08X",
789 translit[i]);
790 seq = charmap_find_value (charmap, utmp,
792 assert (seq != NULL);
793 ADDS (seq->bytes, seq->nbytes);
796 continue;
799 #endif /* NO_TRANSLITERATION */
801 /* Not a known name. */
802 illegal_string = 1;
806 if (seq != NULL)
807 ADDS (seq->bytes, seq->nbytes);
809 continue;
813 /* We now have the symbolic name in buf[startidx] to
814 buf[bufact-1]. Now find out the value for this character
815 in the charmap as well as in the repertoire map (in this
816 order). */
817 seq = charmap_find_value (charmap, &buf[startidx],
818 bufact - startidx);
820 if (seq == NULL)
822 /* This name is not in the charmap. */
823 lr_error (lr, _("symbol `%.*s' not in charmap"),
824 (int) (bufact - startidx), &buf[startidx]);
825 illegal_string = 1;
828 if (return_widestr)
830 /* Now the same for the multibyte representation. */
831 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
832 wch = seq->ucs4;
833 else
835 wch = repertoire_find_value (repertoire, &buf[startidx],
836 bufact - startidx);
837 if (seq != NULL)
838 seq->ucs4 = wch;
841 if (wch == ILLEGAL_CHAR_VALUE)
843 /* This name is not in the repertoire map. */
844 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
845 (int) (bufact - startidx), &buf[startidx]);
846 illegal_string = 1;
848 else
849 ADDWC (wch);
852 /* Now forget about the name we just added. */
853 bufact = startidx;
855 /* And copy the bytes. */
856 if (seq != NULL)
857 ADDS (seq->bytes, seq->nbytes);
860 if (ch == '\n' || ch == EOF)
862 lr_error (lr, _("unterminated string"));
863 illegal_string = 1;
866 if (illegal_string)
868 free (buf);
869 if (buf2 != NULL)
870 free (buf2);
871 lr->token.val.str.startmb = NULL;
872 lr->token.val.str.lenmb = 0;
873 lr->token.val.str.startwc = NULL;
874 lr->token.val.str.lenwc = 0;
876 return &lr->token;
879 ADDC ('\0');
881 if (return_widestr)
883 ADDWC (0);
884 lr->token.val.str.startwc = xrealloc (buf2,
885 buf2act * sizeof (uint32_t));
886 lr->token.val.str.lenwc = buf2act;
890 lr->token.val.str.startmb = xrealloc (buf, bufact);
891 lr->token.val.str.lenmb = bufact;
893 return &lr->token;