[BZ #2509]
[glibc.git] / locale / programs / linereader.c
blob6587f7305bb4015ab39f8e94fe8a086cb0a45bf6
1 /* Copyright (C) 1996-2005, 2006 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License version 2 as
7 published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <libintl.h>
26 #include <stdarg.h>
27 #include <stdlib.h>
28 #include <string.h>
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "locfile.h"
/* Forward declarations of the token scanners that are private to this
   file.  Each one returns a pointer to a token.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
47 struct linereader *
48 lr_open (const char *fname, kw_hash_fct_t hf)
50 FILE *fp;
52 if (fname == NULL || strcmp (fname, "-") == 0
53 || strcmp (fname, "/dev/stdin") == 0)
54 return lr_create (stdin, "<stdin>", hf);
55 else
57 fp = fopen (fname, "rm");
58 if (fp == NULL)
59 return NULL;
60 return lr_create (fp, fname, hf);
64 struct linereader *
65 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
67 struct linereader *result;
68 int n;
70 result = (struct linereader *) xmalloc (sizeof (*result));
72 result->fp = fp;
73 result->fname = xstrdup (fname);
74 result->buf = NULL;
75 result->bufsize = 0;
76 result->lineno = 1;
77 result->idx = 0;
78 result->comment_char = '#';
79 result->escape_char = '\\';
80 result->translate_strings = 1;
81 result->return_widestr = 0;
83 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
84 if (n < 0)
86 int save = errno;
87 fclose (result->fp);
88 free ((char *) result->fname);
89 free (result);
90 errno = save;
91 return NULL;
94 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
95 n -= 2;
97 result->buf[n] = '\0';
98 result->bufact = n;
99 result->hash_fct = hf;
101 return result;
106 lr_eof (struct linereader *lr)
108 return lr->bufact = 0;
112 void
113 lr_ignore_rest (struct linereader *lr, int verbose)
115 if (verbose)
117 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
118 && lr->buf[lr->idx] != lr->comment_char)
119 if (lr->buf[lr->idx] == '\0')
121 if (lr_next (lr) < 0)
122 return;
124 else
125 ++lr->idx;
127 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
128 && lr->buf[lr->idx] != lr->comment_char)
129 lr_error (lr, _("trailing garbage at end of line"));
132 /* Ignore continued line. */
133 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
134 if (lr_next (lr) < 0)
135 break;
137 lr->idx = lr->bufact;
141 void
142 lr_close (struct linereader *lr)
144 fclose (lr->fp);
145 free (lr->buf);
146 free (lr);
151 lr_next (struct linereader *lr)
153 int n;
155 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
156 if (n < 0)
157 return -1;
159 ++lr->lineno;
161 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
163 #if 0
164 /* XXX Is this correct? */
165 /* An escaped newline character is substituted with a single <SP>. */
166 --n;
167 lr->buf[n - 1] = ' ';
168 #else
169 n -= 2;
170 #endif
173 lr->buf[n] = '\0';
174 lr->bufact = n;
175 lr->idx = 0;
177 return 0;
/* Defined in error.c.  This counter is incremented each time `error'
   is called.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
190 struct token *
191 lr_token (struct linereader *lr, const struct charmap_t *charmap,
192 struct localedef_t *locale, const struct repertoire_t *repertoire,
193 int verbose)
195 int ch;
197 while (1)
201 ch = lr_getc (lr);
203 if (ch == EOF)
205 lr->token.tok = tok_eof;
206 return &lr->token;
209 if (ch == '\n')
211 lr->token.tok = tok_eol;
212 return &lr->token;
215 while (isspace (ch));
217 if (ch != lr->comment_char)
218 break;
220 /* Is there an newline at the end of the buffer? */
221 if (lr->buf[lr->bufact - 1] != '\n')
223 /* No. Some people want this to mean that only the line in
224 the file not the logical, concatenated line is ignored.
225 Let's try this. */
226 lr->idx = lr->bufact;
227 continue;
230 /* Ignore rest of line. */
231 lr_ignore_rest (lr, 0);
232 lr->token.tok = tok_eol;
233 return &lr->token;
236 /* Match escape sequences. */
237 if (ch == lr->escape_char)
238 return get_toplvl_escape (lr);
240 /* Match ellipsis. */
241 if (ch == '.')
243 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
245 int cnt;
246 for (cnt = 0; cnt < 10; ++cnt)
247 lr_getc (lr);
248 lr->token.tok = tok_ellipsis4_2;
249 return &lr->token;
251 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
253 lr_getc (lr);
254 lr_getc (lr);
255 lr_getc (lr);
256 lr->token.tok = tok_ellipsis4;
257 return &lr->token;
259 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
261 lr_getc (lr);
262 lr_getc (lr);
263 lr->token.tok = tok_ellipsis3;
264 return &lr->token;
266 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
268 int cnt;
269 for (cnt = 0; cnt < 6; ++cnt)
270 lr_getc (lr);
271 lr->token.tok = tok_ellipsis2_2;
272 return &lr->token;
274 if (lr->buf[lr->idx] == '.')
276 lr_getc (lr);
277 lr->token.tok = tok_ellipsis2;
278 return &lr->token;
282 switch (ch)
284 case '<':
285 return get_symname (lr);
287 case '0' ... '9':
288 lr->token.tok = tok_number;
289 lr->token.val.num = ch - '0';
291 while (isdigit (ch = lr_getc (lr)))
293 lr->token.val.num *= 10;
294 lr->token.val.num += ch - '0';
296 if (isalpha (ch))
297 lr_error (lr, _("garbage at end of number"));
298 lr_ungetn (lr, 1);
300 return &lr->token;
302 case ';':
303 lr->token.tok = tok_semicolon;
304 return &lr->token;
306 case ',':
307 lr->token.tok = tok_comma;
308 return &lr->token;
310 case '(':
311 lr->token.tok = tok_open_brace;
312 return &lr->token;
314 case ')':
315 lr->token.tok = tok_close_brace;
316 return &lr->token;
318 case '"':
319 return get_string (lr, charmap, locale, repertoire, verbose);
321 case '-':
322 ch = lr_getc (lr);
323 if (ch == '1')
325 lr->token.tok = tok_minus1;
326 return &lr->token;
328 lr_ungetn (lr, 2);
329 break;
332 return get_ident (lr);
336 static struct token *
337 get_toplvl_escape (struct linereader *lr)
339 /* This is supposed to be a numeric value. We return the
340 numerical value and the number of bytes. */
341 size_t start_idx = lr->idx - 1;
342 unsigned char *bytes = lr->token.val.charcode.bytes;
343 size_t nbytes = 0;
344 int ch;
348 unsigned int byte = 0;
349 unsigned int base = 8;
351 ch = lr_getc (lr);
353 if (ch == 'd')
355 base = 10;
356 ch = lr_getc (lr);
358 else if (ch == 'x')
360 base = 16;
361 ch = lr_getc (lr);
364 if ((base == 16 && !isxdigit (ch))
365 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
367 esc_error:
368 lr->token.val.str.startmb = &lr->buf[start_idx];
370 while (ch != EOF && !isspace (ch))
371 ch = lr_getc (lr);
372 lr->token.val.str.lenmb = lr->idx - start_idx;
374 lr->token.tok = tok_error;
375 return &lr->token;
378 if (isdigit (ch))
379 byte = ch - '0';
380 else
381 byte = tolower (ch) - 'a' + 10;
383 ch = lr_getc (lr);
384 if ((base == 16 && !isxdigit (ch))
385 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
386 goto esc_error;
388 byte *= base;
389 if (isdigit (ch))
390 byte += ch - '0';
391 else
392 byte += tolower (ch) - 'a' + 10;
394 ch = lr_getc (lr);
395 if (base != 16 && isdigit (ch))
397 byte *= base;
398 byte += ch - '0';
400 ch = lr_getc (lr);
403 bytes[nbytes++] = byte;
405 while (ch == lr->escape_char
406 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
408 if (!isspace (ch))
409 lr_error (lr, _("garbage at end of character code specification"));
411 lr_ungetn (lr, 1);
413 lr->token.tok = tok_charcode;
414 lr->token.val.charcode.nbytes = nbytes;
416 return &lr->token;
/* Append the single byte CH to the byte buffer described by the local
   variables buf, bufact (bytes used) and bufmax (bytes allocated).  The
   allocation is doubled when the buffer is full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufmax == bufact)                                                   \
        {                                                                     \
          bufmax += bufmax;                                                   \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact] = (ch);                                                     \
      ++bufact;                                                               \
    }                                                                         \
  while (0)
/* Append the L bytes starting at S to the byte buffer described by the
   local variables buf, bufact (bytes used) and bufmax (bytes
   allocated).

   When the data does not fit, the capacity is first raised to at least
   L and then doubled, which always leaves room for bufact + L bytes
   because bufact never exceeds the old capacity.  bufact itself is
   only changed by the final addition, so data already in the buffer
   stays where it is.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          if (bufmax < _l)                                                    \
            bufmax = _l;                                                      \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], s, _l);                                           \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)
/* Append the wide character CH to the wide buffer described by the
   local variables buf2, buf2act (elements used) and buf2max (elements
   allocated).  The allocation is doubled when the buffer is full; each
   element is taken to be 4 bytes.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2max == buf2act)                                                 \
        {                                                                     \
          buf2max += buf2max;                                                 \
          buf2 = xrealloc (buf2, 4 * buf2max);                                \
        }                                                                     \
      buf2[buf2act] = (ch);                                                   \
      ++buf2act;                                                              \
    }                                                                         \
  while (0)
463 static struct token *
464 get_symname (struct linereader *lr)
466 /* Symbol in brackets. We must distinguish three kinds:
467 1. reserved words
468 2. ISO 10646 position values
469 3. all other. */
470 char *buf;
471 size_t bufact = 0;
472 size_t bufmax = 56;
473 const struct keyword_t *kw;
474 int ch;
476 buf = (char *) xmalloc (bufmax);
480 ch = lr_getc (lr);
481 if (ch == lr->escape_char)
483 int c2 = lr_getc (lr);
484 ADDC (c2);
486 if (c2 == '\n')
487 ch = '\n';
489 else
490 ADDC (ch);
492 while (ch != '>' && ch != '\n');
494 if (ch == '\n')
495 lr_error (lr, _("unterminated symbolic name"));
497 /* Test for ISO 10646 position value. */
498 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
500 char *cp = buf + 1;
501 while (cp < &buf[bufact - 1] && isxdigit (*cp))
502 ++cp;
504 if (cp == &buf[bufact - 1])
506 /* Yes, it is. */
507 lr->token.tok = tok_ucs4;
508 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
510 return &lr->token;
514 /* It is a symbolic name. Test for reserved words. */
515 kw = lr->hash_fct (buf, bufact - 1);
517 if (kw != NULL && kw->symname_or_ident == 1)
519 lr->token.tok = kw->token;
520 free (buf);
522 else
524 lr->token.tok = tok_bsymbol;
526 buf[bufact] = '\0';
527 buf = xrealloc (buf, bufact + 1);
529 lr->token.val.str.startmb = buf;
530 lr->token.val.str.lenmb = bufact - 1;
533 return &lr->token;
537 static struct token *
538 get_ident (struct linereader *lr)
540 char *buf;
541 size_t bufact;
542 size_t bufmax = 56;
543 const struct keyword_t *kw;
544 int ch;
546 buf = xmalloc (bufmax);
547 bufact = 0;
549 ADDC (lr->buf[lr->idx - 1]);
551 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
552 && ch != '<' && ch != ',' && ch != EOF)
554 if (ch == lr->escape_char)
556 ch = lr_getc (lr);
557 if (ch == '\n' || ch == EOF)
559 lr_error (lr, _("invalid escape sequence"));
560 break;
563 ADDC (ch);
566 lr_ungetc (lr, ch);
568 kw = lr->hash_fct (buf, bufact);
570 if (kw != NULL && kw->symname_or_ident == 0)
572 lr->token.tok = kw->token;
573 free (buf);
575 else
577 lr->token.tok = tok_ident;
579 buf[bufact] = '\0';
580 buf = xrealloc (buf, bufact + 1);
582 lr->token.val.str.startmb = buf;
583 lr->token.val.str.lenmb = bufact;
586 return &lr->token;
590 static struct token *
591 get_string (struct linereader *lr, const struct charmap_t *charmap,
592 struct localedef_t *locale, const struct repertoire_t *repertoire,
593 int verbose)
595 int return_widestr = lr->return_widestr;
596 char *buf;
597 wchar_t *buf2 = NULL;
598 size_t bufact;
599 size_t bufmax = 56;
601 /* We must return two different strings. */
602 buf = xmalloc (bufmax);
603 bufact = 0;
605 /* We know it'll be a string. */
606 lr->token.tok = tok_string;
608 /* If we need not translate the strings (i.e., expand <...> parts)
609 we can run a simple loop. */
610 if (!lr->translate_strings)
612 int ch;
614 buf2 = NULL;
615 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
616 ADDC (ch);
618 /* Catch errors with trailing escape character. */
619 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
620 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
622 lr_error (lr, _("illegal escape sequence at end of string"));
623 --bufact;
625 else if (ch == '\n' || ch == EOF)
626 lr_error (lr, _("unterminated string"));
628 ADDC ('\0');
630 else
632 int illegal_string = 0;
633 size_t buf2act = 0;
634 size_t buf2max = 56 * sizeof (uint32_t);
635 int ch;
636 int warned = 0;
638 /* We have to provide the wide character result as well. */
639 if (return_widestr)
640 buf2 = xmalloc (buf2max);
642 /* Read until the end of the string (or end of the line or file). */
643 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
645 size_t startidx;
646 uint32_t wch;
647 struct charseq *seq;
649 if (ch != '<')
651 /* The standards leave it up to the implementation to decide
652 what to do with character which stand for themself. We
653 could jump through hoops to find out the value relative to
654 the charmap and the repertoire map, but instead we leave
655 it up to the locale definition author to write a better
656 definition. We assume here that every character which
657 stands for itself is encoded using ISO 8859-1. Using the
658 escape character is allowed. */
659 if (ch == lr->escape_char)
661 ch = lr_getc (lr);
662 if (ch == '\n' || ch == EOF)
663 break;
666 if (verbose && !warned)
668 lr_error (lr, _("\
669 non-symbolic character value should not be used"));
670 warned = 1;
673 ADDC (ch);
674 if (return_widestr)
675 ADDWC ((uint32_t) ch);
677 continue;
680 /* Now we have to search for the end of the symbolic name, i.e.,
681 the closing '>'. */
682 startidx = bufact;
683 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
685 if (ch == lr->escape_char)
687 ch = lr_getc (lr);
688 if (ch == '\n' || ch == EOF)
689 break;
691 ADDC (ch);
693 if (ch == '\n' || ch == EOF)
694 /* Not a correct string. */
695 break;
696 if (bufact == startidx)
698 /* <> is no correct name. Ignore it and also signal an
699 error. */
700 illegal_string = 1;
701 continue;
704 /* It might be a Uxxxx symbol. */
705 if (buf[startidx] == 'U'
706 && (bufact - startidx == 5 || bufact - startidx == 9))
708 char *cp = buf + startidx + 1;
709 while (cp < &buf[bufact] && isxdigit (*cp))
710 ++cp;
712 if (cp == &buf[bufact])
714 char utmp[10];
716 /* Yes, it is. */
717 ADDC ('\0');
718 wch = strtoul (buf + startidx + 1, NULL, 16);
720 /* Now forget about the name we just added. */
721 bufact = startidx;
723 if (return_widestr)
724 ADDWC (wch);
726 /* See whether the charmap contains the Uxxxxxxxx names. */
727 snprintf (utmp, sizeof (utmp), "U%08X", wch);
728 seq = charmap_find_value (charmap, utmp, 9);
730 if (seq == NULL)
732 /* No, this isn't the case. Now determine from
733 the repertoire the name of the character and
734 find it in the charmap. */
735 if (repertoire != NULL)
737 const char *symbol;
739 symbol = repertoire_find_symbol (repertoire, wch);
741 if (symbol != NULL)
742 seq = charmap_find_value (charmap, symbol,
743 strlen (symbol));
746 if (seq == NULL)
748 #ifndef NO_TRANSLITERATION
749 /* Transliterate if possible. */
750 if (locale != NULL)
752 uint32_t *translit;
754 if ((locale->avail & CTYPE_LOCALE) == 0)
756 /* Load the CTYPE data now. */
757 int old_needed = locale->needed;
759 locale->needed = 0;
760 locale = load_locale (LC_CTYPE,
761 locale->name,
762 locale->repertoire_name,
763 charmap, locale);
764 locale->needed = old_needed;
767 if ((locale->avail & CTYPE_LOCALE) != 0
768 && ((translit = find_translit (locale,
769 charmap, wch))
770 != NULL))
771 /* The CTYPE data contains a matching
772 transliteration. */
774 int i;
776 for (i = 0; translit[i] != 0; ++i)
778 char utmp[10];
780 snprintf (utmp, sizeof (utmp), "U%08X",
781 translit[i]);
782 seq = charmap_find_value (charmap, utmp,
784 assert (seq != NULL);
785 ADDS (seq->bytes, seq->nbytes);
788 continue;
791 #endif /* NO_TRANSLITERATION */
793 /* Not a known name. */
794 illegal_string = 1;
798 if (seq != NULL)
799 ADDS (seq->bytes, seq->nbytes);
801 continue;
805 /* We now have the symbolic name in buf[startidx] to
806 buf[bufact-1]. Now find out the value for this character
807 in the charmap as well as in the repertoire map (in this
808 order). */
809 seq = charmap_find_value (charmap, &buf[startidx],
810 bufact - startidx);
812 if (seq == NULL)
814 /* This name is not in the charmap. */
815 lr_error (lr, _("symbol `%.*s' not in charmap"),
816 (int) (bufact - startidx), &buf[startidx]);
817 illegal_string = 1;
820 if (return_widestr)
822 /* Now the same for the multibyte representation. */
823 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
824 wch = seq->ucs4;
825 else
827 wch = repertoire_find_value (repertoire, &buf[startidx],
828 bufact - startidx);
829 if (seq != NULL)
830 seq->ucs4 = wch;
833 if (wch == ILLEGAL_CHAR_VALUE)
835 /* This name is not in the repertoire map. */
836 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
837 (int) (bufact - startidx), &buf[startidx]);
838 illegal_string = 1;
840 else
841 ADDWC (wch);
844 /* Now forget about the name we just added. */
845 bufact = startidx;
847 /* And copy the bytes. */
848 if (seq != NULL)
849 ADDS (seq->bytes, seq->nbytes);
852 if (ch == '\n' || ch == EOF)
854 lr_error (lr, _("unterminated string"));
855 illegal_string = 1;
858 if (illegal_string)
860 free (buf);
861 if (buf2 != NULL)
862 free (buf2);
863 lr->token.val.str.startmb = NULL;
864 lr->token.val.str.lenmb = 0;
865 lr->token.val.str.startwc = NULL;
866 lr->token.val.str.lenwc = 0;
868 return &lr->token;
871 ADDC ('\0');
873 if (return_widestr)
875 ADDWC (0);
876 lr->token.val.str.startwc = xrealloc (buf2,
877 buf2act * sizeof (uint32_t));
878 lr->token.val.str.lenwc = buf2act;
882 lr->token.val.str.startmb = xrealloc (buf, bufact);
883 lr->token.val.str.lenmb = bufact;
885 return &lr->token;