Update.
[glibc.git] / locale / programs / linereader.c
blob6237094f0743266853b3bfbceea2322c0ed804b1
1 /* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "localedef.h"
/* Prototypes for the token scanners local to this file.  Each one is
   entered from lr_token after the first character of the token has
   been consumed.  */

/* Numeric character code introduced by the escape character.  */
static struct token *get_toplvl_escape (struct linereader *lr);

/* Symbolic name enclosed in `<' and `>'.  */
static struct token *get_symname (struct linereader *lr);

/* Identifier or keyword.  */
static struct token *get_ident (struct linereader *lr);

/* Double-quoted string.  */
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 const struct repertoire_t *repertoire);
46 struct linereader *
47 lr_open (const char *fname, kw_hash_fct_t hf)
49 FILE *fp;
50 struct linereader *result;
51 int n;
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
55 fp = stdin;
56 else
58 fp = fopen (fname, "r");
59 if (fp == NULL)
60 return NULL;
63 result = (struct linereader *) xmalloc (sizeof (*result));
65 result->fp = fp;
66 result->fname = xstrdup (fname ? : "<stdin>");
67 result->buf = NULL;
68 result->bufsize = 0;
69 result->lineno = 1;
70 result->idx = 0;
71 result->comment_char = '#';
72 result->escape_char = '\\';
73 result->translate_strings = 1;
75 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
76 if (n < 0)
78 int save = errno;
79 fclose (result->fp);
80 free ((char *) result->fname);
81 free (result);
82 errno = save;
83 return NULL;
86 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
87 n -= 2;
89 result->buf[n] = '\0';
90 result->bufact = n;
91 result->hash_fct = hf;
93 return result;
97 int
98 lr_eof (struct linereader *lr)
100 return lr->bufact = 0;
104 void
105 lr_close (struct linereader *lr)
107 fclose (lr->fp);
108 free (lr->buf);
109 free (lr);
114 lr_next (struct linereader *lr)
116 int n;
118 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
119 if (n < 0)
120 return -1;
122 ++lr->lineno;
124 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
126 #if 0
127 /* XXX Is this correct? */
128 /* An escaped newline character is substituted with a single <SP>. */
129 --n;
130 lr->buf[n - 1] = ' ';
131 #else
132 n -= 2;
133 #endif
136 lr->buf[n] = '\0';
137 lr->bufact = n;
138 lr->idx = 0;
140 return 0;
/* Both objects below are only declared here.  */

/* Defined in error.c: counter that is incremented on every call of
   `error'.  */
extern unsigned int error_message_count;

/* Name of the executing program; the calling program has to define
   this variable and set it.  */
extern char *program_name;
153 struct token *
154 lr_token (struct linereader *lr, const struct charmap_t *charmap,
155 const struct repertoire_t *repertoire)
157 int ch;
159 while (1)
163 ch = lr_getc (lr);
165 if (ch == EOF)
167 lr->token.tok = tok_eof;
168 return &lr->token;
171 if (ch == '\n')
173 lr->token.tok = tok_eol;
174 return &lr->token;
177 while (isspace (ch));
179 if (ch == EOF)
181 lr->token.tok = tok_eof;
182 return &lr->token;
185 if (ch != lr->comment_char)
186 break;
188 /* Is there an newline at the end of the buffer? */
189 if (lr->buf[lr->bufact - 1] != '\n')
191 /* No. Some people want this to mean that only the line in
192 the file not the logical, concatenated line is ignored.
193 Let's try this. */
194 lr->idx = lr->bufact;
195 continue;
198 /* Ignore rest of line. */
199 lr_ignore_rest (lr, 0);
200 lr->token.tok = tok_eol;
201 return &lr->token;
204 /* Match escape sequences. */
205 if (ch == lr->escape_char)
206 return get_toplvl_escape (lr);
208 /* Match ellipsis. */
209 if (ch == '.')
211 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
213 int cnt;
214 for (cnt = 0; cnt < 10; ++cnt)
215 lr_getc (lr);
216 lr->token.tok = tok_ellipsis4_2;
217 return &lr->token;
219 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
221 lr_getc (lr);
222 lr_getc (lr);
223 lr_getc (lr);
224 lr->token.tok = tok_ellipsis4;
225 return &lr->token;
227 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
229 lr_getc (lr);
230 lr_getc (lr);
231 lr->token.tok = tok_ellipsis3;
232 return &lr->token;
234 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
236 int cnt;
237 for (cnt = 0; cnt < 6; ++cnt)
238 lr_getc (lr);
239 lr->token.tok = tok_ellipsis2_2;
240 return &lr->token;
242 if (lr->buf[lr->idx] == '.')
244 lr_getc (lr);
245 lr->token.tok = tok_ellipsis2;
246 return &lr->token;
250 switch (ch)
252 case '<':
253 return get_symname (lr);
255 case '0' ... '9':
256 lr->token.tok = tok_number;
257 lr->token.val.num = ch - '0';
259 while (isdigit (ch = lr_getc (lr)))
261 lr->token.val.num *= 10;
262 lr->token.val.num += ch - '0';
264 if (isalpha (ch))
265 lr_error (lr, _("garbage at end of number"));
266 lr_ungetn (lr, 1);
268 return &lr->token;
270 case ';':
271 lr->token.tok = tok_semicolon;
272 return &lr->token;
274 case ',':
275 lr->token.tok = tok_comma;
276 return &lr->token;
278 case '(':
279 lr->token.tok = tok_open_brace;
280 return &lr->token;
282 case ')':
283 lr->token.tok = tok_close_brace;
284 return &lr->token;
286 case '"':
287 return get_string (lr, charmap, repertoire);
289 case '-':
290 ch = lr_getc (lr);
291 if (ch == '1')
293 lr->token.tok = tok_minus1;
294 return &lr->token;
296 lr_ungetn (lr, 2);
297 break;
300 return get_ident (lr);
304 static struct token *
305 get_toplvl_escape (struct linereader *lr)
307 /* This is supposed to be a numeric value. We return the
308 numerical value and the number of bytes. */
309 size_t start_idx = lr->idx - 1;
310 char *bytes = lr->token.val.charcode.bytes;
311 int nbytes = 0;
312 int ch;
316 unsigned int byte = 0;
317 unsigned int base = 8;
319 ch = lr_getc (lr);
321 if (ch == 'd')
323 base = 10;
324 ch = lr_getc (lr);
326 else if (ch == 'x')
328 base = 16;
329 ch = lr_getc (lr);
332 if ((base == 16 && !isxdigit (ch))
333 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
335 esc_error:
336 lr->token.val.str.startmb = &lr->buf[start_idx];
338 while (ch != EOF && !isspace (ch))
339 ch = lr_getc (lr);
340 lr->token.val.str.lenmb = lr->idx - start_idx;
342 lr->token.tok = tok_error;
343 return &lr->token;
346 if (isdigit (ch))
347 byte = ch - '0';
348 else
349 byte = tolower (ch) - 'a' + 10;
351 ch = lr_getc (lr);
352 if ((base == 16 && !isxdigit (ch))
353 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
354 goto esc_error;
356 byte *= base;
357 if (isdigit (ch))
358 byte += ch - '0';
359 else
360 byte += tolower (ch) - 'a' + 10;
362 ch = lr_getc (lr);
363 if (base != 16 && isdigit (ch))
365 byte *= base;
366 byte += ch - '0';
368 ch = lr_getc (lr);
371 bytes[nbytes++] = byte;
373 while (ch == lr->escape_char && nbytes < 4);
375 if (!isspace (ch))
376 lr_error (lr, _("garbage at end of character code specification"));
378 lr_ungetn (lr, 1);
380 lr->token.tok = tok_charcode;
381 lr->token.val.charcode.nbytes = nbytes;
383 return &lr->token;
/* The following macros append to growing buffers.  They do not take
   the buffer as an argument but use variables of the function they
   are expanded in: `buf', `bufact' (bytes in use) and `bufmax' (bytes
   allocated) for ADDC and ADDS, and `buf2', `buf2act' and `buf2max'
   (both counted in elements, not bytes) for ADDWC.  The capacity
   variables must not be zero.  */

/* Append the single byte CH to buf.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        {                                                                     \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)


/* Append the L bytes starting at S to buf.  If the buffer is too
   small the capacity is first raised to at least L and then doubled,
   which is always enough because bufact never exceeds bufmax.  The
   fill level bufact is only changed by the append itself.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          if (bufmax < _l)                                                    \
            bufmax = _l;                                                      \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)


/* Append the wide character CH to buf2.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));                   \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
430 static struct token *
431 get_symname (struct linereader *lr)
433 /* Symbol in brackets. We must distinguish three kinds:
434 1. reserved words
435 2. ISO 10646 position values
436 3. all other. */
437 char *buf;
438 size_t bufact = 0;
439 size_t bufmax = 56;
440 const struct keyword_t *kw;
441 int ch;
443 buf = (char *) xmalloc (bufmax);
447 ch = lr_getc (lr);
448 if (ch == lr->escape_char)
450 int c2 = lr_getc (lr);
451 ADDC (c2);
453 if (c2 == '\n')
454 ch = '\n';
456 else
457 ADDC (ch);
459 while (ch != '>' && ch != '\n');
461 if (ch == '\n')
462 lr_error (lr, _("unterminated symbolic name"));
464 /* Test for ISO 10646 position value. */
465 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
467 char *cp = buf + 1;
468 while (cp < &buf[bufact - 1] && isxdigit (*cp))
469 ++cp;
471 if (cp == &buf[bufact - 1])
473 /* Yes, it is. */
474 lr->token.tok = tok_ucs4;
475 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
477 return &lr->token;
481 /* It is a symbolic name. Test for reserved words. */
482 kw = lr->hash_fct (buf, bufact - 1);
484 if (kw != NULL && kw->symname_or_ident == 1)
486 lr->token.tok = kw->token;
487 free (buf);
489 else
491 lr->token.tok = tok_bsymbol;
493 buf[bufact] = '\0';
494 buf = xrealloc (buf, bufact + 1);
496 lr->token.val.str.startmb = buf;
497 lr->token.val.str.lenmb = bufact - 1;
500 return &lr->token;
504 static struct token *
505 get_ident (struct linereader *lr)
507 char *buf;
508 size_t bufact;
509 size_t bufmax = 56;
510 const struct keyword_t *kw;
511 int ch;
513 buf = xmalloc (bufmax);
514 bufact = 0;
516 ADDC (lr->buf[lr->idx - 1]);
518 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
519 && ch != '<' && ch != ',')
521 if (ch == lr->escape_char)
523 ch = lr_getc (lr);
524 if (ch == '\n' || ch == EOF)
526 lr_error (lr, _("invalid escape sequence"));
527 break;
530 ADDC (ch);
533 lr_ungetn (lr, 1);
535 kw = lr->hash_fct (buf, bufact);
537 if (kw != NULL && kw->symname_or_ident == 0)
539 lr->token.tok = kw->token;
540 free (buf);
542 else
544 lr->token.tok = tok_ident;
546 buf[bufact] = '\0';
547 buf = xrealloc (buf, bufact + 1);
549 lr->token.val.str.startmb = buf;
550 lr->token.val.str.lenmb = bufact;
553 return &lr->token;
557 static struct token *
558 get_string (struct linereader *lr, const struct charmap_t *charmap,
559 const struct repertoire_t *repertoire)
561 int return_widestr = lr->return_widestr;
562 char *buf;
563 wchar_t *buf2 = NULL;
564 size_t bufact;
565 size_t bufmax = 56;
567 /* We must return two different strings. */
568 buf = xmalloc (bufmax);
569 bufact = 0;
571 /* We know it'll be a string. */
572 lr->token.tok = tok_string;
574 /* If we need not translate the strings (i.e., expand <...> parts)
575 we can run a simple loop. */
576 if (!lr->translate_strings)
578 int ch;
580 buf2 = NULL;
581 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
582 ADDC (ch);
584 /* Catch errors with trailing escape character. */
585 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
586 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
588 lr_error (lr, _("illegal escape sequence at end of string"));
589 --bufact;
591 else if (ch == '\n' || ch == EOF)
592 lr_error (lr, _("unterminated string"));
594 ADDC ('\0');
596 else
598 int illegal_string = 0;
599 size_t buf2act = 0;
600 size_t buf2max = 56 * sizeof (uint32_t);
601 int ch;
602 int warned = 0;
604 /* We have to provide the wide character result as well. */
605 if (return_widestr)
606 buf2 = xmalloc (buf2max);
608 /* Read until the end of the string (or end of the line or file). */
609 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
611 size_t startidx;
612 uint32_t wch;
613 struct charseq *seq;
615 if (ch != '<')
617 /* The standards leave it up to the implementation to decide
618 what to do with character which stand for themself. We
619 could jump through hoops to find out the value relative to
620 the charmap and the repertoire map, but instead we leave
621 it up to the locale definition author to write a better
622 definition. We assume here that every character which
623 stands for itself is encoded using ISO 8859-1. Using the
624 escape character is allowed. */
625 if (ch == lr->escape_char)
627 ch = lr_getc (lr);
628 if (ch == '\n' || ch == EOF)
629 break;
632 if (verbose && !warned)
634 lr_error (lr, _("\
635 non-symbolic character value should not be used"));
636 warned = 1;
639 ADDC (ch);
640 if (return_widestr)
641 ADDWC ((uint32_t) ch);
643 continue;
646 /* Now we have to search for the end of the symbolic name, i.e.,
647 the closing '>'. */
648 startidx = bufact;
649 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
651 if (ch == lr->escape_char)
653 ch = lr_getc (lr);
654 if (ch == '\n' || ch == EOF)
655 break;
657 ADDC (ch);
659 if (ch == '\n' || ch == EOF)
660 /* Not a correct string. */
661 break;
662 if (bufact == startidx)
664 /* <> is no correct name. Ignore it and also signal an
665 error. */
666 illegal_string = 1;
667 continue;
670 /* It might be a Uxxxx symbol. */
671 if (buf[startidx] == 'U'
672 && (bufact - startidx == 5 || bufact - startidx == 9))
674 char *cp = buf + startidx + 1;
675 while (cp < &buf[bufact] && isxdigit (*cp))
676 ++cp;
678 if (cp == &buf[bufact])
680 char utmp[10];
681 const char *symbol = NULL;
683 /* Yes, it is. */
684 ADDC ('\0');
685 wch = strtoul (buf + startidx + 1, NULL, 16);
687 /* Now forget about the name we just added. */
688 bufact = startidx;
690 if (return_widestr)
691 ADDWC (wch);
693 /* See whether the charmap contains the Uxxxxxxxx names. */
694 snprintf (utmp, sizeof (utmp), "U%08X", wch);
695 seq = charmap_find_value (charmap, utmp, 9);
697 if (seq == NULL)
699 /* No, this isn't the case. Now determine from
700 the repertoire the name of the character and
701 find it in the charmap. */
702 if (repertoire != NULL)
703 symbol = repertoire_find_symbol (repertoire, wch);
705 if (symbol == NULL)
706 /* We cannot generate a string since we
707 cannot map from the Unicode number to the
708 character symbol. */
709 illegal_string = 1;
710 else
712 seq = charmap_find_value (charmap, symbol,
713 strlen (symbol));
715 if (seq == NULL)
716 /* Not a known name. */
717 illegal_string = 1;
721 if (seq != NULL)
722 ADDS (seq->bytes, seq->nbytes);
724 continue;
728 /* We now have the symbolic name in buf[startidx] to
729 buf[bufact-1]. Now find out the value for this character
730 in the charmap as well as in the repertoire map (in this
731 order). */
732 seq = charmap_find_value (charmap, &buf[startidx],
733 bufact - startidx);
735 if (seq == NULL)
737 /* This name is not in the charmap. */
738 lr_error (lr, _("symbol `%.*s' not in charmap"),
739 (int) (bufact - startidx), &buf[startidx]);
740 illegal_string = 1;
743 if (return_widestr)
745 /* Now the same for the multibyte representation. */
746 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
747 wch = seq->ucs4;
748 else
750 wch = repertoire_find_value (repertoire, &buf[startidx],
751 bufact - startidx);
752 if (seq != NULL)
753 seq->ucs4 = wch;
756 if (wch == ILLEGAL_CHAR_VALUE)
758 /* This name is not in the repertoire map. */
759 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
760 (int) (bufact - startidx), &buf[startidx]);
761 illegal_string = 1;
763 else
764 ADDWC (wch);
767 /* Now forget about the name we just added. */
768 bufact = startidx;
770 /* And copy the bytes. */
771 if (seq != NULL)
772 ADDS (seq->bytes, seq->nbytes);
775 if (ch == '\n' || ch == EOF)
777 lr_error (lr, _("unterminated string"));
778 illegal_string = 1;
781 if (illegal_string)
783 free (buf);
784 if (buf2 != NULL)
785 free (buf2);
786 lr->token.val.str.startmb = NULL;
787 lr->token.val.str.lenmb = 0;
789 return &lr->token;
792 ADDC ('\0');
794 if (return_widestr)
796 ADDWC (0);
797 lr->token.val.str.startwc = xrealloc (buf2,
798 buf2act * sizeof (uint32_t));
799 lr->token.val.str.lenwc = buf2act;
803 lr->token.val.str.startmb = xrealloc (buf, bufact);
804 lr->token.val.str.lenmb = bufact;
806 return &lr->token;