* sysdeps/mach/hurd/bits/fcntl.h [__USE_GNU] (O_CLOEXEC): New macro.
[glibc.git] / locale / programs / linereader.c
blob8a04e322769116c2f2165b7ac428323a09d1e6c0
1 /* Copyright (C) 1996-2005, 2006 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 #include <assert.h>
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <string.h>
31 #include "localedef.h"
32 #include "charmap.h"
33 #include "error.h"
34 #include "linereader.h"
35 #include "locfile.h"
/* Prototypes for local functions.  Each of these is entered from
   lr_token after the first character of the token has been consumed
   and returns a pointer to the token stored inside the line reader.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
48 struct linereader *
49 lr_open (const char *fname, kw_hash_fct_t hf)
51 FILE *fp;
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
55 return lr_create (stdin, "<stdin>", hf);
56 else
58 fp = fopen (fname, "rm");
59 if (fp == NULL)
60 return NULL;
61 return lr_create (fp, fname, hf);
65 struct linereader *
66 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
68 struct linereader *result;
69 int n;
71 result = (struct linereader *) xmalloc (sizeof (*result));
73 result->fp = fp;
74 result->fname = xstrdup (fname);
75 result->buf = NULL;
76 result->bufsize = 0;
77 result->lineno = 1;
78 result->idx = 0;
79 result->comment_char = '#';
80 result->escape_char = '\\';
81 result->translate_strings = 1;
82 result->return_widestr = 0;
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
87 int save = errno;
88 fclose (result->fp);
89 free ((char *) result->fname);
90 free (result);
91 errno = save;
92 return NULL;
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
102 return result;
107 lr_eof (struct linereader *lr)
109 return lr->bufact = 0;
113 void
114 lr_ignore_rest (struct linereader *lr, int verbose)
116 if (verbose)
118 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 && lr->buf[lr->idx] != lr->comment_char)
120 if (lr->buf[lr->idx] == '\0')
122 if (lr_next (lr) < 0)
123 return;
125 else
126 ++lr->idx;
128 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 && lr->buf[lr->idx] != lr->comment_char)
130 lr_error (lr, _("trailing garbage at end of line"));
133 /* Ignore continued line. */
134 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135 if (lr_next (lr) < 0)
136 break;
138 lr->idx = lr->bufact;
142 void
143 lr_close (struct linereader *lr)
145 fclose (lr->fp);
146 free (lr->buf);
147 free (lr);
152 lr_next (struct linereader *lr)
154 int n;
156 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157 if (n < 0)
158 return -1;
160 ++lr->lineno;
162 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
164 #if 0
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
167 --n;
168 lr->buf[n - 1] = ' ';
169 #else
170 n -= 2;
171 #endif
174 lr->buf[n] = '\0';
175 lr->bufact = n;
176 lr->idx = 0;
178 return 0;
/* Defined in error.c.  */
/* This variable is incremented each time `error' is called.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
191 struct token *
192 lr_token (struct linereader *lr, const struct charmap_t *charmap,
193 struct localedef_t *locale, const struct repertoire_t *repertoire,
194 int verbose)
196 int ch;
198 while (1)
202 ch = lr_getc (lr);
204 if (ch == EOF)
206 lr->token.tok = tok_eof;
207 return &lr->token;
210 if (ch == '\n')
212 lr->token.tok = tok_eol;
213 return &lr->token;
216 while (isspace (ch));
218 if (ch != lr->comment_char)
219 break;
221 /* Is there an newline at the end of the buffer? */
222 if (lr->buf[lr->bufact - 1] != '\n')
224 /* No. Some people want this to mean that only the line in
225 the file not the logical, concatenated line is ignored.
226 Let's try this. */
227 lr->idx = lr->bufact;
228 continue;
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr, 0);
233 lr->token.tok = tok_eol;
234 return &lr->token;
237 /* Match escape sequences. */
238 if (ch == lr->escape_char)
239 return get_toplvl_escape (lr);
241 /* Match ellipsis. */
242 if (ch == '.')
244 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
246 int cnt;
247 for (cnt = 0; cnt < 10; ++cnt)
248 lr_getc (lr);
249 lr->token.tok = tok_ellipsis4_2;
250 return &lr->token;
252 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
254 lr_getc (lr);
255 lr_getc (lr);
256 lr_getc (lr);
257 lr->token.tok = tok_ellipsis4;
258 return &lr->token;
260 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
262 lr_getc (lr);
263 lr_getc (lr);
264 lr->token.tok = tok_ellipsis3;
265 return &lr->token;
267 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
269 int cnt;
270 for (cnt = 0; cnt < 6; ++cnt)
271 lr_getc (lr);
272 lr->token.tok = tok_ellipsis2_2;
273 return &lr->token;
275 if (lr->buf[lr->idx] == '.')
277 lr_getc (lr);
278 lr->token.tok = tok_ellipsis2;
279 return &lr->token;
283 switch (ch)
285 case '<':
286 return get_symname (lr);
288 case '0' ... '9':
289 lr->token.tok = tok_number;
290 lr->token.val.num = ch - '0';
292 while (isdigit (ch = lr_getc (lr)))
294 lr->token.val.num *= 10;
295 lr->token.val.num += ch - '0';
297 if (isalpha (ch))
298 lr_error (lr, _("garbage at end of number"));
299 lr_ungetn (lr, 1);
301 return &lr->token;
303 case ';':
304 lr->token.tok = tok_semicolon;
305 return &lr->token;
307 case ',':
308 lr->token.tok = tok_comma;
309 return &lr->token;
311 case '(':
312 lr->token.tok = tok_open_brace;
313 return &lr->token;
315 case ')':
316 lr->token.tok = tok_close_brace;
317 return &lr->token;
319 case '"':
320 return get_string (lr, charmap, locale, repertoire, verbose);
322 case '-':
323 ch = lr_getc (lr);
324 if (ch == '1')
326 lr->token.tok = tok_minus1;
327 return &lr->token;
329 lr_ungetn (lr, 2);
330 break;
333 return get_ident (lr);
337 static struct token *
338 get_toplvl_escape (struct linereader *lr)
340 /* This is supposed to be a numeric value. We return the
341 numerical value and the number of bytes. */
342 size_t start_idx = lr->idx - 1;
343 unsigned char *bytes = lr->token.val.charcode.bytes;
344 size_t nbytes = 0;
345 int ch;
349 unsigned int byte = 0;
350 unsigned int base = 8;
352 ch = lr_getc (lr);
354 if (ch == 'd')
356 base = 10;
357 ch = lr_getc (lr);
359 else if (ch == 'x')
361 base = 16;
362 ch = lr_getc (lr);
365 if ((base == 16 && !isxdigit (ch))
366 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
368 esc_error:
369 lr->token.val.str.startmb = &lr->buf[start_idx];
371 while (ch != EOF && !isspace (ch))
372 ch = lr_getc (lr);
373 lr->token.val.str.lenmb = lr->idx - start_idx;
375 lr->token.tok = tok_error;
376 return &lr->token;
379 if (isdigit (ch))
380 byte = ch - '0';
381 else
382 byte = tolower (ch) - 'a' + 10;
384 ch = lr_getc (lr);
385 if ((base == 16 && !isxdigit (ch))
386 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
387 goto esc_error;
389 byte *= base;
390 if (isdigit (ch))
391 byte += ch - '0';
392 else
393 byte += tolower (ch) - 'a' + 10;
395 ch = lr_getc (lr);
396 if (base != 16 && isdigit (ch))
398 byte *= base;
399 byte += ch - '0';
401 ch = lr_getc (lr);
404 bytes[nbytes++] = byte;
406 while (ch == lr->escape_char
407 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
409 if (!isspace (ch))
410 lr_error (lr, _("garbage at end of character code specification"));
412 lr_ungetn (lr, 1);
414 lr->token.tok = tok_charcode;
415 lr->token.val.charcode.nbytes = nbytes;
417 return &lr->token;
/* The three macros below append to growable buffers owned by the
   function using them.  They expect these local variables:
     buf,  bufact,  bufmax   -- byte buffer, bytes used, capacity
     buf2, buf2act, buf2max  -- wide buffer, elements used, capacity
                                counted in elements
   The capacities must be nonzero.  */

/* Append the single byte CH to buf.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        {                                                                     \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)


/* Append the L bytes starting at S to buf.  The capacity is doubled
   until the new bytes fit; the number of bytes in use only changes by
   exactly L.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          do                                                                  \
            bufmax *= 2;                                                      \
          while (bufact + _l > bufmax);                                       \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)


/* Append the wide character CH to buf2.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));                   \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
464 static struct token *
465 get_symname (struct linereader *lr)
467 /* Symbol in brackets. We must distinguish three kinds:
468 1. reserved words
469 2. ISO 10646 position values
470 3. all other. */
471 char *buf;
472 size_t bufact = 0;
473 size_t bufmax = 56;
474 const struct keyword_t *kw;
475 int ch;
477 buf = (char *) xmalloc (bufmax);
481 ch = lr_getc (lr);
482 if (ch == lr->escape_char)
484 int c2 = lr_getc (lr);
485 ADDC (c2);
487 if (c2 == '\n')
488 ch = '\n';
490 else
491 ADDC (ch);
493 while (ch != '>' && ch != '\n');
495 if (ch == '\n')
496 lr_error (lr, _("unterminated symbolic name"));
498 /* Test for ISO 10646 position value. */
499 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
501 char *cp = buf + 1;
502 while (cp < &buf[bufact - 1] && isxdigit (*cp))
503 ++cp;
505 if (cp == &buf[bufact - 1])
507 /* Yes, it is. */
508 lr->token.tok = tok_ucs4;
509 lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
511 return &lr->token;
515 /* It is a symbolic name. Test for reserved words. */
516 kw = lr->hash_fct (buf, bufact - 1);
518 if (kw != NULL && kw->symname_or_ident == 1)
520 lr->token.tok = kw->token;
521 free (buf);
523 else
525 lr->token.tok = tok_bsymbol;
527 buf = xrealloc (buf, bufact + 1);
528 buf[bufact] = '\0';
530 lr->token.val.str.startmb = buf;
531 lr->token.val.str.lenmb = bufact - 1;
534 return &lr->token;
538 static struct token *
539 get_ident (struct linereader *lr)
541 char *buf;
542 size_t bufact;
543 size_t bufmax = 56;
544 const struct keyword_t *kw;
545 int ch;
547 buf = xmalloc (bufmax);
548 bufact = 0;
550 ADDC (lr->buf[lr->idx - 1]);
552 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
553 && ch != '<' && ch != ',' && ch != EOF)
555 if (ch == lr->escape_char)
557 ch = lr_getc (lr);
558 if (ch == '\n' || ch == EOF)
560 lr_error (lr, _("invalid escape sequence"));
561 break;
564 ADDC (ch);
567 lr_ungetc (lr, ch);
569 kw = lr->hash_fct (buf, bufact);
571 if (kw != NULL && kw->symname_or_ident == 0)
573 lr->token.tok = kw->token;
574 free (buf);
576 else
578 lr->token.tok = tok_ident;
580 buf = xrealloc (buf, bufact + 1);
581 buf[bufact] = '\0';
583 lr->token.val.str.startmb = buf;
584 lr->token.val.str.lenmb = bufact;
587 return &lr->token;
591 static struct token *
592 get_string (struct linereader *lr, const struct charmap_t *charmap,
593 struct localedef_t *locale, const struct repertoire_t *repertoire,
594 int verbose)
596 int return_widestr = lr->return_widestr;
597 char *buf;
598 wchar_t *buf2 = NULL;
599 size_t bufact;
600 size_t bufmax = 56;
602 /* We must return two different strings. */
603 buf = xmalloc (bufmax);
604 bufact = 0;
606 /* We know it'll be a string. */
607 lr->token.tok = tok_string;
609 /* If we need not translate the strings (i.e., expand <...> parts)
610 we can run a simple loop. */
611 if (!lr->translate_strings)
613 int ch;
615 buf2 = NULL;
616 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
617 ADDC (ch);
619 /* Catch errors with trailing escape character. */
620 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
621 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
623 lr_error (lr, _("illegal escape sequence at end of string"));
624 --bufact;
626 else if (ch == '\n' || ch == EOF)
627 lr_error (lr, _("unterminated string"));
629 ADDC ('\0');
631 else
633 int illegal_string = 0;
634 size_t buf2act = 0;
635 size_t buf2max = 56 * sizeof (uint32_t);
636 int ch;
637 int warned = 0;
639 /* We have to provide the wide character result as well. */
640 if (return_widestr)
641 buf2 = xmalloc (buf2max);
643 /* Read until the end of the string (or end of the line or file). */
644 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
646 size_t startidx;
647 uint32_t wch;
648 struct charseq *seq;
650 if (ch != '<')
652 /* The standards leave it up to the implementation to decide
653 what to do with character which stand for themself. We
654 could jump through hoops to find out the value relative to
655 the charmap and the repertoire map, but instead we leave
656 it up to the locale definition author to write a better
657 definition. We assume here that every character which
658 stands for itself is encoded using ISO 8859-1. Using the
659 escape character is allowed. */
660 if (ch == lr->escape_char)
662 ch = lr_getc (lr);
663 if (ch == '\n' || ch == EOF)
664 break;
667 if (verbose && !warned)
669 lr_error (lr, _("\
670 non-symbolic character value should not be used"));
671 warned = 1;
674 ADDC (ch);
675 if (return_widestr)
676 ADDWC ((uint32_t) ch);
678 continue;
681 /* Now we have to search for the end of the symbolic name, i.e.,
682 the closing '>'. */
683 startidx = bufact;
684 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
686 if (ch == lr->escape_char)
688 ch = lr_getc (lr);
689 if (ch == '\n' || ch == EOF)
690 break;
692 ADDC (ch);
694 if (ch == '\n' || ch == EOF)
695 /* Not a correct string. */
696 break;
697 if (bufact == startidx)
699 /* <> is no correct name. Ignore it and also signal an
700 error. */
701 illegal_string = 1;
702 continue;
705 /* It might be a Uxxxx symbol. */
706 if (buf[startidx] == 'U'
707 && (bufact - startidx == 5 || bufact - startidx == 9))
709 char *cp = buf + startidx + 1;
710 while (cp < &buf[bufact] && isxdigit (*cp))
711 ++cp;
713 if (cp == &buf[bufact])
715 char utmp[10];
717 /* Yes, it is. */
718 ADDC ('\0');
719 wch = strtoul (buf + startidx + 1, NULL, 16);
721 /* Now forget about the name we just added. */
722 bufact = startidx;
724 if (return_widestr)
725 ADDWC (wch);
727 /* See whether the charmap contains the Uxxxxxxxx names. */
728 snprintf (utmp, sizeof (utmp), "U%08X", wch);
729 seq = charmap_find_value (charmap, utmp, 9);
731 if (seq == NULL)
733 /* No, this isn't the case. Now determine from
734 the repertoire the name of the character and
735 find it in the charmap. */
736 if (repertoire != NULL)
738 const char *symbol;
740 symbol = repertoire_find_symbol (repertoire, wch);
742 if (symbol != NULL)
743 seq = charmap_find_value (charmap, symbol,
744 strlen (symbol));
747 if (seq == NULL)
749 #ifndef NO_TRANSLITERATION
750 /* Transliterate if possible. */
751 if (locale != NULL)
753 uint32_t *translit;
755 if ((locale->avail & CTYPE_LOCALE) == 0)
757 /* Load the CTYPE data now. */
758 int old_needed = locale->needed;
760 locale->needed = 0;
761 locale = load_locale (LC_CTYPE,
762 locale->name,
763 locale->repertoire_name,
764 charmap, locale);
765 locale->needed = old_needed;
768 if ((locale->avail & CTYPE_LOCALE) != 0
769 && ((translit = find_translit (locale,
770 charmap, wch))
771 != NULL))
772 /* The CTYPE data contains a matching
773 transliteration. */
775 int i;
777 for (i = 0; translit[i] != 0; ++i)
779 char utmp[10];
781 snprintf (utmp, sizeof (utmp), "U%08X",
782 translit[i]);
783 seq = charmap_find_value (charmap, utmp,
785 assert (seq != NULL);
786 ADDS (seq->bytes, seq->nbytes);
789 continue;
792 #endif /* NO_TRANSLITERATION */
794 /* Not a known name. */
795 illegal_string = 1;
799 if (seq != NULL)
800 ADDS (seq->bytes, seq->nbytes);
802 continue;
806 /* We now have the symbolic name in buf[startidx] to
807 buf[bufact-1]. Now find out the value for this character
808 in the charmap as well as in the repertoire map (in this
809 order). */
810 seq = charmap_find_value (charmap, &buf[startidx],
811 bufact - startidx);
813 if (seq == NULL)
815 /* This name is not in the charmap. */
816 lr_error (lr, _("symbol `%.*s' not in charmap"),
817 (int) (bufact - startidx), &buf[startidx]);
818 illegal_string = 1;
821 if (return_widestr)
823 /* Now the same for the multibyte representation. */
824 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
825 wch = seq->ucs4;
826 else
828 wch = repertoire_find_value (repertoire, &buf[startidx],
829 bufact - startidx);
830 if (seq != NULL)
831 seq->ucs4 = wch;
834 if (wch == ILLEGAL_CHAR_VALUE)
836 /* This name is not in the repertoire map. */
837 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
838 (int) (bufact - startidx), &buf[startidx]);
839 illegal_string = 1;
841 else
842 ADDWC (wch);
845 /* Now forget about the name we just added. */
846 bufact = startidx;
848 /* And copy the bytes. */
849 if (seq != NULL)
850 ADDS (seq->bytes, seq->nbytes);
853 if (ch == '\n' || ch == EOF)
855 lr_error (lr, _("unterminated string"));
856 illegal_string = 1;
859 if (illegal_string)
861 free (buf);
862 if (buf2 != NULL)
863 free (buf2);
864 lr->token.val.str.startmb = NULL;
865 lr->token.val.str.lenmb = 0;
866 lr->token.val.str.startwc = NULL;
867 lr->token.val.str.lenwc = 0;
869 return &lr->token;
872 ADDC ('\0');
874 if (return_widestr)
876 ADDWC (0);
877 lr->token.val.str.startwc = xrealloc (buf2,
878 buf2act * sizeof (uint32_t));
879 lr->token.val.str.lenwc = buf2act;
883 lr->token.val.str.startmb = xrealloc (buf, bufact);
884 lr->token.val.str.lenmb = bufact;
886 return &lr->token;