1 /* Copyright (C) 1996-2005, 2006 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License version 2 as
7 published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
30 #include "localedef.h"
33 #include "linereader.h"
36 /* Prototypes for local functions. */
37 static struct token
*get_toplvl_escape (struct linereader
*lr
);
38 static struct token
*get_symname (struct linereader
*lr
);
39 static struct token
*get_ident (struct linereader
*lr
);
40 static struct token
*get_string (struct linereader
*lr
,
41 const struct charmap_t
*charmap
,
42 struct localedef_t
*locale
,
43 const struct repertoire_t
*repertoire
,
/* Open the input named FNAME and wrap it in a linereader by calling
   lr_create, handing the keyword hash function HF through unchanged.  */
48 lr_open (const char *fname
, kw_hash_fct_t hf
)
/* A NULL name, "-", or "/dev/stdin" selects standard input; the reader
   is then created on stdin under the name "<stdin>".  */
52 if (fname
== NULL
|| strcmp (fname
, "-") == 0
53 || strcmp (fname
, "/dev/stdin") == 0)
54 return lr_create (stdin
, "<stdin>", hf
);
/* NOTE(review): the `m' in the mode string is presumably the glibc
   extension that requests mmap-based access -- confirm.  */
57 fp
= fopen (fname
, "rm");
/* The reader created on the opened stream keeps FNAME as its name.  */
60 return lr_create (fp
, fname
, hf
);
/* Build a linereader around the already opened stream FP, recording
   FNAME as its name and HF as its keyword hash function.  */
65 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
67 struct linereader
*result
;
70 result
= (struct linereader
*) xmalloc (sizeof (*result
));
/* The reader keeps its own copy of the file name.  */
73 result
->fname
= xstrdup (fname
);
/* Initial settings: `#' is the comment character, a backslash is the
   escape character, strings are translated, and wide strings are not
   returned.  */
78 result
->comment_char
= '#';
79 result
->escape_char
= '\\';
80 result
->translate_strings
= 1;
81 result
->return_widestr
= 0;
/* Read the first newline-delimited line into the reader's buffer.  */
83 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
/* NOTE(review): presumably part of the cleanup done when the first
   read fails; the guarding condition is not visible here -- confirm.  */
88 free ((char *) result
->fname
);
/* Test whether the line just read ends in a backslash immediately
   followed by the newline.  The literal backslash matches the
   escape_char value assigned above.  */
94 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
/* Terminate the buffered text at offset N.  */
97 result
->buf
[n
] = '\0';
99 result
->hash_fct
= hf
;
/* NOTE(review): as written, the return statement below ASSIGNS 0 to
   lr->bufact and yields that 0; it performs no comparison.  If an
   "is the buffer empty" test was intended, `==' may have been meant --
   confirm against the callers before changing it, since the assignment
   also resets bufact as a side effect.  */
106 lr_eof (struct linereader
*lr
)
108 return lr
->bufact
= 0;
/* Discard what remains of the current line of LR, including any
   continuation lines.
   NOTE(review): the use of VERBOSE is not visible in this excerpt;
   presumably it gates the trailing-garbage diagnostic -- confirm.  */
113 lr_ignore_rest (struct linereader
*lr
, int verbose
)
/* Loop while the current character is whitespace other than a newline
   and is not the comment character.  */
117 while (isspace (lr
->buf
[lr
->idx
]) && lr
->buf
[lr
->idx
] != '\n'
118 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
/* A NUL at the current position is checked for, and lr_next is called
   to read another line; a negative result signals failure.  */
119 if (lr
->buf
[lr
->idx
] == '\0')
121 if (lr_next (lr
) < 0)
/* Whatever is left is reported as trailing garbage unless it is a
   newline, the stream is at end of file, or a comment starts here.  */
127 if (lr
->buf
[lr
->idx
] != '\n' && ! feof (lr
->fp
)
128 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
129 lr_error (lr
, _("trailing garbage at end of line"));
132 /* Ignore continued line. */
/* Keep reading while the buffered text is non-empty and does not end
   in a newline.  */
133 while (lr
->bufact
> 0 && lr
->buf
[lr
->bufact
- 1] != '\n')
134 if (lr_next (lr
) < 0)
/* Move the read position to the end of the buffered text.  */
137 lr
->idx
= lr
->bufact
;
142 lr_close (struct linereader
*lr
)
/* Read the next newline-delimited line from LR's stream into LR's
   buffer.  */
151 lr_next (struct linereader
*lr
)
/* getdelim stores into lr->buf and may update lr->bufsize; N receives
   its return value.  */
155 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
/* Test whether the newline that ends this line is directly preceded by
   the reader's current escape character.  */
161 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
164 /* XXX Is this correct? */
165 /* An escaped newline character is substituted with a single <SP>. */
167 lr
->buf
[n
- 1] = ' ';
181 /* Defined in error.c. */
182 /* This variable is incremented each time `error' is called. */
183 extern unsigned int error_message_count
;
185 /* The calling program should define program_name and set it to the
186 name of the executing program. */
187 extern char *program_name
;
191 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
192 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
205 lr
->token
.tok
= tok_eof
;
211 lr
->token
.tok
= tok_eol
;
215 while (isspace (ch
));
217 if (ch
!= lr
->comment_char
)
220 /* Is there a newline at the end of the buffer? */
221 if (lr
->buf
[lr
->bufact
- 1] != '\n')
223 /* No. Some people want this to mean that only the line in
224 the file, not the logical, concatenated line, is ignored.
226 lr
->idx
= lr
->bufact
;
230 /* Ignore rest of line. */
231 lr_ignore_rest (lr
, 0);
232 lr
->token
.tok
= tok_eol
;
236 /* Match escape sequences. */
237 if (ch
== lr
->escape_char
)
238 return get_toplvl_escape (lr
);
240 /* Match ellipsis. */
243 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
246 for (cnt
= 0; cnt
< 10; ++cnt
)
248 lr
->token
.tok
= tok_ellipsis4_2
;
251 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
256 lr
->token
.tok
= tok_ellipsis4
;
259 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
263 lr
->token
.tok
= tok_ellipsis3
;
266 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
269 for (cnt
= 0; cnt
< 6; ++cnt
)
271 lr
->token
.tok
= tok_ellipsis2_2
;
274 if (lr
->buf
[lr
->idx
] == '.')
277 lr
->token
.tok
= tok_ellipsis2
;
285 return get_symname (lr
);
288 lr
->token
.tok
= tok_number
;
289 lr
->token
.val
.num
= ch
- '0';
291 while (isdigit (ch
= lr_getc (lr
)))
293 lr
->token
.val
.num
*= 10;
294 lr
->token
.val
.num
+= ch
- '0';
297 lr_error (lr
, _("garbage at end of number"));
303 lr
->token
.tok
= tok_semicolon
;
307 lr
->token
.tok
= tok_comma
;
311 lr
->token
.tok
= tok_open_brace
;
315 lr
->token
.tok
= tok_close_brace
;
319 return get_string (lr
, charmap
, locale
, repertoire
, verbose
);
325 lr
->token
.tok
= tok_minus1
;
332 return get_ident (lr
);
336 static struct token
*
337 get_toplvl_escape (struct linereader
*lr
)
339 /* This is supposed to be a numeric value. We return the
340 numerical value and the number of bytes. */
341 size_t start_idx
= lr
->idx
- 1;
342 unsigned char *bytes
= lr
->token
.val
.charcode
.bytes
;
348 unsigned int byte
= 0;
349 unsigned int base
= 8;
364 if ((base
== 16 && !isxdigit (ch
))
365 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
368 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
370 while (ch
!= EOF
&& !isspace (ch
))
372 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
374 lr
->token
.tok
= tok_error
;
381 byte
= tolower (ch
) - 'a' + 10;
384 if ((base
== 16 && !isxdigit (ch
))
385 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
392 byte
+= tolower (ch
) - 'a' + 10;
395 if (base
!= 16 && isdigit (ch
))
403 bytes
[nbytes
++] = byte
;
405 while (ch
== lr
->escape_char
406 && nbytes
< (int) sizeof (lr
->token
.val
.charcode
.bytes
));
409 lr_error (lr
, _("garbage at end of character code specification"));
413 lr
->token
.tok
= tok_charcode
;
414 lr
->token
.val
.charcode
.nbytes
= nbytes
;
423 if (bufact == bufmax) \
426 buf = xrealloc (buf, bufmax); \
428 buf[bufact++] = (ch); \
437 if (bufact + _l > bufmax) \
442 buf = xrealloc (buf, bufmax); \
444 memcpy (&buf[bufact], s, _l); \
453 if (buf2act == buf2max) \
456 buf2 = xrealloc (buf2, buf2max * 4); \
458 buf2[buf2act++] = (ch); \
463 static struct token
*
464 get_symname (struct linereader
*lr
)
466 /* Symbol in brackets. We must distinguish three kinds:
468 2. ISO 10646 position values
473 const struct keyword_t
*kw
;
476 buf
= (char *) xmalloc (bufmax
);
481 if (ch
== lr
->escape_char
)
483 int c2
= lr_getc (lr
);
492 while (ch
!= '>' && ch
!= '\n');
495 lr_error (lr
, _("unterminated symbolic name"));
497 /* Test for ISO 10646 position value. */
498 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
501 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
504 if (cp
== &buf
[bufact
- 1])
507 lr
->token
.tok
= tok_ucs4
;
508 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
514 /* It is a symbolic name. Test for reserved words. */
515 kw
= lr
->hash_fct (buf
, bufact
- 1);
517 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
519 lr
->token
.tok
= kw
->token
;
524 lr
->token
.tok
= tok_bsymbol
;
527 buf
= xrealloc (buf
, bufact
+ 1);
529 lr
->token
.val
.str
.startmb
= buf
;
530 lr
->token
.val
.str
.lenmb
= bufact
- 1;
537 static struct token
*
538 get_ident (struct linereader
*lr
)
543 const struct keyword_t
*kw
;
546 buf
= xmalloc (bufmax
);
549 ADDC (lr
->buf
[lr
->idx
- 1]);
551 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
552 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
554 if (ch
== lr
->escape_char
)
557 if (ch
== '\n' || ch
== EOF
)
559 lr_error (lr
, _("invalid escape sequence"));
568 kw
= lr
->hash_fct (buf
, bufact
);
570 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
572 lr
->token
.tok
= kw
->token
;
577 lr
->token
.tok
= tok_ident
;
580 buf
= xrealloc (buf
, bufact
+ 1);
582 lr
->token
.val
.str
.startmb
= buf
;
583 lr
->token
.val
.str
.lenmb
= bufact
;
590 static struct token
*
591 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
592 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
595 int return_widestr
= lr
->return_widestr
;
597 wchar_t *buf2
= NULL
;
601 /* We must return two different strings. */
602 buf
= xmalloc (bufmax
);
605 /* We know it'll be a string. */
606 lr
->token
.tok
= tok_string
;
608 /* If we need not translate the strings (i.e., expand <...> parts)
609 we can run a simple loop. */
610 if (!lr
->translate_strings
)
615 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
618 /* Catch errors with trailing escape character. */
619 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
620 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
622 lr_error (lr
, _("illegal escape sequence at end of string"));
625 else if (ch
== '\n' || ch
== EOF
)
626 lr_error (lr
, _("unterminated string"));
632 int illegal_string
= 0;
634 size_t buf2max
= 56 * sizeof (uint32_t);
638 /* We have to provide the wide character result as well. */
640 buf2
= xmalloc (buf2max
);
642 /* Read until the end of the string (or end of the line or file). */
643 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
651 /* The standards leave it up to the implementation to decide
652 what to do with characters which stand for themselves. We
653 could jump through hoops to find out the value relative to
654 the charmap and the repertoire map, but instead we leave
655 it up to the locale definition author to write a better
656 definition. We assume here that every character which
657 stands for itself is encoded using ISO 8859-1. Using the
658 escape character is allowed. */
659 if (ch
== lr
->escape_char
)
662 if (ch
== '\n' || ch
== EOF
)
666 if (verbose
&& !warned
)
669 non-symbolic character value should not be used"));
675 ADDWC ((uint32_t) ch
);
680 /* Now we have to search for the end of the symbolic name, i.e.,
683 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
685 if (ch
== lr
->escape_char
)
688 if (ch
== '\n' || ch
== EOF
)
693 if (ch
== '\n' || ch
== EOF
)
694 /* Not a correct string. */
696 if (bufact
== startidx
)
698 /* <> is not a valid name. Ignore it and also signal an
704 /* It might be a Uxxxx symbol. */
705 if (buf
[startidx
] == 'U'
706 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
708 char *cp
= buf
+ startidx
+ 1;
709 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
712 if (cp
== &buf
[bufact
])
718 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
720 /* Now forget about the name we just added. */
726 /* See whether the charmap contains the Uxxxxxxxx names. */
727 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
728 seq
= charmap_find_value (charmap
, utmp
, 9);
732 /* No, this isn't the case. Now determine from
733 the repertoire the name of the character and
734 find it in the charmap. */
735 if (repertoire
!= NULL
)
739 symbol
= repertoire_find_symbol (repertoire
, wch
);
742 seq
= charmap_find_value (charmap
, symbol
,
748 #ifndef NO_TRANSLITERATION
749 /* Transliterate if possible. */
754 if ((locale
->avail
& CTYPE_LOCALE
) == 0)
756 /* Load the CTYPE data now. */
757 int old_needed
= locale
->needed
;
760 locale
= load_locale (LC_CTYPE
,
762 locale
->repertoire_name
,
764 locale
->needed
= old_needed
;
767 if ((locale
->avail
& CTYPE_LOCALE
) != 0
768 && ((translit
= find_translit (locale
,
771 /* The CTYPE data contains a matching
776 for (i
= 0; translit
[i
] != 0; ++i
)
780 snprintf (utmp
, sizeof (utmp
), "U%08X",
782 seq
= charmap_find_value (charmap
, utmp
,
784 assert (seq
!= NULL
);
785 ADDS (seq
->bytes
, seq
->nbytes
);
791 #endif /* NO_TRANSLITERATION */
793 /* Not a known name. */
799 ADDS (seq
->bytes
, seq
->nbytes
);
805 /* We now have the symbolic name in buf[startidx] to
806 buf[bufact-1]. Now find out the value for this character
807 in the charmap as well as in the repertoire map (in this
809 seq
= charmap_find_value (charmap
, &buf
[startidx
],
814 /* This name is not in the charmap. */
815 lr_error (lr
, _("symbol `%.*s' not in charmap"),
816 (int) (bufact
- startidx
), &buf
[startidx
]);
822 /* Now the same for the wide character representation. */
823 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
827 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
833 if (wch
== ILLEGAL_CHAR_VALUE
)
835 /* This name is not in the repertoire map. */
836 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
837 (int) (bufact
- startidx
), &buf
[startidx
]);
844 /* Now forget about the name we just added. */
847 /* And copy the bytes. */
849 ADDS (seq
->bytes
, seq
->nbytes
);
852 if (ch
== '\n' || ch
== EOF
)
854 lr_error (lr
, _("unterminated string"));
863 lr
->token
.val
.str
.startmb
= NULL
;
864 lr
->token
.val
.str
.lenmb
= 0;
865 lr
->token
.val
.str
.startwc
= NULL
;
866 lr
->token
.val
.str
.lenwc
= 0;
876 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
877 buf2act
* sizeof (uint32_t));
878 lr
->token
.val
.str
.lenwc
= buf2act
;
882 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
883 lr
->token
.val
.str
.lenmb
= bufact
;