1 /* Copyright (C) 1996-2001, 2002 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.  */
32 #include "localedef.h"
35 #include "linereader.h"
38 /* Prototypes for local functions. */
/* Reads a numeric escape sequence met at top level and produces a
   character-code token, or an error token when the digits are invalid.  */
39 static struct token
*get_toplvl_escape (struct linereader
*lr
);
/* Reads a symbolic name in brackets and classifies it as a keyword, a
   UCS4 value or a plain symbol.  */
40 static struct token
*get_symname (struct linereader
*lr
);
/* Reads a bare identifier and turns it into a keyword token when the
   hash function of the reader recognizes it.  */
41 static struct token
*get_ident (struct linereader
*lr
);
/* Reads a quoted string.  NOTE(review): the end of this prototype is not
   visible in this listing; the parameter list continues past the last
   comma shown here.  */
42 static struct token
*get_string (struct linereader
*lr
,
43 const struct charmap_t
*charmap
,
44 struct localedef_t
*locale
,
45 const struct repertoire_t
*repertoire
,
/* Create a line reader for the file named FNAME, using HF as the keyword
   hash function.  A null FNAME, a single dash and /dev/stdin all select
   standard input, which is then registered under the name <stdin>.
   The result is whatever lr_create returns.
   NOTE(review): what happens when fopen fails is not visible in this
   listing -- confirm against the complete source.  */
50 lr_open (const char *fname
, kw_hash_fct_t hf
)
54 if (fname
== NULL
|| strcmp (fname
, "-") == 0
55 || strcmp (fname
, "/dev/stdin") == 0)
56 return lr_create (stdin
, "<stdin>", hf
);
/* Every other name is opened as an ordinary file for reading.  */
59 fp
= fopen (fname
, "r");
62 return lr_create (fp
, fname
, hf
);
/* Allocate and initialize a line reader for the stream FP.  FNAME is
   duplicated into the new object and HF is stored as its keyword hash
   function.  The first line of the stream is read right away with
   getdelim.  */
67 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
69 struct linereader
*result
;
72 result
= (struct linereader
*) xmalloc (sizeof (*result
));
/* The reader keeps its own copy of the name.  */
75 result
->fname
= xstrdup (fname
);
/* Defaults: hash mark starts a comment, backslash is the escape
   character, and strings are translated.  */
80 result
->comment_char
= '#';
81 result
->escape_char
= '\\';
82 result
->translate_strings
= 1;
/* Fetch the first line, newline included, into the growable buffer.  */
84 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
/* NOTE(review): the name copy is released here, presumably on a failure
   path whose condition is not visible in this listing.  */
89 free ((char *) result
->fname
);
/* Detect a line that ends in backslash-newline.  The literal backslash
   matches the escape_char default assigned above.  */
95 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
/* Terminate the buffer contents after N bytes.  */
98 result
->buf
[n
] = '\0';
100 result
->hash_fct
= hf
;
/* NOTE(review): the return expression below is an assignment, not a
   comparison.  It stores 0 in lr->bufact and therefore always evaluates
   to 0.  Confirm whether resetting the buffer fill level is the intended
   effect or whether an equality test was meant.  */
107 lr_eof (struct linereader
*lr
)
109 return lr
->bufact
= 0;
/* Dispose of the line reader LR.  NOTE(review): only the signature is
   visible in this listing; the body is not shown, so the exact cleanup
   steps cannot be stated from here.  */
114 lr_close (struct linereader
*lr
)
/* Read the next line of the input stream of LR into its buffer.  When
   the line ends with the escape character of the reader followed by a
   newline, the newline is overwritten with a space.  */
123 lr_next (struct linereader
*lr
)
/* getdelim grows lr->buf as needed and keeps the trailing newline.  */
127 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
/* Unlike lr_create, this test uses the configurable escape character
   rather than a literal backslash.  */
133 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
136 /* XXX Is this correct? */
137 /* An escaped newline character is substituted with a single <SP>. */
139 lr
->buf
[n
- 1] = ' ';
/* External state shared with the error reporting code.  NOTE(review):
   neither variable is referenced in the lines visible in this listing.  */
153 /* Defined in error.c. */
154 /* This variable is incremented each time `error' is called. */
155 extern unsigned int error_message_count
;
157 /* The calling program should define program_name and set it to the
158 name of the executing program. */
159 extern char *program_name
;
/* Read the next token from LR and record its kind in lr->token.tok.
   Simple tokens (end of file, end of line, ellipsis forms, numbers,
   semicolon, comma, braces, minus one) are recognized here; escape
   sequences, bracketed symbols, strings and identifiers are handed to
   get_toplvl_escape, get_symname, get_string and get_ident.  CHARMAP,
   LOCALE and REPERTOIRE are only passed on to get_string.
   NOTE(review): the end of the parameter list is not visible in this
   listing; the body also uses a name verbose that is passed to
   get_string.  */
163 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
164 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
177 lr
->token
.tok
= tok_eof
;
183 lr
->token
.tok
= tok_eol
;
/* Tail of the loop that keeps consuming while the character read is
   white space.  */
187 while (isspace (ch
));
191 lr
->token
.tok
= tok_eof
;
/* Anything but the comment character leaves the comment handling.  */
195 if (ch
!= lr
->comment_char
)
198 /* Is there a newline at the end of the buffer? */
199 if (lr
->buf
[lr
->bufact
- 1] != '\n')
201 /* No. Some people want this to mean that only the line in
202 the file not the logical, concatenated line is ignored.
204 lr
->idx
= lr
->bufact
;
208 /* Ignore rest of line. */
209 lr_ignore_rest (lr
, 0);
210 lr
->token
.tok
= tok_eol
;
214 /* Match escape sequences. */
215 if (ch
== lr
->escape_char
)
216 return get_toplvl_escape (lr
);
218 /* Match ellipsis.  Longer spellings are tested before the shorter
   ones that are a prefix of them.  */
221 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
224 for (cnt
= 0; cnt
< 10; ++cnt
)
226 lr
->token
.tok
= tok_ellipsis4_2
;
229 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
234 lr
->token
.tok
= tok_ellipsis4
;
237 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
241 lr
->token
.tok
= tok_ellipsis3
;
244 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
247 for (cnt
= 0; cnt
< 6; ++cnt
)
249 lr
->token
.tok
= tok_ellipsis2_2
;
252 if (lr
->buf
[lr
->idx
] == '.')
255 lr
->token
.tok
= tok_ellipsis2
;
263 return get_symname (lr
);
/* Decimal number: start with the digit already read and fold in every
   further digit.  */
266 lr
->token
.tok
= tok_number
;
267 lr
->token
.val
.num
= ch
- '0';
269 while (isdigit (ch
= lr_getc (lr
)))
271 lr
->token
.val
.num
*= 10;
272 lr
->token
.val
.num
+= ch
- '0';
275 lr_error (lr
, _("garbage at end of number"));
281 lr
->token
.tok
= tok_semicolon
;
285 lr
->token
.tok
= tok_comma
;
289 lr
->token
.tok
= tok_open_brace
;
293 lr
->token
.tok
= tok_close_brace
;
297 return get_string (lr
, charmap
, locale
, repertoire
, verbose
);
303 lr
->token
.tok
= tok_minus1
;
/* Everything else is read as an identifier.  */
310 return get_ident (lr
);
/* Parse a run of numeric escape sequences into the byte array of the
   character-code token of LR.  The radix starts out as 8 and a radix of
   16 is handled as well; how the radix is selected is not visible in
   this listing.  On success the token kind is tok_charcode and the byte
   count is stored next to the bytes.  On an invalid first digit the
   token becomes tok_error and its string value refers to the raw text
   in the line buffer.  */
314 static struct token
*
315 get_toplvl_escape (struct linereader
*lr
)
317 /* This is supposed to be a numeric value. We return the
318 numerical value and the number of bytes. */
/* Position of the escape character itself, one before the read index.  */
319 size_t start_idx
= lr
->idx
- 1;
320 char *bytes
= lr
->token
.val
.charcode
.bytes
;
326 unsigned int byte
= 0;
327 unsigned int base
= 8;
/* A digit is valid if it is a hexadecimal digit for radix 16, or lies
   in the range starting at zero and below the radix otherwise.  */
342 if ((base
== 16 && !isxdigit (ch
))
343 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
346 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
/* Stop at white space or end of input; the length below is measured
   from the escape character to the read index.  */
348 while (ch
!= EOF
&& !isspace (ch
))
350 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
352 lr
->token
.tok
= tok_error
;
/* Letter digits count from ten upwards, regardless of case.  */
359 byte
= tolower (ch
) - 'a' + 10;
/* The same validity test is applied to the following digit.  */
362 if ((base
== 16 && !isxdigit (ch
))
363 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
370 byte
+= tolower (ch
) - 'a' + 10;
/* NOTE(review): presumably a further digit is accepted for the
   non-hexadecimal radixes -- the guarded statements are not visible.  */
373 if (base
!= 16 && isdigit (ch
))
381 bytes
[nbytes
++] = byte
;
/* Continue while another escape follows and the byte array of the
   token still has room.  */
383 while (ch
== lr
->escape_char
384 && nbytes
< (int) sizeof (lr
->token
.val
.charcode
.bytes
));
387 lr_error (lr
, _("garbage at end of character code specification"));
391 lr
->token
.tok
= tok_charcode
;
392 lr
->token
.val
.charcode
.nbytes
= nbytes
;
/* Fragments of three buffer-append helper macros; their define lines are
   not visible in this listing.  NOTE(review): presumably these are the
   bodies of ADDC, ADDS and ADDWC, which are used further down.
   The first stores one character in buf, reallocating when bufact has
   reached bufmax.  The second copies _l bytes from s to the end of buf,
   reallocating when they would not fit.  The third stores one element in
   buf2 and reallocates with a byte size of buf2max times 4.  */
401 if (bufact == bufmax) \
404 buf = xrealloc (buf, bufmax); \
406 buf[bufact++] = (ch); \
415 if (bufact + _l > bufmax) \
420 buf = xrealloc (buf, bufmax); \
422 memcpy (&buf[bufact], s, _l); \
431 if (buf2act == buf2max) \
434 buf2 = xrealloc (buf2, buf2max * 4); \
436 buf2[buf2act++] = (ch); \
/* Read a symbolic name in brackets from LR, up to the closing bracket or
   the end of the line, and classify it.  A name made of U followed by
   hexadecimal digits becomes a tok_ucs4 token holding the parsed value.
   A name that the hash function of the reader reports as a keyword with
   symname_or_ident equal to 1 becomes that keyword token.  Everything
   else becomes tok_bsymbol, with the collected text attached as the
   string value of the token.  */
441 static struct token
*
442 get_symname (struct linereader
*lr
)
444 /* Symbol in brackets. We must distinguish three kinds:
446 2. ISO 10646 position values
451 const struct keyword_t
*kw
;
454 buf
= (char *) xmalloc (bufmax
);
459 if (ch
== lr
->escape_char
)
461 int c2
= lr_getc (lr
);
470 while (ch
!= '>' && ch
!= '\n');
473 lr_error (lr
, _("unterminated symbolic name"));
475 /* Test for ISO 10646 position value. */
/* The fill level counts one byte more than the name itself: every use
   below works with bufact - 1.  NOTE(review): presumably that extra byte
   is a terminating NUL appended by code not visible here.  A fill level
   of 6 or 10 thus means U plus four or eight further characters.  */
476 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
479 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
/* The scan reached the last byte, so all characters were hex digits.  */
482 if (cp
== &buf
[bufact
- 1])
485 lr
->token
.tok
= tok_ucs4
;
486 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
492 /* It is a symbolic name. Test for reserved words. */
493 kw
= lr
->hash_fct (buf
, bufact
- 1);
495 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
497 lr
->token
.tok
= kw
->token
;
502 lr
->token
.tok
= tok_bsymbol
;
/* Resize the buffer to the space actually needed and let the token
   refer to it.  */
505 buf
= xrealloc (buf
, bufact
+ 1);
507 lr
->token
.val
.str
.startmb
= buf
;
508 lr
->token
.val
.str
.lenmb
= bufact
- 1;
/* Read an identifier from LR.  Collection stops at white space, a double
   quote, a semicolon, an opening angle bracket, a comma or end of input.
   If the hash function of the reader reports the text as a keyword with
   symname_or_ident equal to 0 the token becomes that keyword; otherwise
   it becomes tok_ident with the collected text attached as the string
   value of the token.  */
515 static struct token
*
516 get_ident (struct linereader
*lr
)
521 const struct keyword_t
*kw
;
524 buf
= xmalloc (bufmax
);
/* Start with the character just before the read index of the reader.  */
527 ADDC (lr
->buf
[lr
->idx
- 1]);
529 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
530 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
532 if (ch
== lr
->escape_char
)
/* An escape character must not be followed by the end of the line or
   of the input.  */
535 if (ch
== '\n' || ch
== EOF
)
537 lr_error (lr
, _("invalid escape sequence"));
/* Look the collected text up as a reserved word.  */
546 kw
= lr
->hash_fct (buf
, bufact
);
548 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
550 lr
->token
.tok
= kw
->token
;
555 lr
->token
.tok
= tok_ident
;
/* Resize the buffer to the space actually needed and let the token
   refer to it.  */
558 buf
= xrealloc (buf
, bufact
+ 1);
560 lr
->token
.val
.str
.startmb
= buf
;
561 lr
->token
.val
.str
.lenmb
= bufact
;
/* Read a quoted string from LR; the token kind is set to tok_string up
   front.  When the reader does not translate strings, characters are
   simply collected up to the closing quote, the end of the line or the
   end of the input, and a trailing unescaped escape character or a
   missing closing quote is reported through lr_error.  Otherwise a wide
   character buffer is filled alongside the multibyte one and names
   written between angle brackets are expanded.  Names of the form U plus
   four or eight hexadecimal digits are parsed with strtoul and looked up
   in CHARMAP under their eight-digit spelling, then through REPERTOIRE,
   and, unless NO_TRANSLITERATION is defined, through the transliteration
   data of LOCALE.  Other names are looked up with charmap_find_value and
   repertoire_find_value, and unknown names are reported through
   lr_error.  At the end the string fields of the token are either
   cleared or set to the resized buffers.
   NOTE(review): the end of the parameter list, and the conditions that
   select between clearing and filling the result, are not visible in
   this listing.  */
568 static struct token
*
569 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
570 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
573 int return_widestr
= lr
->return_widestr
;
575 wchar_t *buf2
= NULL
;
579 /* We must return two different strings. */
580 buf
= xmalloc (bufmax
);
583 /* We know it'll be a string. */
584 lr
->token
.tok
= tok_string
;
586 /* If we need not translate the strings (i.e., expand <...> parts)
587 we can run a simple loop. */
588 if (!lr
->translate_strings
)
593 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
596 /* Catch errors with trailing escape character. */
/* The last collected byte is an escape character that is not itself
   preceded by another escape character.  */
597 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
598 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
600 lr_error (lr
, _("illegal escape sequence at end of string"));
603 else if (ch
== '\n' || ch
== EOF
)
604 lr_error (lr
, _("unterminated string"));
610 int illegal_string
= 0;
612 size_t buf2max
= 56 * sizeof (uint32_t);
616 /* We have to provide the wide character result as well. */
618 buf2
= xmalloc (buf2max
);
620 /* Read until the end of the string (or end of the line or file). */
621 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
629 /* The standards leave it up to the implementation to decide
630 what to do with characters which stand for themselves. We
631 could jump through hoops to find out the value relative to
632 the charmap and the repertoire map, but instead we leave
633 it up to the locale definition author to write a better
634 definition. We assume here that every character which
635 stands for itself is encoded using ISO 8859-1. Using the
636 escape character is allowed. */
637 if (ch
== lr
->escape_char
)
640 if (ch
== '\n' || ch
== EOF
)
644 if (verbose
&& !warned
)
647 non-symbolic character value should not be used"));
653 ADDWC ((uint32_t) ch
);
658 /* Now we have to search for the end of the symbolic name, i.e.,
661 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
663 if (ch
== lr
->escape_char
)
666 if (ch
== '\n' || ch
== EOF
)
671 if (ch
== '\n' || ch
== EOF
)
672 /* Not a correct string. */
674 if (bufact
== startidx
)
676 /* <> is no correct name. Ignore it and also signal an
682 /* It might be a Uxxxx symbol. */
683 if (buf
[startidx
] == 'U'
684 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
686 char *cp
= buf
+ startidx
+ 1;
687 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
690 if (cp
== &buf
[bufact
])
696 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
698 /* Now forget about the name we just added. */
704 /* See whether the charmap contains the Uxxxxxxxx names. */
705 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
706 seq
= charmap_find_value (charmap
, utmp
, 9);
710 /* No, this isn't the case. Now determine from
711 the repertoire the name of the character and
712 find it in the charmap. */
713 if (repertoire
!= NULL
)
717 symbol
= repertoire_find_symbol (repertoire
, wch
);
720 seq
= charmap_find_value (charmap
, symbol
,
726 #ifndef NO_TRANSLITERATION
727 /* Transliterate if possible. */
732 if ((locale
->avail
& CTYPE_LOCALE
) == 0)
734 /* Load the CTYPE data now. */
735 int old_needed
= locale
->needed
;
738 locale
= load_locale (LC_CTYPE
,
740 locale
->repertoire_name
,
742 locale
->needed
= old_needed
;
745 if ((locale
->avail
& CTYPE_LOCALE
) != 0
746 && ((translit
= find_translit (locale
,
749 /* The CTYPE data contains a matching
754 for (i
= 0; translit
[i
] != 0; ++i
)
758 snprintf (utmp
, sizeof (utmp
), "U%08X",
760 seq
= charmap_find_value (charmap
, utmp
,
762 assert (seq
!= NULL
);
763 ADDS (seq
->bytes
, seq
->nbytes
);
769 #endif /* NO_TRANSLITERATION */
771 /* Not a known name. */
777 ADDS (seq
->bytes
, seq
->nbytes
);
783 /* We now have the symbolic name in buf[startidx] to
784 buf[bufact-1]. Now find out the value for this character
785 in the charmap as well as in the repertoire map (in this
787 seq
= charmap_find_value (charmap
, &buf
[startidx
],
792 /* This name is not in the charmap. */
793 lr_error (lr
, _("symbol `%.*s' not in charmap"),
794 (int) (bufact
- startidx
), &buf
[startidx
]);
800 /* Now the same for the multibyte representation. */
801 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
805 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
811 if (wch
== ILLEGAL_CHAR_VALUE
)
813 /* This name is not in the repertoire map. */
814 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
815 (int) (bufact
- startidx
), &buf
[startidx
]);
822 /* Now forget about the name we just added. */
825 /* And copy the bytes. */
827 ADDS (seq
->bytes
, seq
->nbytes
);
830 if (ch
== '\n' || ch
== EOF
)
832 lr_error (lr
, _("unterminated string"));
841 lr
->token
.val
.str
.startmb
= NULL
;
842 lr
->token
.val
.str
.lenmb
= 0;
843 lr
->token
.val
.str
.startwc
= NULL
;
844 lr
->token
.val
.str
.lenwc
= 0;
854 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
855 buf2act
* sizeof (uint32_t));
856 lr
->token
.val
.str
.lenwc
= buf2act
;
860 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
861 lr
->token
.val
.str
.lenmb
= bufact
;