1 /* Copyright (C) 1996-2001, 2002, 2003, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
32 #include "localedef.h"
35 #include "linereader.h"
38 /* Prototypes for local functions. */
39 static struct token
*get_toplvl_escape (struct linereader
*lr
);
40 static struct token
*get_symname (struct linereader
*lr
);
41 static struct token
*get_ident (struct linereader
*lr
);
42 static struct token
*get_string (struct linereader
*lr
,
43 const struct charmap_t
*charmap
,
44 struct localedef_t
*locale
,
45 const struct repertoire_t
*repertoire
,
50 lr_open (const char *fname
, kw_hash_fct_t hf
)
54 if (fname
== NULL
|| strcmp (fname
, "-") == 0
55 || strcmp (fname
, "/dev/stdin") == 0)
56 return lr_create (stdin
, "<stdin>", hf
);
59 fp
= fopen (fname
, "rm");
62 return lr_create (fp
, fname
, hf
);
67 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
69 struct linereader
*result
;
72 result
= (struct linereader
*) xmalloc (sizeof (*result
));
75 result
->fname
= xstrdup (fname
);
80 result
->comment_char
= '#';
81 result
->escape_char
= '\\';
82 result
->translate_strings
= 1;
83 result
->return_widestr
= 0;
85 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
90 free ((char *) result
->fname
);
96 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
99 result
->buf
[n
] = '\0';
101 result
->hash_fct
= hf
;
108 lr_eof (struct linereader
*lr
)
110 return lr
->bufact
= 0;
115 lr_ignore_rest (struct linereader
*lr
, int verbose
)
119 while (isspace (lr
->buf
[lr
->idx
]) && lr
->buf
[lr
->idx
] != '\n'
120 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
121 if (lr
->buf
[lr
->idx
] == '\0')
123 if (lr_next (lr
) < 0)
129 if (lr
->buf
[lr
->idx
] != '\n' && ! feof (lr
->fp
)
130 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
131 lr_error (lr
, _("trailing garbage at end of line"));
134 /* Ignore continued line. */
135 while (lr
->bufact
> 0 && lr
->buf
[lr
->bufact
- 1] != '\n')
136 if (lr_next (lr
) < 0)
139 lr
->idx
= lr
->bufact
;
144 lr_close (struct linereader
*lr
)
153 lr_next (struct linereader
*lr
)
157 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
163 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
166 /* XXX Is this correct? */
167 /* An escaped newline character is substituted with a single <SP>. */
169 lr
->buf
[n
- 1] = ' ';
183 /* Defined in error.c. */
184 /* This variable is incremented each time `error' is called. */
185 extern unsigned int error_message_count
;
187 /* The calling program should define program_name and set it to the
188 name of the executing program. */
189 extern char *program_name
;
193 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
194 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
207 lr
->token
.tok
= tok_eof
;
213 lr
->token
.tok
= tok_eol
;
217 while (isspace (ch
));
221 lr
->token
.tok
= tok_eof
;
225 if (ch
!= lr
->comment_char
)
228 /* Is there a newline at the end of the buffer? */
229 if (lr
->buf
[lr
->bufact
- 1] != '\n')
231 /* No. Some people want this to mean that only the line in
232 the file, not the logical, concatenated line, is ignored.
234 lr
->idx
= lr
->bufact
;
238 /* Ignore rest of line. */
239 lr_ignore_rest (lr
, 0);
240 lr
->token
.tok
= tok_eol
;
244 /* Match escape sequences. */
245 if (ch
== lr
->escape_char
)
246 return get_toplvl_escape (lr
);
248 /* Match ellipsis. */
251 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
254 for (cnt
= 0; cnt
< 10; ++cnt
)
256 lr
->token
.tok
= tok_ellipsis4_2
;
259 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
264 lr
->token
.tok
= tok_ellipsis4
;
267 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
271 lr
->token
.tok
= tok_ellipsis3
;
274 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
277 for (cnt
= 0; cnt
< 6; ++cnt
)
279 lr
->token
.tok
= tok_ellipsis2_2
;
282 if (lr
->buf
[lr
->idx
] == '.')
285 lr
->token
.tok
= tok_ellipsis2
;
293 return get_symname (lr
);
296 lr
->token
.tok
= tok_number
;
297 lr
->token
.val
.num
= ch
- '0';
299 while (isdigit (ch
= lr_getc (lr
)))
301 lr
->token
.val
.num
*= 10;
302 lr
->token
.val
.num
+= ch
- '0';
305 lr_error (lr
, _("garbage at end of number"));
311 lr
->token
.tok
= tok_semicolon
;
315 lr
->token
.tok
= tok_comma
;
319 lr
->token
.tok
= tok_open_brace
;
323 lr
->token
.tok
= tok_close_brace
;
327 return get_string (lr
, charmap
, locale
, repertoire
, verbose
);
333 lr
->token
.tok
= tok_minus1
;
340 return get_ident (lr
);
344 static struct token
*
345 get_toplvl_escape (struct linereader
*lr
)
347 /* This is supposed to be a numeric value. We return the
348 numerical value and the number of bytes. */
349 size_t start_idx
= lr
->idx
- 1;
350 char *bytes
= lr
->token
.val
.charcode
.bytes
;
356 unsigned int byte
= 0;
357 unsigned int base
= 8;
372 if ((base
== 16 && !isxdigit (ch
))
373 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
376 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
378 while (ch
!= EOF
&& !isspace (ch
))
380 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
382 lr
->token
.tok
= tok_error
;
389 byte
= tolower (ch
) - 'a' + 10;
392 if ((base
== 16 && !isxdigit (ch
))
393 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
400 byte
+= tolower (ch
) - 'a' + 10;
403 if (base
!= 16 && isdigit (ch
))
411 bytes
[nbytes
++] = byte
;
413 while (ch
== lr
->escape_char
414 && nbytes
< (int) sizeof (lr
->token
.val
.charcode
.bytes
));
417 lr_error (lr
, _("garbage at end of character code specification"));
421 lr
->token
.tok
= tok_charcode
;
422 lr
->token
.val
.charcode
.nbytes
= nbytes
;
431 if (bufact == bufmax) \
434 buf = xrealloc (buf, bufmax); \
436 buf[bufact++] = (ch); \
445 if (bufact + _l > bufmax) \
450 buf = xrealloc (buf, bufmax); \
452 memcpy (&buf[bufact], s, _l); \
461 if (buf2act == buf2max) \
464 buf2 = xrealloc (buf2, buf2max * 4); \
466 buf2[buf2act++] = (ch); \
471 static struct token
*
472 get_symname (struct linereader
*lr
)
474 /* Symbol in brackets. We must distinguish three kinds:
476 2. ISO 10646 position values
481 const struct keyword_t
*kw
;
484 buf
= (char *) xmalloc (bufmax
);
489 if (ch
== lr
->escape_char
)
491 int c2
= lr_getc (lr
);
500 while (ch
!= '>' && ch
!= '\n');
503 lr_error (lr
, _("unterminated symbolic name"));
505 /* Test for ISO 10646 position value. */
506 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
509 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
512 if (cp
== &buf
[bufact
- 1])
515 lr
->token
.tok
= tok_ucs4
;
516 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
522 /* It is a symbolic name. Test for reserved words. */
523 kw
= lr
->hash_fct (buf
, bufact
- 1);
525 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
527 lr
->token
.tok
= kw
->token
;
532 lr
->token
.tok
= tok_bsymbol
;
535 buf
= xrealloc (buf
, bufact
+ 1);
537 lr
->token
.val
.str
.startmb
= buf
;
538 lr
->token
.val
.str
.lenmb
= bufact
- 1;
545 static struct token
*
546 get_ident (struct linereader
*lr
)
551 const struct keyword_t
*kw
;
554 buf
= xmalloc (bufmax
);
557 ADDC (lr
->buf
[lr
->idx
- 1]);
559 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
560 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
562 if (ch
== lr
->escape_char
)
565 if (ch
== '\n' || ch
== EOF
)
567 lr_error (lr
, _("invalid escape sequence"));
576 kw
= lr
->hash_fct (buf
, bufact
);
578 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
580 lr
->token
.tok
= kw
->token
;
585 lr
->token
.tok
= tok_ident
;
588 buf
= xrealloc (buf
, bufact
+ 1);
590 lr
->token
.val
.str
.startmb
= buf
;
591 lr
->token
.val
.str
.lenmb
= bufact
;
598 static struct token
*
599 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
600 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
603 int return_widestr
= lr
->return_widestr
;
605 wchar_t *buf2
= NULL
;
609 /* We must return two different strings. */
610 buf
= xmalloc (bufmax
);
613 /* We know it'll be a string. */
614 lr
->token
.tok
= tok_string
;
616 /* If we need not translate the strings (i.e., expand <...> parts)
617 we can run a simple loop. */
618 if (!lr
->translate_strings
)
623 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
626 /* Catch errors with trailing escape character. */
627 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
628 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
630 lr_error (lr
, _("illegal escape sequence at end of string"));
633 else if (ch
== '\n' || ch
== EOF
)
634 lr_error (lr
, _("unterminated string"));
640 int illegal_string
= 0;
642 size_t buf2max
= 56 * sizeof (uint32_t);
646 /* We have to provide the wide character result as well. */
648 buf2
= xmalloc (buf2max
);
650 /* Read until the end of the string (or end of the line or file). */
651 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
659 /* The standards leave it up to the implementation to decide
660 what to do with characters which stand for themselves. We
661 could jump through hoops to find out the value relative to
662 the charmap and the repertoire map, but instead we leave
663 it up to the locale definition author to write a better
664 definition. We assume here that every character which
665 stands for itself is encoded using ISO 8859-1. Using the
666 escape character is allowed. */
667 if (ch
== lr
->escape_char
)
670 if (ch
== '\n' || ch
== EOF
)
674 if (verbose
&& !warned
)
677 non-symbolic character value should not be used"));
683 ADDWC ((uint32_t) ch
);
688 /* Now we have to search for the end of the symbolic name, i.e.,
691 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
693 if (ch
== lr
->escape_char
)
696 if (ch
== '\n' || ch
== EOF
)
701 if (ch
== '\n' || ch
== EOF
)
702 /* Not a correct string. */
704 if (bufact
== startidx
)
706 /* <> is not a correct name. Ignore it and also signal an
712 /* It might be a Uxxxx symbol. */
713 if (buf
[startidx
] == 'U'
714 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
716 char *cp
= buf
+ startidx
+ 1;
717 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
720 if (cp
== &buf
[bufact
])
726 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
728 /* Now forget about the name we just added. */
734 /* See whether the charmap contains the Uxxxxxxxx names. */
735 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
736 seq
= charmap_find_value (charmap
, utmp
, 9);
740 /* No, this isn't the case. Now determine from
741 the repertoire the name of the character and
742 find it in the charmap. */
743 if (repertoire
!= NULL
)
747 symbol
= repertoire_find_symbol (repertoire
, wch
);
750 seq
= charmap_find_value (charmap
, symbol
,
756 #ifndef NO_TRANSLITERATION
757 /* Transliterate if possible. */
762 if ((locale
->avail
& CTYPE_LOCALE
) == 0)
764 /* Load the CTYPE data now. */
765 int old_needed
= locale
->needed
;
768 locale
= load_locale (LC_CTYPE
,
770 locale
->repertoire_name
,
772 locale
->needed
= old_needed
;
775 if ((locale
->avail
& CTYPE_LOCALE
) != 0
776 && ((translit
= find_translit (locale
,
779 /* The CTYPE data contains a matching
784 for (i
= 0; translit
[i
] != 0; ++i
)
788 snprintf (utmp
, sizeof (utmp
), "U%08X",
790 seq
= charmap_find_value (charmap
, utmp
,
792 assert (seq
!= NULL
);
793 ADDS (seq
->bytes
, seq
->nbytes
);
799 #endif /* NO_TRANSLITERATION */
801 /* Not a known name. */
807 ADDS (seq
->bytes
, seq
->nbytes
);
813 /* We now have the symbolic name in buf[startidx] to
814 buf[bufact-1]. Now find out the value for this character
815 in the charmap as well as in the repertoire map (in this
817 seq
= charmap_find_value (charmap
, &buf
[startidx
],
822 /* This name is not in the charmap. */
823 lr_error (lr
, _("symbol `%.*s' not in charmap"),
824 (int) (bufact
- startidx
), &buf
[startidx
]);
830 /* Now the same for the multibyte representation. */
831 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
835 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
841 if (wch
== ILLEGAL_CHAR_VALUE
)
843 /* This name is not in the repertoire map. */
844 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
845 (int) (bufact
- startidx
), &buf
[startidx
]);
852 /* Now forget about the name we just added. */
855 /* And copy the bytes. */
857 ADDS (seq
->bytes
, seq
->nbytes
);
860 if (ch
== '\n' || ch
== EOF
)
862 lr_error (lr
, _("unterminated string"));
871 lr
->token
.val
.str
.startmb
= NULL
;
872 lr
->token
.val
.str
.lenmb
= 0;
873 lr
->token
.val
.str
.startwc
= NULL
;
874 lr
->token
.val
.str
.lenwc
= 0;
884 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
885 buf2act
* sizeof (uint32_t));
886 lr
->token
.val
.str
.lenwc
= buf2act
;
890 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
891 lr
->token
.val
.str
.lenmb
= bufact
;