1 /* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
33 #include "linereader.h"
34 #include "localedef.h"
37 /* Prototypes for local functions. */
/* Each helper takes the line reader state and returns a pointer to a
   struct token.  lr_token hands over to them once it has looked at the
   first character of a token.  */
/* Numeric character code introduced by the escape character.  */
38 static struct token
*get_toplvl_escape (struct linereader
*lr
);
/* Symbol in brackets.  */
39 static struct token
*get_symname (struct linereader
*lr
);
/* Identifier or keyword.  */
40 static struct token
*get_ident (struct linereader
*lr
);
/* String; the only helper that also receives the charmap and the
   repertoire map.  */
41 static struct token
*get_string (struct linereader
*lr
,
42 const struct charmap_t
*charmap
,
43 const struct repertoire_t
*repertoire
);
/* Create a line reader for the file FNAME and read its first line.
   HF is stored as the keyword lookup function (result->hash_fct).
   NOTE(review): the return type, the braces and several statements of
   this function are not visible here; the comments below describe only
   the lines that are.  */
47 lr_open (const char *fname
, kw_hash_fct_t hf
)
50 struct linereader
*result
;
/* A null name, a single dash and /dev/stdin are tested for together;
   presumably all three select standard input -- TODO confirm, the
   branch body is not visible.  */
53 if (fname
== NULL
|| strcmp (fname
, "-") == 0
54 || strcmp (fname
, "/dev/stdin") == 0)
/* Any other name is opened for reading.  */
58 fp
= fopen (fname
, "r");
63 result
= (struct linereader
*) xmalloc (sizeof (*result
));
/* Keep a private copy of the name; a null FNAME is recorded as
   <stdin>.  */
66 result
->fname
= xstrdup (fname
? : "<stdin>");
/* Defaults: hash sign as comment character, backslash as escape
   character, string translation switched on.  */
71 result
->comment_char
= '#';
72 result
->escape_char
= '\\';
73 result
->translate_strings
= 1;
/* Read the first line, up to and including the newline.  */
75 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
/* The name copy is released again here; presumably this is the path
   taken when the first read fails -- TODO confirm, the guarding
   condition is not visible.  */
80 free ((char *) result
->fname
);
/* Test for a backslash directly in front of the final newline.  The
   backslash is hard-coded here, whereas lr_next compares against
   lr->escape_char.  The guarded statement is not visible.  */
86 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
/* Terminate the buffer at index n.  */
89 result
->buf
[n
] = '\0';
91 result
->hash_fct
= hf
;
/* NOTE(review): the one visible statement ASSIGNS zero to lr->bufact
   and returns the assigned value, so the result is always 0 and the
   buffer fill count is reset as a side effect.  It is not a comparison.
   Confirm whether an equality test (==) was intended before relying on
   this function as an end-of-file predicate.  */
98 lr_eof (struct linereader
*lr
)
100 return lr
->bufact
= 0;
105 lr_close (struct linereader
*lr
)
/* Read the next line from lr->fp into lr->buf.  getdelim grows the
   buffer as needed and keeps lr->bufsize up to date.  The handling of
   the getdelim result other than the continuation test below is not
   visible here.  */
114 lr_next (struct linereader
*lr
)
118 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
/* Continuation line: the character just before the final newline is
   the current escape character.  */
124 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
127 /* XXX Is this correct? */
128 /* An escaped newline character is substituted with a single <SP>. */
/* Only the newline at index n - 1 is overwritten by this statement.  */
130 lr
->buf
[n
- 1] = ' ';
144 /* Defined in error.c. */
145 /* This variable is incremented each time `error' is called. */
146 extern unsigned int error_message_count
;
148 /* The calling program should define program_name and set it to the
149 name of the executing program. */
150 extern char *program_name
;
/* Produce the next token from LR.  The token kind is written to
   lr->token.tok; numbers additionally fill lr->token.val.num.  Escape
   sequences, bracketed symbols, strings and identifiers are delegated
   to get_toplvl_escape, get_symname, get_string and get_ident.  In the
   visible code CHARMAP and REPERTOIRE are only passed on to get_string.
   NOTE(review): the switch/case skeleton, braces and several statements
   are not visible here, so the conditions that select each assignment
   below cannot be read off this listing.  */
154 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
155 const struct repertoire_t
*repertoire
)
167 lr
->token
.tok
= tok_eof
;
173 lr
->token
.tok
= tok_eol
;
/* Presumably the tail of a do ... while loop that skips white space;
   the loop head is not visible -- TODO confirm.  */
177 while (isspace (ch
));
181 lr
->token
.tok
= tok_eof
;
185 if (ch
!= lr
->comment_char
)
188 /* Is there a newline at the end of the buffer? */
189 if (lr
->buf
[lr
->bufact
- 1] != '\n')
191 /* No. Some people want this to mean that only the line in
192 the file not the logical, concatenated line is ignored.
194 lr
->idx
= lr
->bufact
;
198 /* Ignore rest of line. */
199 lr_ignore_rest (lr
, 0);
200 lr
->token
.tok
= tok_eol
;
204 /* Match escape sequences. */
205 if (ch
== lr
->escape_char
)
206 return get_toplvl_escape (lr
);
208 /* Match ellipsis. */
/* The literals compared below are each one dot shorter than the token
   names suggest (three dots for tok_ellipsis4, two for tok_ellipsis3,
   one for tok_ellipsis2).  Presumably the first dot has already been
   consumed into ch and lr->idx points just past it -- TODO confirm.
   Longer spellings are tested before their prefixes.  */
211 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
214 for (cnt
= 0; cnt
< 10; ++cnt
)
216 lr
->token
.tok
= tok_ellipsis4_2
;
219 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
224 lr
->token
.tok
= tok_ellipsis4
;
227 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
231 lr
->token
.tok
= tok_ellipsis3
;
234 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
237 for (cnt
= 0; cnt
< 6; ++cnt
)
239 lr
->token
.tok
= tok_ellipsis2_2
;
242 if (lr
->buf
[lr
->idx
] == '.')
245 lr
->token
.tok
= tok_ellipsis2
;
253 return get_symname (lr
);
/* Decimal number: start with the value of the digit in ch, then fold in
   every further digit returned by lr_getc.  */
256 lr
->token
.tok
= tok_number
;
257 lr
->token
.val
.num
= ch
- '0';
259 while (isdigit (ch
= lr_getc (lr
)))
261 lr
->token
.val
.num
*= 10;
262 lr
->token
.val
.num
+= ch
- '0';
265 lr_error (lr
, _("garbage at end of number"));
/* Single-character punctuation tokens.  */
271 lr
->token
.tok
= tok_semicolon
;
275 lr
->token
.tok
= tok_comma
;
279 lr
->token
.tok
= tok_open_brace
;
283 lr
->token
.tok
= tok_close_brace
;
287 return get_string (lr
, charmap
, repertoire
);
293 lr
->token
.tok
= tok_minus1
;
300 return get_ident (lr
);
/* Parse a numeric character code that starts with the escape character.
   On success the token kind is tok_charcode, the collected bytes are in
   lr->token.val.charcode.bytes and their count is in
   lr->token.val.charcode.nbytes.  When a character is not a valid digit
   for the current base, the offending text is recorded in
   lr->token.val.str (startmb/lenmb) and the token kind is tok_error.
   NOTE(review): the loop head, the base selection and the braces are
   not visible here; the comments describe only the visible lines.  */
304 static struct token
*
305 get_toplvl_escape (struct linereader
*lr
)
307 /* This is supposed to be a numeric value. We return the
308 numerical value and the number of bytes. */
/* start_idx is one position before the current read index; presumably
   the position of the escape character itself -- TODO confirm.  */
309 size_t start_idx
= lr
->idx
- 1;
/* The bytes are written straight into the token.  */
310 char *bytes
= lr
->token
.val
.charcode
.bytes
;
316 unsigned int byte
= 0;
/* The base starts out as 8; the code that changes it is not visible,
   but the tests below also handle base 16.  */
317 unsigned int base
= 8;
/* First digit: must be a hex digit for base 16, otherwise a decimal
   digit below the base.  */
332 if ((base
== 16 && !isxdigit (ch
))
333 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
/* Error path: remember where the bad text starts, skip to the next
   white space (or end of input) and record its length.  */
336 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
338 while (ch
!= EOF
&& !isspace (ch
))
340 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
342 lr
->token
.tok
= tok_error
;
/* Letter digit: a maps to 10, b to 11, and so on, case-insensitively.  */
349 byte
= tolower (ch
) - 'a' + 10;
/* Second digit: same validity test as for the first one.  */
352 if ((base
== 16 && !isxdigit (ch
))
353 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
360 byte
+= tolower (ch
) - 'a' + 10;
/* Only the non-hexadecimal bases look at a further decimal digit.  */
363 if (base
!= 16 && isdigit (ch
))
371 bytes
[nbytes
++] = byte
;
/* Keep going while another escape character follows directly and fewer
   than four bytes have been stored.  */
373 while (ch
== lr
->escape_char
&& nbytes
< 4);
376 lr_error (lr
, _("garbage at end of character code specification"));
380 lr
->token
.tok
= tok_charcode
;
381 lr
->token
.val
.charcode
.nbytes
= nbytes
;
390 if (bufact == bufmax) \
393 buf = xrealloc (buf, bufmax); \
395 buf[bufact++] = (ch); \
404 if (bufact + _l > bufmax) \
409 buf = xrealloc (buf, bufmax); \
411 memcpy (&buf[bufact], s, _l); \
420 if (buf2act == buf2max) \
423 buf2 = xrealloc (buf2, buf2max * 4); \
425 buf2[buf2act++] = (ch); \
/* Read a symbol in brackets and classify it.  The visible code produces
   one of three results: tok_ucs4 with the value in lr->token.val.ucs4
   for a U-prefixed hexadecimal name, the keyword token when the lookup
   function returns an entry with symname_or_ident equal to 1, and
   tok_bsymbol otherwise (presumably; the else branches are not
   visible).  For names the text is handed over in lr->token.val.str.
   NOTE(review): the opening comment of the body is cut off in this
   listing and several statements are missing.  */
430 static struct token
*
431 get_symname (struct linereader
*lr
)
433 /* Symbol in brackets. We must distinguish three kinds:
435 2. ISO 10646 position values
440 const struct keyword_t
*kw
;
443 buf
= (char *) xmalloc (bufmax
);
448 if (ch
== lr
->escape_char
)
450 int c2
= lr_getc (lr
);
459 while (ch
!= '>' && ch
!= '\n');
462 lr_error (lr
, _("unterminated symbolic name"));
464 /* Test for ISO 10646 position value. */
/* Accepted shapes are the letter U followed by hex digits, with bufact
   equal to 6 or 10.  Since the scan stops at buf[bufact - 1] and the
   length stored below is bufact - 1, the last buffer position is
   presumably a terminator, which leaves 4 or 8 digits -- TODO
   confirm.  */
465 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
468 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
/* Every character up to the last position was a hex digit.  */
471 if (cp
== &buf
[bufact
- 1])
474 lr
->token
.tok
= tok_ucs4
;
475 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
481 /* It is a symbolic name. Test for reserved words. */
/* The lookup gets the name with length bufact - 1.  */
482 kw
= lr
->hash_fct (buf
, bufact
- 1);
/* Only keywords flagged with symname_or_ident equal to 1 count here.  */
484 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
486 lr
->token
.tok
= kw
->token
;
491 lr
->token
.tok
= tok_bsymbol
;
/* Resize the buffer to bufact + 1 bytes and hand it over to the
   token.  */
494 buf
= xrealloc (buf
, bufact
+ 1);
496 lr
->token
.val
.str
.startmb
= buf
;
497 lr
->token
.val
.str
.lenmb
= bufact
- 1;
/* Read an identifier or keyword.
   The character at lr->buf[lr->idx - 1] is added to the buffer first;
   presumably this is the character the caller has already consumed --
   TODO confirm.  Further characters are taken from lr_getc until white
   space, a double quote, a semicolon, a less-than sign or a comma is
   seen.  The escape character gets special treatment inside the loop
   (details not visible); an escape followed by a newline or end of
   input is reported as an invalid escape sequence.
   The collected text is looked up through lr->hash_fct with length
   bufact.  An entry with symname_or_ident equal to 0 supplies the token
   kind; the other visible assignment sets tok_ident (presumably the
   else branch).  The buffer is then resized to bufact + 1 bytes and
   handed over in lr->token.val.str with length bufact.  */
504 static struct token
*
505 get_ident (struct linereader
*lr
)
510 const struct keyword_t
*kw
;
513 buf
= xmalloc (bufmax
);
516 ADDC (lr
->buf
[lr
->idx
- 1]);
518 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
519 && ch
!= '<' && ch
!= ',')
521 if (ch
== lr
->escape_char
)
524 if (ch
== '\n' || ch
== EOF
)
526 lr_error (lr
, _("invalid escape sequence"));
535 kw
= lr
->hash_fct (buf
, bufact
);
537 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
539 lr
->token
.tok
= kw
->token
;
544 lr
->token
.tok
= tok_ident
;
547 buf
= xrealloc (buf
, bufact
+ 1);
549 lr
->token
.val
.str
.startmb
= buf
;
550 lr
->token
.val
.str
.lenmb
= bufact
;
/* Read a string token.  The token kind is set to tok_string before any
   character is read.
   When lr->translate_strings is zero, characters are collected in a
   simple loop up to the closing quote, a newline or end of input, and a
   trailing unpaired escape character or a missing closing quote is
   reported through lr_error.
   Otherwise symbolic names inside the string are resolved: a U-prefixed
   hexadecimal name is converted with strtoul and looked up in CHARMAP
   under its eight-digit spelling, falling back to the symbol that
   REPERTOIRE gives for that value; any other name is looked up in
   CHARMAP directly and, for the wide value, in REPERTOIRE.  Names that
   cannot be found are reported through lr_error.
   Results: the multibyte text goes to lr->token.val.str.startmb/lenmb
   and the wide text to startwc/lenwc.  One visible path stores a null
   startmb with lenmb 0; presumably this is the illegal-string case --
   TODO confirm, the condition is not visible.
   NOTE(review): buf2max is initialised to 56 * sizeof (uint32_t) and
   passed unchanged to xmalloc, i.e. it is a byte count there, while the
   wide-character append macro fragment earlier in this file tests
   buf2act == buf2max and stores to buf2[buf2act], i.e. treats it as an
   element count.  Confirm that the two uses agree; if they do not, the
   wide buffer can be written past its allocation before it is first
   grown.
   NOTE(review): braces, declarations and several statements are not
   visible here, and some comments in the body are cut off.  */
557 static struct token
*
558 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
559 const struct repertoire_t
*repertoire
)
561 int return_widestr
= lr
->return_widestr
;
563 wchar_t *buf2
= NULL
;
567 /* We must return two different strings. */
568 buf
= xmalloc (bufmax
);
571 /* We know it'll be a string. */
572 lr
->token
.tok
= tok_string
;
574 /* If we need not translate the strings (i.e., expand <...> parts)
575 we can run a simple loop. */
576 if (!lr
->translate_strings
)
581 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
584 /* Catch errors with trailing escape character. */
585 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
586 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
588 lr_error (lr
, _("illegal escape sequence at end of string"));
591 else if (ch
== '\n' || ch
== EOF
)
592 lr_error (lr
, _("unterminated string"));
598 int illegal_string
= 0;
600 size_t buf2max
= 56 * sizeof (uint32_t);
604 /* We have to provide the wide character result as well. */
606 buf2
= xmalloc (buf2max
);
608 /* Read until the end of the string (or end of the line or file). */
609 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
617 /* The standards leave it up to the implementation to decide
618 what to do with characters which stand for themselves. We
619 could jump through hoops to find out the value relative to
620 the charmap and the repertoire map, but instead we leave
621 it up to the locale definition author to write a better
622 definition. We assume here that every character which
623 stands for itself is encoded using ISO 8859-1. Using the
624 escape character is allowed. */
625 if (ch
== lr
->escape_char
)
628 if (ch
== '\n' || ch
== EOF
)
632 if (verbose
&& !warned
)
635 non-symbolic character value should not be used"));
641 ADDWC ((uint32_t) ch
);
646 /* Now we have to search for the end of the symbolic name, i.e.,
649 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
651 if (ch
== lr
->escape_char
)
654 if (ch
== '\n' || ch
== EOF
)
659 if (ch
== '\n' || ch
== EOF
)
660 /* Not a correct string. */
662 if (bufact
== startidx
)
664 /* <> is no correct name. Ignore it and also signal an
670 /* It might be a Uxxxx symbol. */
671 if (buf
[startidx
] == 'U'
672 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
674 char *cp
= buf
+ startidx
+ 1;
675 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
678 if (cp
== &buf
[bufact
])
681 const char *symbol
= NULL
;
685 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
687 /* Now forget about the name we just added. */
693 /* See whether the charmap contains the Uxxxxxxxx names. */
694 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
695 seq
= charmap_find_value (charmap
, utmp
, 9);
699 /* No, this isn't the case. Now determine from
700 the repertoire the name of the character and
701 find it in the charmap. */
702 if (repertoire
!= NULL
)
703 symbol
= repertoire_find_symbol (repertoire
, wch
);
706 /* We cannot generate a string since we
707 cannot map from the Unicode number to the
712 seq
= charmap_find_value (charmap
, symbol
,
716 /* Not a known name. */
722 ADDS (seq
->bytes
, seq
->nbytes
);
728 /* We now have the symbolic name in buf[startidx] to
729 buf[bufact-1]. Now find out the value for this character
730 in the charmap as well as in the repertoire map (in this
732 seq
= charmap_find_value (charmap
, &buf
[startidx
],
737 /* This name is not in the charmap. */
738 lr_error (lr
, _("symbol `%.*s' not in charmap"),
739 (int) (bufact
- startidx
), &buf
[startidx
]);
745 /* Now the same for the multibyte representation. */
746 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
750 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
756 if (wch
== ILLEGAL_CHAR_VALUE
)
758 /* This name is not in the repertoire map. */
759 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
760 (int) (bufact
- startidx
), &buf
[startidx
]);
767 /* Now forget about the name we just added. */
770 /* And copy the bytes. */
772 ADDS (seq
->bytes
, seq
->nbytes
);
775 if (ch
== '\n' || ch
== EOF
)
777 lr_error (lr
, _("unterminated string"));
786 lr
->token
.val
.str
.startmb
= NULL
;
787 lr
->token
.val
.str
.lenmb
= 0;
797 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
798 buf2act
* sizeof (uint32_t));
799 lr
->token
.val
.str
.lenwc
= buf2act
;
803 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
804 lr
->token
.val
.str
.lenmb
= bufact
;