1 /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
33 #include "linereader.h"
34 #include "localedef.h"
35 #include "stringtrans.h"
38 /* Prototypes for local functions. */
39 static struct token
*get_toplvl_escape (struct linereader
*lr
);
40 static struct token
*get_symname (struct linereader
*lr
);
41 static struct token
*get_ident (struct linereader
*lr
);
42 static struct token
*get_string (struct linereader
*lr
,
43 const struct charmap_t
*charmap
,
44 const struct repertoire_t
*repertoire
);
48 lr_open (const char *fname
, kw_hash_fct_t hf
)
51 struct linereader
*result
;
54 if (fname
== NULL
|| strcmp (fname
, "-") == 0
55 || strcmp (fname
, "/dev/stdin") == 0)
59 fp
= fopen (fname
, "r");
64 result
= (struct linereader
*) xmalloc (sizeof (*result
));
67 result
->fname
= xstrdup (fname
? : "<stdin>");
72 result
->comment_char
= '#';
73 result
->escape_char
= '\\';
74 result
->translate_strings
= 1;
76 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
81 free ((char *) result
->fname
);
87 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
90 result
->buf
[n
] = '\0';
92 result
->hash_fct
= hf
;
/* NOTE(review): the return statement below assigns 0 to lr->bufact
   (`=', not `=='), so the expression always evaluates to 0 and clears
   bufact as a side effect.  If this function is meant to be an
   end-of-file test, a comparison was presumably intended; confirm
   against the callers before changing it.  */
99 lr_eof (struct linereader
*lr
)
101 return lr
->bufact
= 0;
106 lr_close (struct linereader
*lr
)
115 lr_next (struct linereader
*lr
)
119 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
125 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
128 /* XXX Is this correct? */
129 /* An escaped newline character is substituted with a single <SP>. */
131 lr
->buf
[n
- 1] = ' ';
145 /* Defined in error.c. */
146 /* This variable is incremented each time `error' is called. */
147 extern unsigned int error_message_count
;
149 /* The calling program should define program_name and set it to the
150 name of the executing program. */
151 extern char *program_name
;
155 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
156 const struct repertoire_t
*repertoire
)
168 lr
->token
.tok
= tok_eof
;
174 lr
->token
.tok
= tok_eol
;
178 while (isspace (ch
));
182 lr
->token
.tok
= tok_eof
;
186 if (ch
!= lr
->comment_char
)
189 /* Ignore rest of line. */
190 lr_ignore_rest (lr
, 0);
191 lr
->token
.tok
= tok_eol
;
195 /* Match escape sequences. */
196 if (ch
== lr
->escape_char
)
197 return get_toplvl_escape (lr
);
199 /* Match ellipsis. */
202 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
207 lr
->token
.tok
= tok_ellipsis4
;
210 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
214 lr
->token
.tok
= tok_ellipsis3
;
217 if (lr
->buf
[lr
->idx
] == '.')
220 lr
->token
.tok
= tok_ellipsis2
;
228 return get_symname (lr
);
231 lr
->token
.tok
= tok_number
;
232 lr
->token
.val
.num
= ch
- '0';
234 while (isdigit (ch
= lr_getc (lr
)))
236 lr
->token
.val
.num
*= 10;
237 lr
->token
.val
.num
+= ch
- '0';
240 lr_error (lr
, _("garbage at end of number"));
246 lr
->token
.tok
= tok_semicolon
;
250 lr
->token
.tok
= tok_comma
;
254 lr
->token
.tok
= tok_open_brace
;
258 lr
->token
.tok
= tok_close_brace
;
262 return get_string (lr
, charmap
, repertoire
);
268 lr
->token
.tok
= tok_minus1
;
275 return get_ident (lr
);
279 static struct token
*
280 get_toplvl_escape (struct linereader
*lr
)
282 /* This is supposed to be a numeric value. We return the
283 numerical value and the number of bytes. */
284 size_t start_idx
= lr
->idx
- 1;
285 char *bytes
= lr
->token
.val
.charcode
.bytes
;
291 unsigned int byte
= 0;
292 unsigned int base
= 8;
307 if ((base
== 16 && !isxdigit (ch
))
308 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
311 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
313 while (ch
!= EOF
&& !isspace (ch
))
315 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
317 lr
->token
.tok
= tok_error
;
324 byte
= tolower (ch
) - 'a' + 10;
327 if ((base
== 16 && !isxdigit (ch
))
328 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
335 byte
+= tolower (ch
) - 'a' + 10;
338 if (base
!= 16 && isdigit (ch
))
346 bytes
[nbytes
++] = byte
;
348 while (ch
== lr
->escape_char
&& nbytes
< 4);
351 lr_error (lr
, _("garbage at end of character code specification"));
355 lr
->token
.tok
= tok_charcode
;
356 lr
->token
.val
.charcode
.nbytes
= nbytes
;
365 if (bufact == bufmax) \
368 buf = xrealloc (buf, bufmax); \
370 buf[bufact++] = (ch); \
379 if (bufact + _l > bufmax) \
384 buf = xrealloc (buf, bufmax); \
386 memcpy (&buf[bufact], s, _l); \
395 if (buf2act == buf2max) \
398 buf2 = xrealloc (buf2, buf2max * 4); \
400 buf2[buf2act++] = (ch); \
405 static struct token
*
406 get_symname (struct linereader
*lr
)
408 /* Symbol in brackets. We must distinguish three kinds:
410 2. ISO 10646 position values
415 const struct keyword_t
*kw
;
418 buf
= (char *) xmalloc (bufmax
);
423 if (ch
== lr
->escape_char
)
425 int c2
= lr_getc (lr
);
434 while (ch
!= '>' && ch
!= '\n');
437 lr_error (lr
, _("unterminated symbolic name"));
439 /* Test for ISO 10646 position value. */
440 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
443 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
446 if (cp
== &buf
[bufact
- 1])
449 lr
->token
.tok
= tok_ucs4
;
450 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
456 /* It is a symbolic name. Test for reserved words. */
457 kw
= lr
->hash_fct (buf
, bufact
- 1);
459 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
461 lr
->token
.tok
= kw
->token
;
466 lr
->token
.tok
= tok_bsymbol
;
469 buf
= xrealloc (buf
, bufact
+ 1);
471 lr
->token
.val
.str
.startmb
= buf
;
472 lr
->token
.val
.str
.lenmb
= bufact
- 1;
479 static struct token
*
480 get_ident (struct linereader
*lr
)
485 const struct keyword_t
*kw
;
488 buf
= xmalloc (bufmax
);
491 ADDC (lr
->buf
[lr
->idx
- 1]);
493 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
494 && ch
!= '<' && ch
!= ',')
496 if (ch
== lr
->escape_char
)
499 if (ch
== '\n' || ch
== EOF
)
501 lr_error (lr
, _("invalid escape sequence"));
510 kw
= lr
->hash_fct (buf
, bufact
);
512 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
514 lr
->token
.tok
= kw
->token
;
519 lr
->token
.tok
= tok_ident
;
522 buf
= xrealloc (buf
, bufact
+ 1);
524 lr
->token
.val
.str
.startmb
= buf
;
525 lr
->token
.val
.str
.lenmb
= bufact
;
532 static struct token
*
533 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
534 const struct repertoire_t
*repertoire
)
536 int return_widestr
= lr
->return_widestr
;
538 wchar_t *buf2
= NULL
;
542 /* We must return two different strings. */
543 buf
= xmalloc (bufmax
);
546 /* We know it'll be a string. */
547 lr
->token
.tok
= tok_string
;
549 /* If we need not translate the strings (i.e., expand <...> parts)
550 we can run a simple loop. */
551 if (!lr
->translate_strings
)
556 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
559 /* Catch errors with trailing escape character. */
560 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
561 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
563 lr_error (lr
, _("illegal escape sequence at end of string"));
566 else if (ch
== '\n' || ch
== EOF
)
567 lr_error (lr
, _("unterminated string"));
573 int illegal_string
= 0;
575 size_t buf2max
= 56 * sizeof (uint32_t);
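579 /* We have to provide the wide character result as well.
NOTE(review): buf2max is initialised as a byte count
(56 * sizeof (uint32_t)) and handed to xmalloc unchanged, while the
`buf2act == buf2max' growth check seen earlier in this file compares it
against an element index -- confirm the initial buffer cannot be
overrun before the first reallocation. */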
581 buf2
= xmalloc (buf2max
);
583 /* Read until the end of the string (or end of the line or file). */
584 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
592 /* The standards leave it up to the implementation to decide
593 what to do with characters which stand for themselves. We
594 could jump through hoops to find out the value relative to
595 the charmap and the repertoire map, but instead we leave
596 it up to the locale definition author to write a better
597 definition. We assume here that every character which
598 stands for itself is encoded using ISO 8859-1. Using the
599 escape character is allowed. */
600 if (ch
== lr
->escape_char
)
603 if (ch
== '\n' || ch
== EOF
)
607 if (verbose
&& !warned
)
610 non-symbolic character value should not be used"));
616 ADDWC ((uint32_t) ch
);
621 /* Now we have to search for the end of the symbolic name, i.e.,
624 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
626 if (ch
== lr
->escape_char
)
629 if (ch
== '\n' || ch
== EOF
)
634 if (ch
== '\n' || ch
== EOF
)
635 /* Not a correct string. */
637 if (bufact
== startidx
)
639 /* <> is not a valid name. Ignore it and also signal an
645 /* It might be a Uxxxx symbol. */
646 if (buf
[startidx
] == 'U'
647 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
649 char *cp
= buf
+ startidx
+ 1;
650 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
653 if (cp
== &buf
[bufact
])
655 const char *symbol
= NULL
;
659 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
661 /* Now forget about the name we just added. */
667 /* Now determine from the repertoire the name of the
668 character and find it in the charmap. */
669 if (repertoire
!= NULL
)
670 symbol
= repertoire_find_symbol (repertoire
, wch
);
674 /* We cannot generate a string since we cannot map
675 from the Unicode number to the character symbol. */
677 _("character <U%0*X> not in repertoire map"),
678 wch
> 0xffff ? 8 : 4, wch
);
684 seq
= charmap_find_value (charmap
, symbol
,
689 /* Not a known name. */
691 _("symbol `%s' not in charmap"), symbol
);
695 ADDS (seq
->bytes
, seq
->nbytes
);
704 /* We now have the symbolic name in buf[startidx] to
705 buf[bufact-1]. Now find out the value for this
706 character in the repertoire map as well as in the
707 charmap (in this order). */
708 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
710 if (wch
== ILLEGAL_CHAR_VALUE
)
712 /* This name is not in the repertoire map. */
713 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
714 bufact
- startidx
, &buf
[startidx
]);
721 /* Now the same for the multibyte representation. */
722 seq
= charmap_find_value (charmap
, &buf
[startidx
],
727 /* This name is not in the charmap. */
728 lr_error (lr
, _("symbol `%.*s' not in charmap"),
729 bufact
- startidx
, &buf
[startidx
]);
732 /* Now forget about the name we just added. */
737 /* Now forget about the name we just added. */
740 ADDS (seq
->bytes
, seq
->nbytes
);
744 if (ch
== '\n' || ch
== EOF
)
746 lr_error (lr
, _("unterminated string"));
755 lr
->token
.val
.str
.startmb
= NULL
;
756 lr
->token
.val
.str
.lenmb
= 0;
766 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
767 buf2act
* sizeof (uint32_t));
768 lr
->token
.val
.str
.lenwc
= buf2act
;
772 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
773 lr
->token
.val
.str
.lenmb
= bufact
;