1 /* Copyright (C) 1996,1997,1998,1999,2000,2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
33 #include "linereader.h"
/* Allocation helpers shared by the whole program; they are defined
   outside this file.  */
extern void *xmalloc (size_t __n);
extern void *xrealloc (void *__p, size_t __n);
extern char *xstrdup (const char *__str);
/* Prototypes for local functions.  Each one returns a pointer to a
   `struct token' and takes the line reader LR it operates on;
   get_string additionally takes the charmap, the repertoire map and
   the VERBOSE flag, matching its definition further down.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
				 const struct charmap_t *charmap,
				 const struct repertoire_t *repertoire,
				 int verbose);
52 lr_open (const char *fname
, kw_hash_fct_t hf
)
56 if (fname
== NULL
|| strcmp (fname
, "-") == 0
57 || strcmp (fname
, "/dev/stdin") == 0)
58 return lr_create (stdin
, "<stdin>", hf
);
61 fp
= fopen (fname
, "r");
64 return lr_create (fp
, fname
, hf
);
69 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
71 struct linereader
*result
;
74 result
= (struct linereader
*) xmalloc (sizeof (*result
));
77 result
->fname
= xstrdup (fname
);
82 result
->comment_char
= '#';
83 result
->escape_char
= '\\';
84 result
->translate_strings
= 1;
86 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
91 free ((char *) result
->fname
);
97 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
100 result
->buf
[n
] = '\0';
102 result
->hash_fct
= hf
;
109 lr_eof (struct linereader
*lr
)
111 return lr
->bufact
= 0;
116 lr_close (struct linereader
*lr
)
125 lr_next (struct linereader
*lr
)
129 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
135 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
138 /* XXX Is this correct? */
139 /* An escaped newline character is substituted with a single <SP>. */
141 lr
->buf
[n
- 1] = ' ';
/* Defined in error.c.  This variable is incremented each time `error'
   is called.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
165 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
166 const struct repertoire_t
*repertoire
, int verbose
)
178 lr
->token
.tok
= tok_eof
;
184 lr
->token
.tok
= tok_eol
;
188 while (isspace (ch
));
192 lr
->token
.tok
= tok_eof
;
196 if (ch
!= lr
->comment_char
)
199 /* Is there a newline at the end of the buffer? */
200 if (lr
->buf
[lr
->bufact
- 1] != '\n')
202 /* No. Some people want this to mean that only the line in
203 the file, not the logical, concatenated line, is ignored.
205 lr
->idx
= lr
->bufact
;
209 /* Ignore rest of line. */
210 lr_ignore_rest (lr
, 0);
211 lr
->token
.tok
= tok_eol
;
215 /* Match escape sequences. */
216 if (ch
== lr
->escape_char
)
217 return get_toplvl_escape (lr
);
219 /* Match ellipsis. */
222 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
225 for (cnt
= 0; cnt
< 10; ++cnt
)
227 lr
->token
.tok
= tok_ellipsis4_2
;
230 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
235 lr
->token
.tok
= tok_ellipsis4
;
238 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
242 lr
->token
.tok
= tok_ellipsis3
;
245 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
248 for (cnt
= 0; cnt
< 6; ++cnt
)
250 lr
->token
.tok
= tok_ellipsis2_2
;
253 if (lr
->buf
[lr
->idx
] == '.')
256 lr
->token
.tok
= tok_ellipsis2
;
264 return get_symname (lr
);
267 lr
->token
.tok
= tok_number
;
268 lr
->token
.val
.num
= ch
- '0';
270 while (isdigit (ch
= lr_getc (lr
)))
272 lr
->token
.val
.num
*= 10;
273 lr
->token
.val
.num
+= ch
- '0';
276 lr_error (lr
, _("garbage at end of number"));
282 lr
->token
.tok
= tok_semicolon
;
286 lr
->token
.tok
= tok_comma
;
290 lr
->token
.tok
= tok_open_brace
;
294 lr
->token
.tok
= tok_close_brace
;
298 return get_string (lr
, charmap
, repertoire
, verbose
);
304 lr
->token
.tok
= tok_minus1
;
311 return get_ident (lr
);
315 static struct token
*
316 get_toplvl_escape (struct linereader
*lr
)
318 /* This is supposed to be a numeric value. We return the
319 numerical value and the number of bytes. */
320 size_t start_idx
= lr
->idx
- 1;
321 char *bytes
= lr
->token
.val
.charcode
.bytes
;
327 unsigned int byte
= 0;
328 unsigned int base
= 8;
343 if ((base
== 16 && !isxdigit (ch
))
344 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
347 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
349 while (ch
!= EOF
&& !isspace (ch
))
351 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
353 lr
->token
.tok
= tok_error
;
360 byte
= tolower (ch
) - 'a' + 10;
363 if ((base
== 16 && !isxdigit (ch
))
364 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
371 byte
+= tolower (ch
) - 'a' + 10;
374 if (base
!= 16 && isdigit (ch
))
382 bytes
[nbytes
++] = byte
;
384 while (ch
== lr
->escape_char
385 && nbytes
< sizeof (lr
->token
.val
.charcode
.bytes
));
388 lr_error (lr
, _("garbage at end of character code specification"));
392 lr
->token
.tok
= tok_charcode
;
393 lr
->token
.val
.charcode
.nbytes
= nbytes
;
402 if (bufact == bufmax) \
405 buf = xrealloc (buf, bufmax); \
407 buf[bufact++] = (ch); \
416 if (bufact + _l > bufmax) \
421 buf = xrealloc (buf, bufmax); \
423 memcpy (&buf[bufact], s, _l); \
432 if (buf2act == buf2max) \
435 buf2 = xrealloc (buf2, buf2max * 4); \
437 buf2[buf2act++] = (ch); \
442 static struct token
*
443 get_symname (struct linereader
*lr
)
445 /* Symbol in brackets. We must distinguish three kinds:
447 2. ISO 10646 position values
452 const struct keyword_t
*kw
;
455 buf
= (char *) xmalloc (bufmax
);
460 if (ch
== lr
->escape_char
)
462 int c2
= lr_getc (lr
);
471 while (ch
!= '>' && ch
!= '\n');
474 lr_error (lr
, _("unterminated symbolic name"));
476 /* Test for ISO 10646 position value. */
477 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
480 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
483 if (cp
== &buf
[bufact
- 1])
486 lr
->token
.tok
= tok_ucs4
;
487 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
493 /* It is a symbolic name. Test for reserved words. */
494 kw
= lr
->hash_fct (buf
, bufact
- 1);
496 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
498 lr
->token
.tok
= kw
->token
;
503 lr
->token
.tok
= tok_bsymbol
;
506 buf
= xrealloc (buf
, bufact
+ 1);
508 lr
->token
.val
.str
.startmb
= buf
;
509 lr
->token
.val
.str
.lenmb
= bufact
- 1;
516 static struct token
*
517 get_ident (struct linereader
*lr
)
522 const struct keyword_t
*kw
;
525 buf
= xmalloc (bufmax
);
528 ADDC (lr
->buf
[lr
->idx
- 1]);
530 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
531 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
533 if (ch
== lr
->escape_char
)
536 if (ch
== '\n' || ch
== EOF
)
538 lr_error (lr
, _("invalid escape sequence"));
547 kw
= lr
->hash_fct (buf
, bufact
);
549 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
551 lr
->token
.tok
= kw
->token
;
556 lr
->token
.tok
= tok_ident
;
559 buf
= xrealloc (buf
, bufact
+ 1);
561 lr
->token
.val
.str
.startmb
= buf
;
562 lr
->token
.val
.str
.lenmb
= bufact
;
569 static struct token
*
570 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
571 const struct repertoire_t
*repertoire
, int verbose
)
573 int return_widestr
= lr
->return_widestr
;
575 wchar_t *buf2
= NULL
;
579 /* We must return two different strings. */
580 buf
= xmalloc (bufmax
);
583 /* We know it'll be a string. */
584 lr
->token
.tok
= tok_string
;
586 /* If we need not translate the strings (i.e., expand <...> parts)
587 we can run a simple loop. */
588 if (!lr
->translate_strings
)
593 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
596 /* Catch errors with trailing escape character. */
597 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
598 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
600 lr_error (lr
, _("illegal escape sequence at end of string"));
603 else if (ch
== '\n' || ch
== EOF
)
604 lr_error (lr
, _("unterminated string"));
610 int illegal_string
= 0;
612 size_t buf2max
= 56 * sizeof (uint32_t);
616 /* We have to provide the wide character result as well. */
618 buf2
= xmalloc (buf2max
);
620 /* Read until the end of the string (or end of the line or file). */
621 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
629 /* The standards leave it up to the implementation to decide
630 what to do with characters which stand for themselves. We
631 could jump through hoops to find out the value relative to
632 the charmap and the repertoire map, but instead we leave
633 it up to the locale definition author to write a better
634 definition. We assume here that every character which
635 stands for itself is encoded using ISO 8859-1. Using the
636 escape character is allowed. */
637 if (ch
== lr
->escape_char
)
640 if (ch
== '\n' || ch
== EOF
)
644 if (verbose
&& !warned
)
647 non-symbolic character value should not be used"));
653 ADDWC ((uint32_t) ch
);
658 /* Now we have to search for the end of the symbolic name, i.e.,
661 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
663 if (ch
== lr
->escape_char
)
666 if (ch
== '\n' || ch
== EOF
)
671 if (ch
== '\n' || ch
== EOF
)
672 /* Not a correct string. */
674 if (bufact
== startidx
)
676 /* <> is not a correct name. Ignore it and also signal an
682 /* It might be a Uxxxx symbol. */
683 if (buf
[startidx
] == 'U'
684 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
686 char *cp
= buf
+ startidx
+ 1;
687 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
690 if (cp
== &buf
[bufact
])
693 const char *symbol
= NULL
;
697 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
699 /* Now forget about the name we just added. */
705 /* See whether the charmap contains the Uxxxxxxxx names. */
706 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
707 seq
= charmap_find_value (charmap
, utmp
, 9);
711 /* No, this isn't the case. Now determine from
712 the repertoire the name of the character and
713 find it in the charmap. */
714 if (repertoire
!= NULL
)
715 symbol
= repertoire_find_symbol (repertoire
, wch
);
718 /* We cannot generate a string since we
719 cannot map from the Unicode number to the
724 seq
= charmap_find_value (charmap
, symbol
,
728 /* Not a known name. */
734 ADDS (seq
->bytes
, seq
->nbytes
);
740 /* We now have the symbolic name in buf[startidx] to
741 buf[bufact-1]. Now find out the value for this character
742 in the charmap as well as in the repertoire map (in this
744 seq
= charmap_find_value (charmap
, &buf
[startidx
],
749 /* This name is not in the charmap. */
750 lr_error (lr
, _("symbol `%.*s' not in charmap"),
751 (int) (bufact
- startidx
), &buf
[startidx
]);
757 /* Now the same for the multibyte representation. */
758 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
762 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
768 if (wch
== ILLEGAL_CHAR_VALUE
)
770 /* This name is not in the repertoire map. */
771 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
772 (int) (bufact
- startidx
), &buf
[startidx
]);
779 /* Now forget about the name we just added. */
782 /* And copy the bytes. */
784 ADDS (seq
->bytes
, seq
->nbytes
);
787 if (ch
== '\n' || ch
== EOF
)
789 lr_error (lr
, _("unterminated string"));
798 lr
->token
.val
.str
.startmb
= NULL
;
799 lr
->token
.val
.str
.lenmb
= 0;
800 lr
->token
.val
.str
.startwc
= NULL
;
801 lr
->token
.val
.str
.lenwc
= 0;
811 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
812 buf2act
* sizeof (uint32_t));
813 lr
->token
.val
.str
.lenwc
= buf2act
;
817 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
818 lr
->token
.val
.str
.lenmb
= bufact
;