1 /* Copyright (C) 1996-2017 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
31 #include "localedef.h"
34 #include "linereader.h"
37 /* Prototypes for local functions. */
/* Every helper declared here receives the active line reader and hands
   back a pointer to a struct token.  NOTE(review): the get_string
   prototype is cut short in this text (nothing is visible after the
   repertoire parameter), so its remaining parameters are not
   documented here.  */
38 static struct token
*get_toplvl_escape (struct linereader
*lr
);
39 static struct token
*get_symname (struct linereader
*lr
);
40 static struct token
*get_ident (struct linereader
*lr
);
41 static struct token
*get_string (struct linereader
*lr
,
42 const struct charmap_t
*charmap
,
43 struct localedef_t
*locale
,
44 const struct repertoire_t
*repertoire
,
/* Open FNAME for tokenizing and return the reader built by lr_create.

   FNAME  path of the input; NULL, `-` and `/dev/stdin` all select the
          already open stdin stream, which is reported as <stdin>.
   HF     keyword lookup callback, passed through to lr_create.

   NOTE(review): the lines between the fopen call and the final return
   are missing from this text, so how a failed fopen is handled cannot
   be confirmed here.  */
49 lr_open (const char *fname
, kw_hash_fct_t hf
)
53 if (fname
== NULL
|| strcmp (fname
, "-") == 0
54 || strcmp (fname
, "/dev/stdin") == 0)
55 return lr_create (stdin
, "<stdin>", hf
);
/* The m in the mode string is a glibc fopen extension that asks for
   mmap based access to the file.  */
58 fp
= fopen (fname
, "rm");
61 return lr_create (fp
, fname
, hf
);
/* Build a line reader around the already open stream FP.

   FP     stream to read from.
   FNAME  name used for the reader; a private copy is made with xstrdup.
   HF     keyword lookup callback, stored in the hash_fct member.

   The visible code allocates the reader with xmalloc, installs the
   defaults (comment character is the hash sign, escape character is
   the backslash, string translation on, wide string results off) and
   reads the first line with getdelim.

   NOTE(review): several lines are missing from this text, among them
   the store of FP into the reader, the test of the getdelim result and
   the return statements.  */
66 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
68 struct linereader
*result
;
71 result
= (struct linereader
*) xmalloc (sizeof (*result
));
74 result
->fname
= xstrdup (fname
);
79 result
->comment_char
= '#';
80 result
->escape_char
= '\\';
81 result
->translate_strings
= 1;
82 result
->return_widestr
= 0;
/* Prime the buffer with the first line of input.  */
84 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
/* NOTE(review): the condition guarding this free is not visible;
   presumably it is the failure path of the getdelim call above.  */
89 free ((char *) result
->fname
);
/* A line ending in backslash plus newline is a continued line.  The
   literal backslash matches escape_char, which was set just above.  */
95 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
/* Terminate the buffer at N.  NOTE(review): any adjustment of N for
   the continued line case is not visible in this text.  */
98 result
->buf
[n
] = '\0';
100 result
->hash_fct
= hf
;
107 lr_eof (struct linereader
*lr
)
109 return lr
->bufact
= 0;
/* Skip whatever is left of the current logical line of LR.

   LR       reader whose idx and bufact members are advanced.
   VERBOSE  NOTE(review): no use of this parameter survives in this
            text; presumably it gates the trailing garbage check.

   The visible code first steps over blanks that are neither a newline
   nor the comment character, refilling through lr_next when the
   terminating NUL is reached.  Anything else found before the newline
   (and before end of file) is reported as trailing garbage.  Finally
   continuation lines are consumed until the buffer ends in a newline
   and idx is moved to the end of the buffer.

   NOTE(review): isspace is called on a raw buffer element.  The
   element type is not visible here; if it is plain char, bytes above
   0x7f may be passed as negative values, which the ctype functions do
   not allow.  */
114 lr_ignore_rest (struct linereader
*lr
, int verbose
)
118 while (isspace (lr
->buf
[lr
->idx
]) && lr
->buf
[lr
->idx
] != '\n'
119 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
120 if (lr
->buf
[lr
->idx
] == '\0')
122 if (lr_next (lr
) < 0)
128 if (lr
->buf
[lr
->idx
] != '\n' && ! feof (lr
->fp
)
129 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
130 lr_error (lr
, _("trailing garbage at end of line"));
133 /* Ignore continued lines: keep reading until the buffer ends in a newline. */
134 while (lr
->bufact
> 0 && lr
->buf
[lr
->bufact
- 1] != '\n')
135 if (lr_next (lr
) < 0)
/* Mark the whole buffer as consumed.  */
138 lr
->idx
= lr
->bufact
;
/* NOTE(review): only the signature of lr_close survives in this text;
   the body is not visible, so its behavior is not documented here.
   Presumably it releases the resources held by LR - confirm against
   the complete file.  */
143 lr_close (struct linereader
*lr
)
/* Read the next physical line of LR into its buffer.

   The visible code calls getdelim with a newline delimiter on the
   buf, bufsize and fp members of LR, then looks for a line that ends
   in the escape character followed by a newline (a continued line).

   Callers in this file treat a negative return value as failure.
   NOTE(review): the handling of the getdelim result, the bookkeeping
   of idx and bufact, and the return statements are missing from this
   text.  */
152 lr_next (struct linereader
*lr
)
156 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
162 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
/* NOTE(review): the statements around the substitution below are
   missing, so it cannot be told from this text whether the assignment
   is compiled in or sits inside a disabled preprocessor branch.  */
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
168 lr
->buf
[n
- 1] = ' ';
182 /* Defined in error.c. */
183 /* Counter that is incremented each time the error function is called. */
184 extern unsigned int error_message_count
;
186 /* The calling program should define program_name and set it to the
187 name of the executing program. */
188 extern char *program_name
;
/* Read the next token from LR and store it in the token member of LR.

   LR          reader to take characters from.
   CHARMAP     character map, forwarded to get_string.
   LOCALE      locale under construction, forwarded to get_string.
   REPERTOIRE  repertoire map, forwarded to get_string.
   NOTE(review): the parameter list is cut short in this text; a
   VERBOSE value is forwarded to get_string further down, so it is
   presumably the last parameter.

   Visible token kinds: tok_eof, tok_eol, the ellipsis family, tok_number,
   tok_semicolon, tok_comma, tok_open_brace, tok_close_brace, tok_minus1,
   plus whatever get_toplvl_escape, get_symname, get_string and
   get_ident produce.  The switch and case lines that select between
   them are missing from this text.  */
192 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
193 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
206 lr
->token
.tok
= tok_eof
;
212 lr
->token
.tok
= tok_eol
;
/* Tail of the loop that skips leading white space.  */
216 while (isspace (ch
));
218 if (ch
!= lr
->comment_char
)
221 /* Is there a newline at the end of the buffer? */
222 if (lr
->buf
[lr
->bufact
- 1] != '\n')
224 /* No. Some people want this to mean that only the line in
225 the file not the logical, concatenated line is ignored.
227 lr
->idx
= lr
->bufact
;
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr
, 0);
233 lr
->token
.tok
= tok_eol
;
237 /* Match escape sequences. */
238 if (ch
== lr
->escape_char
)
239 return get_toplvl_escape (lr
);
241 /* Match ellipsis. */
/* The longest spellings are tested first so that a shorter prefix
   does not win.  */
244 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
247 for (cnt
= 0; cnt
< 10; ++cnt
)
249 lr
->token
.tok
= tok_ellipsis4_2
;
252 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
257 lr
->token
.tok
= tok_ellipsis4
;
260 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
264 lr
->token
.tok
= tok_ellipsis3
;
267 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
270 for (cnt
= 0; cnt
< 6; ++cnt
)
272 lr
->token
.tok
= tok_ellipsis2_2
;
275 if (lr
->buf
[lr
->idx
] == '.')
278 lr
->token
.tok
= tok_ellipsis2
;
286 return get_symname (lr
);
/* Decimal number: start with the digit already read and fold in the
   digits that follow.  */
289 lr
->token
.tok
= tok_number
;
290 lr
->token
.val
.num
= ch
- '0';
292 while (isdigit (ch
= lr_getc (lr
)))
294 lr
->token
.val
.num
*= 10;
295 lr
->token
.val
.num
+= ch
- '0';
298 lr_error (lr
, _("garbage at end of number"));
304 lr
->token
.tok
= tok_semicolon
;
308 lr
->token
.tok
= tok_comma
;
312 lr
->token
.tok
= tok_open_brace
;
316 lr
->token
.tok
= tok_close_brace
;
320 return get_string (lr
, charmap
, locale
, repertoire
, verbose
);
326 lr
->token
.tok
= tok_minus1
;
/* Anything not matched above is handed to get_ident.  */
333 return get_ident (lr
);
/* Parse a numeric character code that starts with the escape character
   outside of a string.

   The byte values are collected in token.val.charcode.bytes of LR.
   Each value is read in BASE, which starts out as 8; base 16 is
   checked with isxdigit, other bases by comparing against the digit
   range that starts at the zero character.  Further values may follow
   as long as each is introduced by another escape character and the
   bytes array still has room.

   On a malformed digit the token becomes tok_error and its str member
   spans the offending text, beginning at START_IDX (one position
   before idx on entry).  On success the token is tok_charcode and
   charcode.nbytes holds the number of bytes stored.

   NOTE(review): the lines that read characters, select the base and
   return the token are missing from this text.  */
337 static struct token
*
338 get_toplvl_escape (struct linereader
*lr
)
340 /* This is supposed to be a numeric value. We return the
341 numerical value and the number of bytes. */
342 size_t start_idx
= lr
->idx
- 1;
343 unsigned char *bytes
= lr
->token
.val
.charcode
.bytes
;
349 unsigned int byte
= 0;
350 unsigned int base
= 8;
/* Reject a first character that is not a digit of the chosen base.  */
365 if ((base
== 16 && !isxdigit (ch
))
366 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
369 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
/* Extend the error span up to the next white space or end of input.  */
371 while (ch
!= EOF
&& !isspace (ch
))
373 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
375 lr
->token
.tok
= tok_error
;
/* Letter digits a to f count as 10 to 15.  */
382 byte
= tolower (ch
) - 'a' + 10;
385 if ((base
== 16 && !isxdigit (ch
))
386 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
393 byte
+= tolower (ch
) - 'a' + 10;
396 if (base
!= 16 && isdigit (ch
))
404 bytes
[nbytes
++] = byte
;
/* Keep going while another escape follows and the array has room.  */
406 while (ch
== lr
->escape_char
407 && nbytes
< (int) sizeof (lr
->token
.val
.charcode
.bytes
));
410 lr_error (lr
, _("garbage at end of character code specification"));
414 lr
->token
.tok
= tok_charcode
;
415 lr
->token
.val
.charcode
.nbytes
= nbytes
;
/* Fragments of the buffer growing helper macros used by the token
   readers in this file (ADDC, ADDS and ADDWC are the names used at the
   call sites).  NOTE(review): the define lines, the statements that
   enlarge bufmax and buf2max, and the loop wrappers are missing from
   this text.

   Visible behavior:
   - single byte append: when bufact reaches bufmax, buf is reallocated
     to bufmax bytes, then the character is stored at buf[bufact++];
   - byte string append: when bufact + _l exceeds bufmax, buf is
     reallocated to bufmax bytes, then _l bytes are copied from s;
   - wide character append: when buf2act reaches buf2max, buf2 is
     reallocated to buf2max * 4 bytes, then the value is stored at
     buf2[buf2act++].

   NOTE(review): the wide variant treats buf2max as a number of
   elements (it multiplies by 4 for the byte size).  Any code that
   sets up buf2 must therefore allocate buf2max elements, not buf2max
   bytes; verify the initial allocation in get_string.  */
424 if (bufact == bufmax) \
427 buf = xrealloc (buf, bufmax); \
429 buf[bufact++] = (ch); \
438 if (bufact + _l > bufmax) \
443 buf = xrealloc (buf, bufmax); \
445 memcpy (&buf[bufact], s, _l); \
454 if (buf2act == buf2max) \
457 buf2 = xrealloc (buf2, buf2max * 4); \
459 buf2[buf2act++] = (ch); \
/* Read a symbolic name written in angle brackets and classify it.

   Characters are collected into a heap buffer up to the closing
   bracket or a newline; a newline means the name is unterminated and
   is reported through lr_error.  The result is one of:
   - tok_ucs4 when the text is U followed only by hex digits and the
     collected length is 6 or 10 (the count includes the closing
     bracket); val.ucs4 receives the value parsed by strtoul;
   - the keyword token when the hash_fct callback of LR finds the name
     and marks it with symname_or_ident equal to 1;
   - tok_bsymbol otherwise, with val.str.startmb pointing at the
     buffer and val.str.lenmb set to the length without the bracket.

   NOTE(review): the visible loop condition tests only for the closing
   bracket and newline.  If lr_getc reports EOF the loop does not
   obviously stop; confirm against the complete file.  Several lines,
   including the return statements, are missing from this text.  */
464 static struct token
*
465 get_symname (struct linereader
*lr
)
467 /* Symbol in brackets. We must distinguish three kinds:
469 2. ISO 10646 position values
474 const struct keyword_t
*kw
;
477 buf
= (char *) xmalloc (bufmax
);
482 if (ch
== lr
->escape_char
)
484 int c2
= lr_getc (lr
);
493 while (ch
!= '>' && ch
!= '\n');
496 lr_error (lr
, _("unterminated symbolic name"));
498 /* Test for ISO 10646 position value. */
499 if (buf
[0] == 'U' && (bufact
== 6 || bufact
== 10))
502 while (cp
< &buf
[bufact
- 1] && isxdigit (*cp
))
/* Every character before the closing bracket was a hex digit.  */
505 if (cp
== &buf
[bufact
- 1])
508 lr
->token
.tok
= tok_ucs4
;
509 lr
->token
.val
.ucs4
= strtoul (buf
+ 1, NULL
, 16);
515 /* It is a symbolic name. Test for reserved words. */
516 kw
= lr
->hash_fct (buf
, bufact
- 1);
518 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
520 lr
->token
.tok
= kw
->token
;
525 lr
->token
.tok
= tok_bsymbol
;
/* Trim the buffer to the size actually used before handing it over.  */
527 buf
= xrealloc (buf
, bufact
+ 1);
530 lr
->token
.val
.str
.startmb
= buf
;
531 lr
->token
.val
.str
.lenmb
= bufact
- 1;
/* Read an identifier or keyword that starts at the character just
   consumed from LR.

   The first character is taken from buf[idx - 1]; further characters
   are appended until white space, a double quote, a semicolon, an
   opening angle bracket, a comma or EOF is met.  An escape character
   followed by a newline or EOF is reported as an invalid escape
   sequence.

   The collected text is looked up through the hash_fct callback of LR.
   A hit whose symname_or_ident member is 0 yields the keyword token;
   anything else becomes tok_ident with val.str.startmb pointing at the
   heap buffer and val.str.lenmb set to its length.

   NOTE(review): several lines, including the handling of the stop
   character and the return statements, are missing from this text.  */
538 static struct token
*
539 get_ident (struct linereader
*lr
)
544 const struct keyword_t
*kw
;
547 buf
= xmalloc (bufmax
);
/* The caller already consumed the first character; pick it up again.  */
550 ADDC (lr
->buf
[lr
->idx
- 1]);
552 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
553 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
555 if (ch
== lr
->escape_char
)
558 if (ch
== '\n' || ch
== EOF
)
560 lr_error (lr
, _("invalid escape sequence"));
569 kw
= lr
->hash_fct (buf
, bufact
);
571 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
573 lr
->token
.tok
= kw
->token
;
578 lr
->token
.tok
= tok_ident
;
/* Trim the buffer to the size actually used before handing it over.  */
580 buf
= xrealloc (buf
, bufact
+ 1);
583 lr
->token
.val
.str
.startmb
= buf
;
584 lr
->token
.val
.str
.lenmb
= bufact
;
/* Read a string literal from LR and store it as a tok_string token.

   LR          reader positioned just after the opening double quote.
   CHARMAP     used to turn symbolic names into byte sequences.
   LOCALE      consulted for transliteration data; its CTYPE category
               is loaded on demand through load_locale.
   REPERTOIRE  used to map symbolic names to wide character values.
   NOTE(review): the parameter list is cut short in this text; VERBOSE
   is used in the body and is presumably the last parameter.

   Two modes are visible:
   - when translate_strings of LR is off, characters are copied up to
     the closing quote, a newline or EOF; a dangling escape character
     and an unterminated string are reported through lr_error;
   - otherwise plain characters are copied (and widened as ISO 8859-1
     values), while names in angle brackets are resolved through the
     charmap, the repertoire map and, unless NO_TRANSLITERATION is
     defined, the transliteration table of the locale.

   The token receives the multibyte text in val.str.startmb/lenmb and
   the wide text in val.str.startwc/lenwc; one visible path sets all
   four to NULL and 0 instead.

   NOTE(review): many lines, including most conditions and all return
   statements, are missing from this text.  */
591 static struct token
*
592 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
593 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
596 int return_widestr
= lr
->return_widestr
;
598 wchar_t *buf2
= NULL
;
602 /* We must return two different strings. */
603 buf
= xmalloc (bufmax
);
606 /* We know it'll be a string. */
607 lr
->token
.tok
= tok_string
;
609 /* If we need not translate the strings (i.e., expand <...> parts)
610 we can run a simple loop. */
611 if (!lr
->translate_strings
)
616 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
619 /* Catch errors with trailing escape character. */
620 if (bufact
> 0 && buf
[bufact
- 1] == lr
->escape_char
621 && (bufact
== 1 || buf
[bufact
- 2] != lr
->escape_char
))
623 lr_error (lr
, _("illegal escape sequence at end of string"));
626 else if (ch
== '\n' || ch
== EOF
)
627 lr_error (lr
, _("unterminated string"));
633 int illegal_string
= 0;
/* NOTE(review): buf2max starts out as a byte count (56 elements times
   the element size) and exactly that many bytes are allocated below.
   The ADDWC macro, however, compares buf2max with the element count
   buf2act and reallocates buf2max * 4 bytes.  Unless lines hidden from
   this text compensate, buf2 can be written past its first allocation
   once more than 56 wide characters are stored.  Confirm, then either
   start buf2max at 56 or allocate buf2max * sizeof (uint32_t).  */
635 size_t buf2max
= 56 * sizeof (uint32_t);
639 /* We have to provide the wide character result as well. */
641 buf2
= xmalloc (buf2max
);
643 /* Read until the end of the string (or end of the line or file). */
644 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
652 /* The standards leave it up to the implementation to decide
653 what to do with characters which stand for themselves. We
654 could jump through hoops to find out the value relative to
655 the charmap and the repertoire map, but instead we leave
656 it up to the locale definition author to write a better
657 definition. We assume here that every character which
658 stands for itself is encoded using ISO 8859-1. Using the
659 escape character is allowed. */
660 if (ch
== lr
->escape_char
)
663 if (ch
== '\n' || ch
== EOF
)
667 if (verbose
&& !warned
)
670 non-symbolic character value should not be used"));
676 ADDWC ((uint32_t) ch
);
681 /* Now we have to search for the end of the symbolic name, i.e.,
684 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
686 if (ch
== lr
->escape_char
)
689 if (ch
== '\n' || ch
== EOF
)
694 if (ch
== '\n' || ch
== EOF
)
695 /* Not a correct string. */
697 if (bufact
== startidx
)
699 /* <> is no correct name. Ignore it and also signal an
705 /* It might be a Uxxxx symbol. */
706 if (buf
[startidx
] == 'U'
707 && (bufact
- startidx
== 5 || bufact
- startidx
== 9))
709 char *cp
= buf
+ startidx
+ 1;
710 while (cp
< &buf
[bufact
] && isxdigit (*cp
))
713 if (cp
== &buf
[bufact
])
719 wch
= strtoul (buf
+ startidx
+ 1, NULL
, 16);
721 /* Now forget about the name we just added. */
727 /* See whether the charmap contains the Uxxxxxxxx names. */
728 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
729 seq
= charmap_find_value (charmap
, utmp
, 9);
733 /* No, this isn't the case. Now determine from
734 the repertoire the name of the character and
735 find it in the charmap. */
736 if (repertoire
!= NULL
)
740 symbol
= repertoire_find_symbol (repertoire
, wch
);
743 seq
= charmap_find_value (charmap
, symbol
,
749 #ifndef NO_TRANSLITERATION
750 /* Transliterate if possible. */
755 if ((locale
->avail
& CTYPE_LOCALE
) == 0)
757 /* Load the CTYPE data now. */
758 int old_needed
= locale
->needed
;
761 locale
= load_locale (LC_CTYPE
,
763 locale
->repertoire_name
,
765 locale
->needed
= old_needed
;
768 if ((locale
->avail
& CTYPE_LOCALE
) != 0
769 && ((translit
= find_translit (locale
,
772 /* The CTYPE data contains a matching
777 for (i
= 0; translit
[i
] != 0; ++i
)
781 snprintf (utmp
, sizeof (utmp
), "U%08X",
783 seq
= charmap_find_value (charmap
, utmp
,
785 assert (seq
!= NULL
);
786 ADDS (seq
->bytes
, seq
->nbytes
);
792 #endif /* NO_TRANSLITERATION */
794 /* Not a known name. */
800 ADDS (seq
->bytes
, seq
->nbytes
);
806 /* We now have the symbolic name in buf[startidx] to
807 buf[bufact-1]. Now find out the value for this character
808 in the charmap as well as in the repertoire map (in this
810 seq
= charmap_find_value (charmap
, &buf
[startidx
],
815 /* This name is not in the charmap. */
816 lr_error (lr
, _("symbol `%.*s' not in charmap"),
817 (int) (bufact
- startidx
), &buf
[startidx
]);
823 /* Now the same for the multibyte representation. */
824 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
828 wch
= repertoire_find_value (repertoire
, &buf
[startidx
],
834 if (wch
== ILLEGAL_CHAR_VALUE
)
836 /* This name is not in the repertoire map. */
837 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
838 (int) (bufact
- startidx
), &buf
[startidx
]);
845 /* Now forget about the name we just added. */
848 /* And copy the bytes. */
850 ADDS (seq
->bytes
, seq
->nbytes
);
853 if (ch
== '\n' || ch
== EOF
)
855 lr_error (lr
, _("unterminated string"));
/* NOTE(review): the condition selecting this branch is not visible;
   presumably it is the illegal string case.  The token then carries
   no text at all.  */
863 lr
->token
.val
.str
.startmb
= NULL
;
864 lr
->token
.val
.str
.lenmb
= 0;
865 lr
->token
.val
.str
.startwc
= NULL
;
866 lr
->token
.val
.str
.lenwc
= 0;
/* Shrink the wide buffer to the elements in use; lenwc counts
   elements, not bytes.  */
876 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
877 buf2act
* sizeof (uint32_t));
878 lr
->token
.val
.str
.lenwc
= buf2act
;
/* Shrink the byte buffer to the bytes in use.  */
882 lr
->token
.val
.str
.startmb
= xrealloc (buf
, bufact
);
883 lr
->token
.val
.str
.lenmb
= bufact
;