locale: Use correct buffer size for utf8_sequence_error [BZ #19444]
[glibc.git] / locale / programs / linereader.c
blobf8c49ac06f3bc53e898ee978b7a11e41b41f424c
1 /* Copyright (C) 1996-2023 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <stdarg.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <stdint.h>
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "error.h"
33 #include "linereader.h"
34 #include "locfile.h"
/* Prototypes for local functions.  Each token reader fills in
   LR->token and returns a pointer to it.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
48 struct linereader *
49 lr_open (const char *fname, kw_hash_fct_t hf)
51 FILE *fp;
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
55 return lr_create (stdin, "<stdin>", hf);
56 else
58 fp = fopen (fname, "rm");
59 if (fp == NULL)
60 return NULL;
61 return lr_create (fp, fname, hf);
65 struct linereader *
66 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
68 struct linereader *result;
69 int n;
71 result = (struct linereader *) xmalloc (sizeof (*result));
73 result->fp = fp;
74 result->fname = xstrdup (fname);
75 result->buf = NULL;
76 result->bufsize = 0;
77 result->lineno = 1;
78 result->idx = 0;
79 result->comment_char = '#';
80 result->escape_char = '\\';
81 result->translate_strings = 1;
82 result->return_widestr = 0;
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
87 int save = errno;
88 fclose (result->fp);
89 free ((char *) result->fname);
90 free (result);
91 errno = save;
92 return NULL;
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
102 return result;
107 lr_eof (struct linereader *lr)
109 return lr->bufact = 0;
113 void
114 lr_ignore_rest (struct linereader *lr, int verbose)
116 if (verbose)
118 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 && lr->buf[lr->idx] != lr->comment_char)
120 if (lr->buf[lr->idx] == '\0')
122 if (lr_next (lr) < 0)
123 return;
125 else
126 ++lr->idx;
128 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 && lr->buf[lr->idx] != lr->comment_char)
130 lr_error (lr, _("trailing garbage at end of line"));
133 /* Ignore continued line. */
134 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135 if (lr_next (lr) < 0)
136 break;
138 lr->idx = lr->bufact;
142 void
143 lr_close (struct linereader *lr)
145 fclose (lr->fp);
146 free (lr->buf);
147 free (lr);
152 lr_next (struct linereader *lr)
154 int n;
156 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157 if (n < 0)
158 return -1;
160 ++lr->lineno;
162 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
164 #if 0
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
167 --n;
168 lr->buf[n - 1] = ' ';
169 #else
170 n -= 2;
171 #endif
174 lr->buf[n] = '\0';
175 lr->bufact = n;
176 lr->idx = 0;
178 return 0;
/* Defined in error.c.  */
/* This variable is incremented each time `error' is called.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;
191 struct token *
192 lr_token (struct linereader *lr, const struct charmap_t *charmap,
193 struct localedef_t *locale, const struct repertoire_t *repertoire,
194 int verbose)
196 int ch;
198 while (1)
202 ch = lr_getc (lr);
204 if (ch == EOF)
206 lr->token.tok = tok_eof;
207 return &lr->token;
210 if (ch == '\n')
212 lr->token.tok = tok_eol;
213 return &lr->token;
216 while (isspace (ch));
218 if (ch != lr->comment_char)
219 break;
221 /* Is there an newline at the end of the buffer? */
222 if (lr->buf[lr->bufact - 1] != '\n')
224 /* No. Some people want this to mean that only the line in
225 the file not the logical, concatenated line is ignored.
226 Let's try this. */
227 lr->idx = lr->bufact;
228 continue;
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr, 0);
233 lr->token.tok = tok_eol;
234 return &lr->token;
237 /* Match escape sequences. */
238 if (ch == lr->escape_char)
239 return get_toplvl_escape (lr);
241 /* Match ellipsis. */
242 if (ch == '.')
244 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
246 int cnt;
247 for (cnt = 0; cnt < 10; ++cnt)
248 lr_getc (lr);
249 lr->token.tok = tok_ellipsis4_2;
250 return &lr->token;
252 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
254 lr_getc (lr);
255 lr_getc (lr);
256 lr_getc (lr);
257 lr->token.tok = tok_ellipsis4;
258 return &lr->token;
260 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
262 lr_getc (lr);
263 lr_getc (lr);
264 lr->token.tok = tok_ellipsis3;
265 return &lr->token;
267 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
269 int cnt;
270 for (cnt = 0; cnt < 6; ++cnt)
271 lr_getc (lr);
272 lr->token.tok = tok_ellipsis2_2;
273 return &lr->token;
275 if (lr->buf[lr->idx] == '.')
277 lr_getc (lr);
278 lr->token.tok = tok_ellipsis2;
279 return &lr->token;
283 switch (ch)
285 case '<':
286 return get_symname (lr);
288 case '0' ... '9':
289 lr->token.tok = tok_number;
290 lr->token.val.num = ch - '0';
292 while (isdigit (ch = lr_getc (lr)))
294 lr->token.val.num *= 10;
295 lr->token.val.num += ch - '0';
297 if (isalpha (ch))
298 lr_error (lr, _("garbage at end of number"));
299 lr_ungetn (lr, 1);
301 return &lr->token;
303 case ';':
304 lr->token.tok = tok_semicolon;
305 return &lr->token;
307 case ',':
308 lr->token.tok = tok_comma;
309 return &lr->token;
311 case '(':
312 lr->token.tok = tok_open_brace;
313 return &lr->token;
315 case ')':
316 lr->token.tok = tok_close_brace;
317 return &lr->token;
319 case '"':
320 return get_string (lr, charmap, locale, repertoire, verbose);
322 case '-':
323 ch = lr_getc (lr);
324 if (ch == '1')
326 lr->token.tok = tok_minus1;
327 return &lr->token;
329 lr_ungetn (lr, 2);
330 break;
332 case 0x80 ... 0xff: /* UTF-8 sequence. */
334 uint32_t wch;
335 if (!utf8_decode (lr, ch, &wch))
337 lr->token.tok = tok_error;
338 return &lr->token;
340 lr->token.tok = tok_ucs4;
341 lr->token.val.ucs4 = wch;
342 return &lr->token;
346 return get_ident (lr);
350 static struct token *
351 get_toplvl_escape (struct linereader *lr)
353 /* This is supposed to be a numeric value. We return the
354 numerical value and the number of bytes. */
355 size_t start_idx = lr->idx - 1;
356 unsigned char *bytes = lr->token.val.charcode.bytes;
357 size_t nbytes = 0;
358 int ch;
362 unsigned int byte = 0;
363 unsigned int base = 8;
365 ch = lr_getc (lr);
367 if (ch == 'd')
369 base = 10;
370 ch = lr_getc (lr);
372 else if (ch == 'x')
374 base = 16;
375 ch = lr_getc (lr);
378 if ((base == 16 && !isxdigit (ch))
379 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
381 esc_error:
382 lr->token.val.str.startmb = &lr->buf[start_idx];
384 while (ch != EOF && !isspace (ch))
385 ch = lr_getc (lr);
386 lr->token.val.str.lenmb = lr->idx - start_idx;
388 lr->token.tok = tok_error;
389 return &lr->token;
392 if (isdigit (ch))
393 byte = ch - '0';
394 else
395 byte = tolower (ch) - 'a' + 10;
397 ch = lr_getc (lr);
398 if ((base == 16 && !isxdigit (ch))
399 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
400 goto esc_error;
402 byte *= base;
403 if (isdigit (ch))
404 byte += ch - '0';
405 else
406 byte += tolower (ch) - 'a' + 10;
408 ch = lr_getc (lr);
409 if (base != 16 && isdigit (ch))
411 byte *= base;
412 byte += ch - '0';
414 ch = lr_getc (lr);
417 bytes[nbytes++] = byte;
419 while (ch == lr->escape_char
420 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
422 if (!isspace (ch))
423 lr_error (lr, _("garbage at end of character code specification"));
425 lr_ungetn (lr, 1);
427 lr->token.tok = tok_charcode;
428 lr->token.val.charcode.nbytes = nbytes;
430 return &lr->token;
/* Multibyte string buffer.  */
struct lr_buffer
{
  size_t act;   /* Number of bytes of BUF currently in use.  */
  size_t max;   /* Number of bytes allocated for BUF.  */
  char *buf;    /* The storage itself; not NUL-terminated.  */
};
441 /* Initialize *LRB with a default-sized buffer. */
442 static void
443 lr_buffer_init (struct lr_buffer *lrb)
445 lrb->act = 0;
446 lrb->max = 56;
447 lrb->buf = xmalloc (lrb->max);
450 /* Transfers the buffer string from *LRB to LR->token.mbstr. */
451 static void
452 lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
454 lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
455 lr->token.val.str.startmb[lrb->act] = '\0';
456 lr->token.val.str.lenmb = lrb->act;
459 /* Adds CH to *LRB. */
460 static void
461 addc (struct lr_buffer *lrb, char ch)
463 if (lrb->act == lrb->max)
465 lrb->max *= 2;
466 lrb->buf = xrealloc (lrb->buf, lrb->max);
468 lrb->buf[lrb->act++] = ch;
471 /* Adds L bytes at S to *LRB. */
472 static void
473 adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
475 if (lrb->max - lrb->act < l)
477 size_t required_size = lrb->act + l;
478 size_t new_max = 2 * lrb->max;
479 if (new_max < required_size)
480 new_max = required_size;
481 lrb->buf = xrealloc (lrb->buf, new_max);
482 lrb->max = new_max;
484 memcpy (lrb->buf + lrb->act, s, l);
485 lrb->act += l;
/* Append the wide character CH to the array BUF2 of the expanding
   function.  BUF2ACT is the number of elements in use and BUF2MAX the
   number of elements allocated (both counts, not byte sizes); the
   array is doubled when it is full.  All three must be variables in
   scope at the point of use.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));                   \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
501 static struct token *
502 get_symname (struct linereader *lr)
504 /* Symbol in brackets. We must distinguish three kinds:
505 1. reserved words
506 2. ISO 10646 position values
507 3. all other. */
508 const struct keyword_t *kw;
509 int ch;
510 struct lr_buffer lrb;
512 lr_buffer_init (&lrb);
516 ch = lr_getc (lr);
517 if (ch == lr->escape_char)
519 int c2 = lr_getc (lr);
520 addc (&lrb, c2);
522 if (c2 == '\n')
523 ch = '\n';
525 else
526 addc (&lrb, ch);
528 while (ch != '>' && ch != '\n');
530 if (ch == '\n')
531 lr_error (lr, _("unterminated symbolic name"));
533 /* Test for ISO 10646 position value. */
534 if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
536 char *cp = lrb.buf + 1;
537 while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
538 ++cp;
540 if (cp == &lrb.buf[lrb.act - 1])
542 /* Yes, it is. */
543 lr->token.tok = tok_ucs4;
544 lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
546 return &lr->token;
550 /* It is a symbolic name. Test for reserved words. */
551 kw = lr->hash_fct (lrb.buf, lrb.act - 1);
553 if (kw != NULL && kw->symname_or_ident == 1)
555 lr->token.tok = kw->token;
556 free (lrb.buf);
558 else
560 lr->token.tok = tok_bsymbol;
561 lr_buffer_to_token (&lrb, lr);
562 --lr->token.val.str.lenmb; /* Hide the training '>'. */
565 return &lr->token;
569 static struct token *
570 get_ident (struct linereader *lr)
572 const struct keyword_t *kw;
573 int ch;
574 struct lr_buffer lrb;
576 lr_buffer_init (&lrb);
578 addc (&lrb, lr->buf[lr->idx - 1]);
580 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
581 && ch != '<' && ch != ',' && ch != EOF)
583 if (ch == lr->escape_char)
585 ch = lr_getc (lr);
586 if (ch == '\n' || ch == EOF)
588 lr_error (lr, _("invalid escape sequence"));
589 break;
592 addc (&lrb, ch);
595 lr_ungetc (lr, ch);
597 kw = lr->hash_fct (lrb.buf, lrb.act);
599 if (kw != NULL && kw->symname_or_ident == 0)
601 lr->token.tok = kw->token;
602 free (lrb.buf);
604 else
606 lr->token.tok = tok_ident;
607 lr_buffer_to_token (&lrb, lr);
610 return &lr->token;
613 /* Process a decoded Unicode codepoint WCH in a string, placing the
614 multibyte sequence into LRB. Return false if the character is not
615 found in CHARMAP/REPERTOIRE. */
616 static bool
617 translate_unicode_codepoint (struct localedef_t *locale,
618 const struct charmap_t *charmap,
619 const struct repertoire_t *repertoire,
620 uint32_t wch, struct lr_buffer *lrb)
622 /* See whether the charmap contains the Uxxxxxxxx names. */
623 char utmp[10];
624 snprintf (utmp, sizeof (utmp), "U%08X", wch);
625 struct charseq *seq = charmap_find_value (charmap, utmp, 9);
627 if (seq == NULL)
629 /* No, this isn't the case. Now determine from
630 the repertoire the name of the character and
631 find it in the charmap. */
632 if (repertoire != NULL)
634 const char *symbol = repertoire_find_symbol (repertoire, wch);
635 if (symbol != NULL)
636 seq = charmap_find_value (charmap, symbol, strlen (symbol));
639 if (seq == NULL)
641 #ifndef NO_TRANSLITERATION
642 /* Transliterate if possible. */
643 if (locale != NULL)
645 if ((locale->avail & CTYPE_LOCALE) == 0)
647 /* Load the CTYPE data now. */
648 int old_needed = locale->needed;
650 locale->needed = 0;
651 locale = load_locale (LC_CTYPE, locale->name,
652 locale->repertoire_name,
653 charmap, locale);
654 locale->needed = old_needed;
657 uint32_t *translit;
658 if ((locale->avail & CTYPE_LOCALE) != 0
659 && ((translit = find_translit (locale, charmap, wch))
660 != NULL))
661 /* The CTYPE data contains a matching
662 transliteration. */
664 for (int i = 0; translit[i] != 0; ++i)
666 snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
667 seq = charmap_find_value (charmap, utmp, 9);
668 assert (seq != NULL);
669 adds (lrb, seq->bytes, seq->nbytes);
671 return true;
674 #endif /* NO_TRANSLITERATION */
676 /* Not a known name. */
677 return false;
681 if (seq != NULL)
683 adds (lrb, seq->bytes, seq->nbytes);
684 return true;
686 else
687 return false;
/* Returns true if CH is not EOF (that is, non-negative) and a valid
   UTF-8 trailing byte, i.e. its two most significant bits within the
   byte are 10.  */
static bool
utf8_valid_trailing (int ch)
{
  return ch >= 0 && (ch & 0xc0) == 0x80;
}
/* Reports an error for a broken UTF-8 sequence.  CH1 is the leading
   byte; CH2 to CH4 are the following bytes as far as they were read
   and may be EOF (negative), in which case they and all later ones are
   left out of the message.  Always returns false so that callers can
   return the result directly.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
                     int ch4)
{
  /* Large enough for the longest possible output, assuming 32-bit
     int: "0xNN" for CH1 (4 bytes), then for each of the three int
     arguments a space and "0x" with up to eight digits (11 bytes),
     and the terminating NUL.  */
  char buf[38];

  if (ch2 < 0)
    snprintf (buf, sizeof (buf), "0x%02x", ch1);
  else if (ch3 < 0)
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
  else if (ch4 < 0)
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
  else
    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
              ch1, ch2, ch3, ch4);

  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
  return false;
}
/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
   stores the decoded codepoint in *WCH.  Returns false on failure and
   reports an error; *WCH is not written in that case.  Rejected are
   stray trailing bytes, leading bytes that cannot start a sequence,
   missing or malformed trailing bytes, overlong encodings, encoded
   surrogates (U+D800..U+DFFF) and values above U+10FFFF.  The bytes
   read up to the point of failure stay consumed.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */

  /* 0x80..0xbf are trailing bytes, 0xc0 and 0xc1 could only start
     overlong two-byte sequences.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 <= 0xdf)
    {
      uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (result < 0x80)
        return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = result;
      return true;
    }

  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  /* Three-byte sequence.  */
  if (ch1 <= 0xef)
    {
      uint32_t result = (((ch1 & 0x0f) << 12)
                         | ((ch2 & 0x3f) << 6)
                         | (ch3 & 0x3f));
      /* Overlong encodings and UTF-16 surrogates are not valid.  */
      if (result < 0x800 || (result >= 0xd800 && result <= 0xdfff))
        return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = result;
      return true;
    }

  /* Four-byte sequence.  */
  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  uint32_t result = (((ch1 & 0x07) << 18)
                     | ((ch2 & 0x3f) << 12)
                     | ((ch3 & 0x3f) << 6)
                     | (ch4 & 0x3f));
  /* Overlong encodings and values beyond the end of the Unicode range
     (possible with the leading byte 0xf4) are not valid.  */
  if (result < 0x10000 || result > 0x10ffff)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = result;
  return true;
}
772 static struct token *
773 get_string (struct linereader *lr, const struct charmap_t *charmap,
774 struct localedef_t *locale, const struct repertoire_t *repertoire,
775 int verbose)
777 int return_widestr = lr->return_widestr;
778 struct lr_buffer lrb;
779 wchar_t *buf2 = NULL;
781 lr_buffer_init (&lrb);
783 /* We know it'll be a string. */
784 lr->token.tok = tok_string;
786 /* If we need not translate the strings (i.e., expand <...> parts)
787 we can run a simple loop. */
788 if (!lr->translate_strings)
790 int ch;
792 buf2 = NULL;
793 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
795 if (ch >= 0x80)
796 lr_error (lr, _("illegal 8-bit character in untranslated string"));
797 addc (&lrb, ch);
800 /* Catch errors with trailing escape character. */
801 if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
802 && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
804 lr_error (lr, _("illegal escape sequence at end of string"));
805 --lrb.act;
807 else if (ch == '\n' || ch == EOF)
808 lr_error (lr, _("unterminated string"));
810 addc (&lrb, '\0');
812 else
814 bool illegal_string = false;
815 size_t buf2act = 0;
816 size_t buf2max = 56 * sizeof (uint32_t);
817 int ch;
819 /* We have to provide the wide character result as well. */
820 if (return_widestr)
821 buf2 = xmalloc (buf2max);
823 /* Read until the end of the string (or end of the line or file). */
824 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
826 size_t startidx;
827 uint32_t wch;
828 struct charseq *seq;
830 if (ch != '<')
832 /* The standards leave it up to the implementation to
833 decide what to do with characters which stand for
834 themselves. This implementation treats the input
835 file as encoded in UTF-8. */
836 if (ch == lr->escape_char)
838 ch = lr_getc (lr);
839 if (ch >= 0x80)
841 lr_error (lr, _("illegal 8-bit escape sequence"));
842 illegal_string = true;
843 break;
845 if (ch == '\n' || ch == EOF)
846 break;
847 addc (&lrb, ch);
848 wch = ch;
850 else if (ch < 0x80)
852 wch = ch;
853 addc (&lrb, ch);
855 else /* UTF-8 sequence. */
857 if (!utf8_decode (lr, ch, &wch))
859 illegal_string = true;
860 break;
862 if (!translate_unicode_codepoint (locale, charmap,
863 repertoire, wch, &lrb))
865 /* Ignore the rest of the string. Callers may
866 skip this string because it cannot be encoded
867 in the output character set. */
868 illegal_string = true;
869 continue;
873 if (return_widestr)
874 ADDWC (wch);
876 continue;
879 /* Now we have to search for the end of the symbolic name, i.e.,
880 the closing '>'. */
881 startidx = lrb.act;
882 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
884 if (ch == lr->escape_char)
886 ch = lr_getc (lr);
887 if (ch == '\n' || ch == EOF)
888 break;
890 addc (&lrb, ch);
892 if (ch == '\n' || ch == EOF)
893 /* Not a correct string. */
894 break;
895 if (lrb.act == startidx)
897 /* <> is no correct name. Ignore it and also signal an
898 error. */
899 illegal_string = true;
900 continue;
903 /* It might be a Uxxxx symbol. */
904 if (lrb.buf[startidx] == 'U'
905 && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
907 char *cp = lrb.buf + startidx + 1;
908 while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
909 ++cp;
911 if (cp == &lrb.buf[lrb.act])
913 /* Yes, it is. */
914 addc (&lrb, '\0');
915 wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
917 /* Now forget about the name we just added. */
918 lrb.act = startidx;
920 if (return_widestr)
921 ADDWC (wch);
923 if (!translate_unicode_codepoint (locale, charmap,
924 repertoire, wch, &lrb))
925 illegal_string = true;
926 continue;
930 /* We now have the symbolic name in lrb.buf[startidx] to
931 lrb.buf[lrb.act-1]. Now find out the value for this character
932 in the charmap as well as in the repertoire map (in this
933 order). */
934 seq = charmap_find_value (charmap, &lrb.buf[startidx],
935 lrb.act - startidx);
937 if (seq == NULL)
939 /* This name is not in the charmap. */
940 lr_error (lr, _("symbol `%.*s' not in charmap"),
941 (int) (lrb.act - startidx), &lrb.buf[startidx]);
942 illegal_string = true;
945 if (return_widestr)
947 /* Now the same for the multibyte representation. */
948 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
949 wch = seq->ucs4;
950 else
952 wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
953 lrb.act - startidx);
954 if (seq != NULL)
955 seq->ucs4 = wch;
958 if (wch == ILLEGAL_CHAR_VALUE)
960 /* This name is not in the repertoire map. */
961 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
962 (int) (lrb.act - startidx), &lrb.buf[startidx]);
963 illegal_string = true;
965 else
966 ADDWC (wch);
969 /* Now forget about the name we just added. */
970 lrb.act = startidx;
972 /* And copy the bytes. */
973 if (seq != NULL)
974 adds (&lrb, seq->bytes, seq->nbytes);
977 if (ch == '\n' || ch == EOF)
979 lr_error (lr, _("unterminated string"));
980 illegal_string = true;
983 if (illegal_string)
985 free (lrb.buf);
986 free (buf2);
987 lr->token.val.str.startmb = NULL;
988 lr->token.val.str.lenmb = 0;
989 lr->token.val.str.startwc = NULL;
990 lr->token.val.str.lenwc = 0;
992 return &lr->token;
995 addc (&lrb, '\0');
997 if (return_widestr)
999 ADDWC (0);
1000 lr->token.val.str.startwc = xrealloc (buf2,
1001 buf2act * sizeof (uint32_t));
1002 lr->token.val.str.lenwc = buf2act;
1006 lr_buffer_to_token (&lrb, lr);
1008 return &lr->token;