Fix tests which expose ldbl -> _Float128 redirects
[glibc.git] / locale / programs / charmap.c
blobc23e50944f50e15fdf6e8385883d4f37617b7d45
1 /* Copyright (C) 1996-2020 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <https://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <limits.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdint.h>
31 #include "localedef.h"
32 #include "linereader.h"
33 #include "charmap.h"
34 #include "charmap-dir.h"
36 #include <assert.h>
39 /* Define the lookup function. */
40 #include "charmap-kw.h"
/* Forward declarations of the helpers that are private to this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the ASCII compatibility test of the
   character map fails.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
62 static struct linereader *
63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
65 FILE *fp;
67 fp = charmap_open (directory, name);
68 if (fp == NULL)
69 return NULL;
70 else
72 size_t dlen = strlen (directory);
73 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
74 size_t nlen = strlen (name);
75 char *pathname;
76 char *p;
78 pathname = alloca (dlen + add_slash + nlen + 1);
79 p = stpcpy (pathname, directory);
80 if (add_slash)
81 *p++ = '/';
82 stpcpy (p, name);
84 return lr_create (fp, pathname, hf);
88 struct charmap_t *
89 charmap_read (const char *filename, int verbose, int error_not_found,
90 int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
112 char *next;
113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
121 if (cmfile == NULL)
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
127 if (cmfile == NULL)
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
133 if (cmfile != NULL)
134 result = parse_charmap (cmfile, verbose, be_quiet);
136 if (result == NULL && error_not_found)
137 record_error (0, errno,
138 _("character map file `%s' not found"),
139 filename);
142 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
144 /* OK, one more try. We also accept the names given to the
145 character sets in the files. Sometimes they differ from the
146 file name. */
147 CHARMAP_DIR *dir;
149 dir = charmap_opendir (CHARMAP_PATH);
150 if (dir != NULL)
152 const char *dirent;
154 while ((dirent = charmap_readdir (dir)) != NULL)
156 char **aliases;
157 char **p;
158 int found;
160 aliases = charmap_aliases (CHARMAP_PATH, dirent);
161 found = 0;
162 for (p = aliases; *p; p++)
163 if (strcasecmp (*p, filename) == 0)
165 found = 1;
166 break;
168 charmap_free_aliases (aliases);
170 if (found)
172 struct linereader *cmfile;
174 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175 if (cmfile != NULL)
176 result = parse_charmap (cmfile, verbose, be_quiet);
178 break;
182 charmap_closedir (dir);
186 if (result == NULL && DEFAULT_CHARMAP != NULL)
188 struct linereader *cmfile;
190 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191 if (cmfile != NULL)
192 result = parse_charmap (cmfile, verbose, be_quiet);
194 if (result == NULL)
195 record_error (4, errno,
196 _("default character map file `%s' not found"),
197 DEFAULT_CHARMAP);
200 if (result != NULL && result->code_set_name == NULL)
201 /* The input file does not specify a code set name. This
202 shouldn't happen but we should cope with it. */
203 result->code_set_name = basename (filename);
205 /* Test of ASCII compatibility of locale encoding.
207 Verify that the encoding to be used in a locale is ASCII compatible,
208 at least for the graphic characters, excluding the control characters,
209 '$' and '@'. This constraint comes from an ISO C 99 restriction.
211 ISO C 99 section 7.17.(2) (about wchar_t):
212 the null character shall have the code value zero and each member of
213 the basic character set shall have a code value equal to its value
214 when used as the lone character in an integer character constant.
215 ISO C 99 section 5.2.1.(3):
216 Both the basic source and basic execution character sets shall have
217 the following members: the 26 uppercase letters of the Latin alphabet
218 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219 the 26 lowercase letters of the Latin alphabet
220 a b c d e f g h i j k l m n o p q r s t u v w x y z
221 the 10 decimal digits
222 0 1 2 3 4 5 6 7 8 9
223 the following 29 graphic characters
224 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
225 the space character, and control characters representing horizontal
226 tab, vertical tab, and form feed.
228 Therefore, for all members of the "basic character set", the 'char' code
229 must have the same value as the 'wchar_t' code, which in glibc is the
230 same as the Unicode code, which for all of the enumerated characters
231 is identical to the ASCII code. */
232 if (result != NULL && use_default)
234 static const char basic_charset[] =
236 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
237 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
238 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
239 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
240 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
241 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
242 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
243 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
245 int failed = 0;
246 const char *p = basic_charset;
250 struct charseq *seq = charmap_find_symbol (result, p, 1);
252 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
253 failed = 1;
255 while (*p++ != '\0');
257 if (failed)
259 /* A user may disable the ASCII compatibility warning check,
260 but we must remember that the encoding is not ASCII
261 compatible, since it may have other implications. Later
262 we will set _NL_CTYPE_MAP_TO_NONASCII from this value. */
263 if (warn_ascii)
264 record_warning (_(
265 "character map `%s' is not ASCII compatible, locale not ISO C compliant "
266 "[--no-warnings=ascii]"),
267 result->code_set_name);
268 enc_not_ascii_compatible = true;
272 return result;
276 static struct charmap_t *
277 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
279 struct charmap_t *result;
280 int state;
281 enum token_t expected_tok = tok_error;
282 const char *expected_str = NULL;
283 char *from_name = NULL;
284 char *to_name = NULL;
285 enum token_t ellipsis = 0;
286 int step = 1;
288 /* We don't want symbolic names in string to be translated. */
289 cmfile->translate_strings = 0;
291 /* Allocate room for result. */
292 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
293 memset (result, '\0', sizeof (struct charmap_t));
294 /* The default DEFAULT_WIDTH is 1. */
295 result->width_default = 1;
297 #define obstack_chunk_alloc malloc
298 #define obstack_chunk_free free
299 obstack_init (&result->mem_pool);
301 if (init_hash (&result->char_table, 256)
302 || init_hash (&result->byte_table, 256))
304 free (result);
305 return NULL;
308 /* We use a state machine to describe the charmap description file
309 format. */
310 state = 1;
311 while (1)
313 /* What's on? */
314 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
315 enum token_t nowtok = now->tok;
316 struct token *arg;
318 if (nowtok == tok_eof)
319 break;
321 switch (state)
323 case 1:
324 /* The beginning. We expect the special declarations, EOL or
325 `CHARMAP'. */
326 if (nowtok == tok_eol)
327 /* Ignore empty lines. */
328 continue;
330 if (nowtok == tok_charmap)
332 from_name = NULL;
333 to_name = NULL;
335 /* We have to set up the real work. Fill in some
336 default values. */
337 if (result->mb_cur_max == 0)
338 result->mb_cur_max = 1;
339 if (result->mb_cur_min == 0)
340 result->mb_cur_min = result->mb_cur_max;
341 if (result->mb_cur_min > result->mb_cur_max)
343 record_error (0, 0, _("\
344 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
345 cmfile->fname);
347 result->mb_cur_min = result->mb_cur_max;
350 lr_ignore_rest (cmfile, 1);
352 state = 2;
353 continue;
356 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
357 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
358 && nowtok != tok_comment_char && nowtok != tok_g0esc
359 && nowtok != tok_g1esc && nowtok != tok_g2esc
360 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
361 && nowtok != tok_include)
363 lr_error (cmfile, _("syntax error in prolog: %s"),
364 _("invalid definition"));
366 lr_ignore_rest (cmfile, 0);
367 continue;
370 /* We know that we need an argument. */
371 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
373 switch (nowtok)
375 case tok_code_set_name:
376 case tok_repertoiremap:
377 if (arg->tok != tok_ident && arg->tok != tok_string)
379 badarg:
380 lr_error (cmfile, _("syntax error in prolog: %s"),
381 _("bad argument"));
383 lr_ignore_rest (cmfile, 0);
384 continue;
387 if (nowtok == tok_code_set_name)
388 result->code_set_name = obstack_copy0 (&result->mem_pool,
389 arg->val.str.startmb,
390 arg->val.str.lenmb);
391 else
392 result->repertoiremap = obstack_copy0 (&result->mem_pool,
393 arg->val.str.startmb,
394 arg->val.str.lenmb);
396 lr_ignore_rest (cmfile, 1);
397 continue;
399 case tok_mb_cur_max:
400 case tok_mb_cur_min:
401 if (arg->tok != tok_number)
402 goto badarg;
404 if ((nowtok == tok_mb_cur_max
405 && result->mb_cur_max != 0)
406 || (nowtok == tok_mb_cur_max
407 && result->mb_cur_max != 0))
408 lr_error (cmfile, _("duplicate definition of <%s>"),
409 nowtok == tok_mb_cur_min
410 ? "mb_cur_min" : "mb_cur_max");
412 if (arg->val.num < 1)
414 lr_error (cmfile,
415 _("value for <%s> must be 1 or greater"),
416 nowtok == tok_mb_cur_min
417 ? "mb_cur_min" : "mb_cur_max");
419 lr_ignore_rest (cmfile, 0);
420 continue;
422 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
423 && (int) arg->val.num < result->mb_cur_min)
424 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
425 && (int) arg->val.num > result->mb_cur_max))
427 lr_error (cmfile, _("\
428 value of <%s> must be greater or equal than the value of <%s>"),
429 "mb_cur_max", "mb_cur_min");
431 lr_ignore_rest (cmfile, 0);
432 continue;
435 if (nowtok == tok_mb_cur_max)
436 result->mb_cur_max = arg->val.num;
437 else
438 result->mb_cur_min = arg->val.num;
440 lr_ignore_rest (cmfile, 1);
441 continue;
443 case tok_escape_char:
444 case tok_comment_char:
445 if (arg->tok != tok_ident)
446 goto badarg;
448 if (arg->val.str.lenmb != 1)
450 lr_error (cmfile, _("\
451 argument to <%s> must be a single character"),
452 nowtok == tok_escape_char ? "escape_char"
453 : "comment_char");
455 lr_ignore_rest (cmfile, 0);
456 continue;
459 if (nowtok == tok_escape_char)
460 cmfile->escape_char = *arg->val.str.startmb;
461 else
462 cmfile->comment_char = *arg->val.str.startmb;
464 lr_ignore_rest (cmfile, 1);
465 continue;
467 case tok_g0esc:
468 case tok_g1esc:
469 case tok_g2esc:
470 case tok_g3esc:
471 case tok_escseq:
472 lr_ignore_rest (cmfile, 0); /* XXX */
473 continue;
475 case tok_include:
476 lr_error (cmfile, _("\
477 character sets with locking states are not supported"));
478 exit (4);
480 default:
481 /* Cannot happen. */
482 assert (! "Should not happen");
484 break;
486 case 2:
487 /* We have seen `CHARMAP' and now are in the body. Each line
488 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
489 if (nowtok == tok_eol)
490 /* Ignore empty lines. */
491 continue;
493 if (nowtok == tok_end)
495 expected_tok = tok_charmap;
496 expected_str = "CHARMAP";
497 state = 90;
498 continue;
501 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
503 lr_error (cmfile, _("syntax error in %s definition: %s"),
504 "CHARMAP", _("no symbolic name given"));
506 lr_ignore_rest (cmfile, 0);
507 continue;
510 /* If the previous line was not completely correct free the
511 used memory. */
512 if (from_name != NULL)
513 obstack_free (&result->mem_pool, from_name);
515 if (nowtok == tok_bsymbol)
516 from_name = (char *) obstack_copy0 (&result->mem_pool,
517 now->val.str.startmb,
518 now->val.str.lenmb);
519 else
521 obstack_printf (&result->mem_pool, "U%08X",
522 cmfile->token.val.ucs4);
523 obstack_1grow (&result->mem_pool, '\0');
524 from_name = (char *) obstack_finish (&result->mem_pool);
526 to_name = NULL;
528 state = 3;
529 continue;
531 case 3:
532 /* We have two possibilities: We can see an ellipsis or an
533 encoding value. */
534 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
535 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
536 || nowtok == tok_ellipsis2_2)
538 ellipsis = nowtok;
539 if (nowtok == tok_ellipsis4_2)
541 step = 2;
542 nowtok = tok_ellipsis4;
544 else if (nowtok == tok_ellipsis2_2)
546 step = 2;
547 nowtok = tok_ellipsis2;
549 state = 4;
550 continue;
552 /* FALLTHROUGH */
554 case 5:
555 if (nowtok != tok_charcode)
557 lr_error (cmfile, _("syntax error in %s definition: %s"),
558 "CHARMAP", _("invalid encoding given"));
560 lr_ignore_rest (cmfile, 0);
562 state = 2;
563 continue;
566 if (now->val.charcode.nbytes < result->mb_cur_min)
567 lr_error (cmfile, _("too few bytes in character encoding"));
568 else if (now->val.charcode.nbytes > result->mb_cur_max)
569 lr_error (cmfile, _("too many bytes in character encoding"));
570 else
571 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
572 now->val.charcode.bytes, from_name, to_name,
573 ellipsis != tok_ellipsis2, step);
575 /* Ignore trailing comment silently. */
576 lr_ignore_rest (cmfile, 0);
578 from_name = NULL;
579 to_name = NULL;
580 ellipsis = tok_none;
581 step = 1;
583 state = 2;
584 continue;
586 case 4:
587 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
589 lr_error (cmfile, _("syntax error in %s definition: %s"),
590 "CHARMAP",
591 _("no symbolic name given for end of range"));
593 lr_ignore_rest (cmfile, 0);
594 continue;
597 /* Copy the to-name in a safe place. */
598 if (nowtok == tok_bsymbol)
599 to_name = (char *) obstack_copy0 (&result->mem_pool,
600 cmfile->token.val.str.startmb,
601 cmfile->token.val.str.lenmb);
602 else
604 obstack_printf (&result->mem_pool, "U%08X",
605 cmfile->token.val.ucs4);
606 obstack_1grow (&result->mem_pool, '\0');
607 to_name = (char *) obstack_finish (&result->mem_pool);
610 state = 5;
611 continue;
613 case 90:
614 if (nowtok != expected_tok)
615 lr_error (cmfile, _("\
616 %1$s: definition does not end with `END %1$s'"), expected_str);
618 lr_ignore_rest (cmfile, nowtok == expected_tok);
619 state = 91;
620 continue;
622 case 91:
623 /* Waiting for WIDTH... */
624 if (nowtok == tok_eol)
625 /* Ignore empty lines. */
626 continue;
628 if (nowtok == tok_width_default)
630 state = 92;
631 continue;
634 if (nowtok == tok_width)
636 lr_ignore_rest (cmfile, 1);
637 state = 93;
638 continue;
641 if (nowtok == tok_width_variable)
643 lr_ignore_rest (cmfile, 1);
644 state = 98;
645 continue;
648 lr_error (cmfile, _("\
649 only WIDTH definitions are allowed to follow the CHARMAP definition"));
651 lr_ignore_rest (cmfile, 0);
652 continue;
654 case 92:
655 if (nowtok != tok_number)
656 lr_error (cmfile, _("value for %s must be an integer"),
657 "WIDTH_DEFAULT");
658 else
659 result->width_default = now->val.num;
661 lr_ignore_rest (cmfile, nowtok == tok_number);
663 state = 91;
664 continue;
666 case 93:
667 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
668 "%s...%s %d\n". */
669 if (nowtok == tok_eol)
670 /* ignore empty lines. */
671 continue;
673 if (nowtok == tok_end)
675 expected_tok = tok_width;
676 expected_str = "WIDTH";
677 state = 90;
678 continue;
681 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
683 lr_error (cmfile, _("syntax error in %s definition: %s"),
684 "WIDTH", _("no symbolic name given"));
686 lr_ignore_rest (cmfile, 0);
687 continue;
690 if (from_name != NULL)
691 obstack_free (&result->mem_pool, from_name);
693 if (nowtok == tok_bsymbol)
694 from_name = (char *) obstack_copy0 (&result->mem_pool,
695 now->val.str.startmb,
696 now->val.str.lenmb);
697 else
699 obstack_printf (&result->mem_pool, "U%08X",
700 cmfile->token.val.ucs4);
701 obstack_1grow (&result->mem_pool, '\0');
702 from_name = (char *) obstack_finish (&result->mem_pool);
705 to_name = NULL;
707 state = 94;
708 continue;
710 case 94:
711 if (nowtok == tok_ellipsis3)
713 state = 95;
714 continue;
716 /* Fall through. */
718 case 96:
719 if (nowtok != tok_number)
720 lr_error (cmfile, _("value for %s must be an integer"),
721 "WIDTH");
722 else
724 /* Store width for chars. */
725 new_width (cmfile, result, from_name, to_name, now->val.num);
727 from_name = NULL;
728 to_name = NULL;
731 lr_ignore_rest (cmfile, nowtok == tok_number);
733 state = 93;
734 continue;
736 case 95:
737 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
739 lr_error (cmfile, _("syntax error in %s definition: %s"),
740 "WIDTH", _("no symbolic name given for end of range"));
742 lr_ignore_rest (cmfile, 0);
744 state = 93;
745 continue;
748 if (nowtok == tok_bsymbol)
749 to_name = (char *) obstack_copy0 (&result->mem_pool,
750 now->val.str.startmb,
751 now->val.str.lenmb);
752 else
754 obstack_printf (&result->mem_pool, "U%08X",
755 cmfile->token.val.ucs4);
756 obstack_1grow (&result->mem_pool, '\0');
757 to_name = (char *) obstack_finish (&result->mem_pool);
760 state = 96;
761 continue;
763 case 98:
764 /* We now expect `END WIDTH_VARIABLE' or lines of the format
765 "%s\n" or "%s...%s\n". */
766 if (nowtok == tok_eol)
767 /* ignore empty lines. */
768 continue;
770 if (nowtok == tok_end)
772 expected_tok = tok_width_variable;
773 expected_str = "WIDTH_VARIABLE";
774 state = 90;
775 continue;
778 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
780 lr_error (cmfile, _("syntax error in %s definition: %s"),
781 "WIDTH_VARIABLE", _("no symbolic name given"));
783 lr_ignore_rest (cmfile, 0);
785 continue;
788 if (from_name != NULL)
789 obstack_free (&result->mem_pool, from_name);
791 if (nowtok == tok_bsymbol)
792 from_name = (char *) obstack_copy0 (&result->mem_pool,
793 now->val.str.startmb,
794 now->val.str.lenmb);
795 else
797 obstack_printf (&result->mem_pool, "U%08X",
798 cmfile->token.val.ucs4);
799 obstack_1grow (&result->mem_pool, '\0');
800 from_name = (char *) obstack_finish (&result->mem_pool);
802 to_name = NULL;
804 state = 99;
805 continue;
807 case 99:
808 if (nowtok == tok_ellipsis3)
809 state = 100;
811 /* Store info. */
812 from_name = NULL;
814 /* Warn */
815 state = 98;
816 continue;
818 case 100:
819 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
821 lr_error (cmfile, _("syntax error in %s definition: %s"),
822 "WIDTH_VARIABLE",
823 _("no symbolic name given for end of range"));
824 lr_ignore_rest (cmfile, 0);
825 continue;
828 if (nowtok == tok_bsymbol)
829 to_name = (char *) obstack_copy0 (&result->mem_pool,
830 now->val.str.startmb,
831 now->val.str.lenmb);
832 else
834 obstack_printf (&result->mem_pool, "U%08X",
835 cmfile->token.val.ucs4);
836 obstack_1grow (&result->mem_pool, '\0');
837 to_name = (char *) obstack_finish (&result->mem_pool);
840 /* XXX Enter value into table. */
842 lr_ignore_rest (cmfile, 1);
844 state = 98;
845 continue;
847 default:
848 record_error (5, 0, _("%s: error in state machine"),
849 __FILE__);
850 /* NOTREACHED */
852 break;
855 if (state != 91)
856 record_error (0, 0, _("%s: premature end of file"),
857 cmfile->fname);
859 lr_close (cmfile);
861 return result;
865 static void
866 new_width (struct linereader *cmfile, struct charmap_t *result,
867 const char *from, const char *to, unsigned long int width)
869 struct charseq *from_val;
870 struct charseq *to_val;
872 from_val = charmap_find_value (result, from, strlen (from));
873 if (from_val == NULL)
875 lr_error (cmfile, _("unknown character `%s'"), from);
876 return;
879 if (to == NULL)
880 to_val = from_val;
881 else
883 to_val = charmap_find_value (result, to, strlen (to));
884 if (to_val == NULL)
886 lr_error (cmfile, _("unknown character `%s'"), to);
887 return;
890 /* Make sure the number of bytes for the end points of the range
891 is correct. */
892 if (from_val->nbytes != to_val->nbytes)
894 lr_error (cmfile, _("\
895 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
896 from_val->nbytes, to_val->nbytes);
897 return;
901 if (result->nwidth_rules >= result->nwidth_rules_max)
903 size_t new_size = result->nwidth_rules + 32;
904 struct width_rule *new_rules =
905 (struct width_rule *) obstack_alloc (&result->mem_pool,
906 (new_size
907 * sizeof (struct width_rule)));
909 memcpy (new_rules, result->width_rules,
910 result->nwidth_rules_max * sizeof (struct width_rule));
912 result->width_rules = new_rules;
913 result->nwidth_rules_max = new_size;
916 result->width_rules[result->nwidth_rules].from = from_val;
917 result->width_rules[result->nwidth_rules].to = to_val;
918 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
919 ++result->nwidth_rules;
923 struct charseq *
924 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
926 void *result;
928 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
929 < 0 ? NULL : (struct charseq *) result);
933 static void
934 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
935 size_t nbytes, unsigned char *bytes,
936 const char *from, const char *to,
937 int decimal_ellipsis, int step)
939 hash_table *ht = &cm->char_table;
940 hash_table *bt = &cm->byte_table;
941 struct obstack *ob = &cm->mem_pool;
942 char *from_end;
943 char *to_end;
944 const char *cp;
945 int prefix_len, len1, len2;
946 unsigned int from_nr, to_nr, cnt;
947 struct charseq *newp;
949 len1 = strlen (from);
951 if (to == NULL)
953 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
954 newp->nbytes = nbytes;
955 memcpy (newp->bytes, bytes, nbytes);
956 newp->name = from;
958 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
959 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
961 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
962 xxxx and xxxxxxxx are hexadecimal numbers. In this case
963 we use the value of xxxx or xxxxxxxx as the UCS4 value of
964 this character and we don't have to consult the repertoire
965 map.
967 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
968 and xxxxxxxx also give the code point in UCS4 but this must
969 be in the private, i.e., unassigned, area. This should be
970 used for characters which do not (yet) have an equivalent
971 in ISO 10646 and Unicode. */
972 char *endp;
974 errno = 0;
975 newp->ucs4 = strtoul (from + 1, &endp, 16);
976 if (endp - from != len1
977 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
978 || newp->ucs4 >= 0x80000000)
979 /* This wasn't successful. Signal this name cannot be a
980 correct UCS value. */
981 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
984 insert_entry (ht, from, len1, newp);
985 insert_entry (bt, newp->bytes, nbytes, newp);
986 /* Please note that it isn't a bug if a symbol is defined more
987 than once. All later definitions are simply discarded. */
988 return;
991 /* We have a range: the names must have names with equal prefixes
992 and an equal number of digits, where the second number is greater
993 or equal than the first. */
994 len2 = strlen (to);
996 if (len1 != len2)
998 illegal_range:
999 lr_error (lr, _("invalid names for character range"));
1000 return;
1003 cp = &from[len1 - 1];
1004 if (decimal_ellipsis)
1005 while (isdigit (*cp) && cp >= from)
1006 --cp;
1007 else
1008 while (isxdigit (*cp) && cp >= from)
1010 if (!isdigit (*cp) && !isupper (*cp))
1011 lr_error (lr, _("\
1012 hexadecimal range format should use only capital characters"));
1013 --cp;
1016 prefix_len = (cp - from) + 1;
1018 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1019 goto illegal_range;
1021 errno = 0;
1022 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1023 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1024 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1025 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1026 && errno == ERANGE)
1027 || *to_end != '\0')
1029 lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1030 return;
1033 if (from_nr > to_nr)
1035 lr_error (lr, _("upper limit in range is smaller than lower limit"));
1036 return;
1039 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1041 char *name_end;
1042 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1043 prefix_len, from, len1 - prefix_len, cnt);
1044 obstack_1grow (ob, '\0');
1045 name_end = obstack_finish (ob);
1047 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1048 newp->nbytes = nbytes;
1049 memcpy (newp->bytes, bytes, nbytes);
1050 newp->name = name_end;
1052 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1053 if ((name_end[0] == 'U' || name_end[0] == 'P')
1054 && (len1 == 5 || len1 == 9))
1056 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1057 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1058 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1059 this character and we don't have to consult the repertoire
1060 map.
1062 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1063 and xxxxxxxx also give the code point in UCS4 but this must
1064 be in the private, i.e., unassigned, area. This should be
1065 used for characters which do not (yet) have an equivalent
1066 in ISO 10646 and Unicode. */
1067 char *endp;
1069 errno = 0;
1070 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1071 if (endp - name_end != len1
1072 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1073 || newp->ucs4 >= 0x80000000)
1074 /* This wasn't successful. Signal this name cannot be a
1075 correct UCS value. */
1076 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1079 insert_entry (ht, name_end, len1, newp);
1080 insert_entry (bt, newp->bytes, nbytes, newp);
1081 /* Please note we don't examine the return value since it is no error
1082 if we have two definitions for a symbol. */
1084 /* Increment the value in the byte sequence. */
1085 if (++bytes[nbytes - 1] == '\0')
1087 int b = nbytes - 2;
1090 if (b < 0)
1092 lr_error (lr,
1093 _("resulting bytes for range not representable."));
1094 return;
1096 while (++bytes[b--] == 0);
1102 struct charseq *
1103 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1104 size_t nbytes)
1106 void *result;
1108 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1109 < 0 ? NULL : (struct charseq *) result);