Add new locale shn_MM [BZ #13605]
[glibc.git] / locale / programs / charmap.c
bloba670db95326560a4a6cd33b28038e96967196a83
1 /* Copyright (C) 1996-2017 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <limits.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdint.h>
31 #include "localedef.h"
32 #include "linereader.h"
33 #include "charmap.h"
34 #include "charmap-dir.h"
36 #include <assert.h>
39 /* Define the lookup function. */
40 #include "charmap-kw.h"
/* Forward declarations of the helpers that are private to this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);

static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);

static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the ASCII compatibility test of the
   loaded character map fails.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
62 static struct linereader *
63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
65 FILE *fp;
67 fp = charmap_open (directory, name);
68 if (fp == NULL)
69 return NULL;
70 else
72 size_t dlen = strlen (directory);
73 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
74 size_t nlen = strlen (name);
75 char *pathname;
76 char *p;
78 pathname = alloca (dlen + add_slash + nlen + 1);
79 p = stpcpy (pathname, directory);
80 if (add_slash)
81 *p++ = '/';
82 stpcpy (p, name);
84 return lr_create (fp, pathname, hf);
88 struct charmap_t *
89 charmap_read (const char *filename, int verbose, int error_not_found,
90 int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
112 char *next;
113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
121 if (cmfile == NULL)
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
127 if (cmfile == NULL)
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
133 if (cmfile != NULL)
134 result = parse_charmap (cmfile, verbose, be_quiet);
136 if (result == NULL && error_not_found)
137 record_error (0, errno,
138 _("character map file `%s' not found"),
139 filename);
142 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
144 /* OK, one more try. We also accept the names given to the
145 character sets in the files. Sometimes they differ from the
146 file name. */
147 CHARMAP_DIR *dir;
149 dir = charmap_opendir (CHARMAP_PATH);
150 if (dir != NULL)
152 const char *dirent;
154 while ((dirent = charmap_readdir (dir)) != NULL)
156 char **aliases;
157 char **p;
158 int found;
160 aliases = charmap_aliases (CHARMAP_PATH, dirent);
161 found = 0;
162 for (p = aliases; *p; p++)
163 if (strcasecmp (*p, filename) == 0)
165 found = 1;
166 break;
168 charmap_free_aliases (aliases);
170 if (found)
172 struct linereader *cmfile;
174 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175 if (cmfile != NULL)
176 result = parse_charmap (cmfile, verbose, be_quiet);
178 break;
182 charmap_closedir (dir);
186 if (result == NULL && DEFAULT_CHARMAP != NULL)
188 struct linereader *cmfile;
190 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191 if (cmfile != NULL)
192 result = parse_charmap (cmfile, verbose, be_quiet);
194 if (result == NULL)
195 record_error (4, errno,
196 _("default character map file `%s' not found"),
197 DEFAULT_CHARMAP);
200 if (result != NULL && result->code_set_name == NULL)
201 /* The input file does not specify a code set name. This
202 shouldn't happen but we should cope with it. */
203 result->code_set_name = basename (filename);
205 /* Test of ASCII compatibility of locale encoding.
207 Verify that the encoding to be used in a locale is ASCII compatible,
208 at least for the graphic characters, excluding the control characters,
209 '$' and '@'. This constraint comes from an ISO C 99 restriction.
211 ISO C 99 section 7.17.(2) (about wchar_t):
212 the null character shall have the code value zero and each member of
213 the basic character set shall have a code value equal to its value
214 when used as the lone character in an integer character constant.
215 ISO C 99 section 5.2.1.(3):
216 Both the basic source and basic execution character sets shall have
217 the following members: the 26 uppercase letters of the Latin alphabet
218 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219 the 26 lowercase letters of the Latin alphabet
220 a b c d e f g h i j k l m n o p q r s t u v w x y z
221 the 10 decimal digits
222 0 1 2 3 4 5 6 7 8 9
223 the following 29 graphic characters
224 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
225 the space character, and control characters representing horizontal
226 tab, vertical tab, and form feed.
228 Therefore, for all members of the "basic character set", the 'char' code
229 must have the same value as the 'wchar_t' code, which in glibc is the
230 same as the Unicode code, which for all of the enumerated characters
231 is identical to the ASCII code. */
232 if (result != NULL && use_default)
234 static const char basic_charset[] =
236 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
237 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
238 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
239 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
240 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
241 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
242 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
243 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
245 int failed = 0;
246 const char *p = basic_charset;
250 struct charseq *seq = charmap_find_symbol (result, p, 1);
252 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
253 failed = 1;
255 while (*p++ != '\0');
257 if (failed)
259 record_warning (_("\
260 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
261 result->code_set_name);
262 enc_not_ascii_compatible = true;
266 return result;
270 static struct charmap_t *
271 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
273 struct charmap_t *result;
274 int state;
275 enum token_t expected_tok = tok_error;
276 const char *expected_str = NULL;
277 char *from_name = NULL;
278 char *to_name = NULL;
279 enum token_t ellipsis = 0;
280 int step = 1;
282 /* We don't want symbolic names in string to be translated. */
283 cmfile->translate_strings = 0;
285 /* Allocate room for result. */
286 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
287 memset (result, '\0', sizeof (struct charmap_t));
288 /* The default DEFAULT_WIDTH is 1. */
289 result->width_default = 1;
291 #define obstack_chunk_alloc malloc
292 #define obstack_chunk_free free
293 obstack_init (&result->mem_pool);
295 if (init_hash (&result->char_table, 256)
296 || init_hash (&result->byte_table, 256))
298 free (result);
299 return NULL;
302 /* We use a state machine to describe the charmap description file
303 format. */
304 state = 1;
305 while (1)
307 /* What's on? */
308 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
309 enum token_t nowtok = now->tok;
310 struct token *arg;
312 if (nowtok == tok_eof)
313 break;
315 switch (state)
317 case 1:
318 /* The beginning. We expect the special declarations, EOL or
319 `CHARMAP'. */
320 if (nowtok == tok_eol)
321 /* Ignore empty lines. */
322 continue;
324 if (nowtok == tok_charmap)
326 from_name = NULL;
327 to_name = NULL;
329 /* We have to set up the real work. Fill in some
330 default values. */
331 if (result->mb_cur_max == 0)
332 result->mb_cur_max = 1;
333 if (result->mb_cur_min == 0)
334 result->mb_cur_min = result->mb_cur_max;
335 if (result->mb_cur_min > result->mb_cur_max)
337 record_error (0, 0, _("\
338 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
339 cmfile->fname);
341 result->mb_cur_min = result->mb_cur_max;
344 lr_ignore_rest (cmfile, 1);
346 state = 2;
347 continue;
350 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
351 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
352 && nowtok != tok_comment_char && nowtok != tok_g0esc
353 && nowtok != tok_g1esc && nowtok != tok_g2esc
354 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
355 && nowtok != tok_include)
357 lr_error (cmfile, _("syntax error in prolog: %s"),
358 _("invalid definition"));
360 lr_ignore_rest (cmfile, 0);
361 continue;
364 /* We know that we need an argument. */
365 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
367 switch (nowtok)
369 case tok_code_set_name:
370 case tok_repertoiremap:
371 if (arg->tok != tok_ident && arg->tok != tok_string)
373 badarg:
374 lr_error (cmfile, _("syntax error in prolog: %s"),
375 _("bad argument"));
377 lr_ignore_rest (cmfile, 0);
378 continue;
381 if (nowtok == tok_code_set_name)
382 result->code_set_name = obstack_copy0 (&result->mem_pool,
383 arg->val.str.startmb,
384 arg->val.str.lenmb);
385 else
386 result->repertoiremap = obstack_copy0 (&result->mem_pool,
387 arg->val.str.startmb,
388 arg->val.str.lenmb);
390 lr_ignore_rest (cmfile, 1);
391 continue;
393 case tok_mb_cur_max:
394 case tok_mb_cur_min:
395 if (arg->tok != tok_number)
396 goto badarg;
398 if ((nowtok == tok_mb_cur_max
399 && result->mb_cur_max != 0)
400 || (nowtok == tok_mb_cur_max
401 && result->mb_cur_max != 0))
402 lr_error (cmfile, _("duplicate definition of <%s>"),
403 nowtok == tok_mb_cur_min
404 ? "mb_cur_min" : "mb_cur_max");
406 if (arg->val.num < 1)
408 lr_error (cmfile,
409 _("value for <%s> must be 1 or greater"),
410 nowtok == tok_mb_cur_min
411 ? "mb_cur_min" : "mb_cur_max");
413 lr_ignore_rest (cmfile, 0);
414 continue;
416 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
417 && (int) arg->val.num < result->mb_cur_min)
418 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
419 && (int) arg->val.num > result->mb_cur_max))
421 lr_error (cmfile, _("\
422 value of <%s> must be greater or equal than the value of <%s>"),
423 "mb_cur_max", "mb_cur_min");
425 lr_ignore_rest (cmfile, 0);
426 continue;
429 if (nowtok == tok_mb_cur_max)
430 result->mb_cur_max = arg->val.num;
431 else
432 result->mb_cur_min = arg->val.num;
434 lr_ignore_rest (cmfile, 1);
435 continue;
437 case tok_escape_char:
438 case tok_comment_char:
439 if (arg->tok != tok_ident)
440 goto badarg;
442 if (arg->val.str.lenmb != 1)
444 lr_error (cmfile, _("\
445 argument to <%s> must be a single character"),
446 nowtok == tok_escape_char ? "escape_char"
447 : "comment_char");
449 lr_ignore_rest (cmfile, 0);
450 continue;
453 if (nowtok == tok_escape_char)
454 cmfile->escape_char = *arg->val.str.startmb;
455 else
456 cmfile->comment_char = *arg->val.str.startmb;
458 lr_ignore_rest (cmfile, 1);
459 continue;
461 case tok_g0esc:
462 case tok_g1esc:
463 case tok_g2esc:
464 case tok_g3esc:
465 case tok_escseq:
466 lr_ignore_rest (cmfile, 0); /* XXX */
467 continue;
469 case tok_include:
470 lr_error (cmfile, _("\
471 character sets with locking states are not supported"));
472 exit (4);
474 default:
475 /* Cannot happen. */
476 assert (! "Should not happen");
478 break;
480 case 2:
481 /* We have seen `CHARMAP' and now are in the body. Each line
482 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
483 if (nowtok == tok_eol)
484 /* Ignore empty lines. */
485 continue;
487 if (nowtok == tok_end)
489 expected_tok = tok_charmap;
490 expected_str = "CHARMAP";
491 state = 90;
492 continue;
495 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
497 lr_error (cmfile, _("syntax error in %s definition: %s"),
498 "CHARMAP", _("no symbolic name given"));
500 lr_ignore_rest (cmfile, 0);
501 continue;
504 /* If the previous line was not completely correct free the
505 used memory. */
506 if (from_name != NULL)
507 obstack_free (&result->mem_pool, from_name);
509 if (nowtok == tok_bsymbol)
510 from_name = (char *) obstack_copy0 (&result->mem_pool,
511 now->val.str.startmb,
512 now->val.str.lenmb);
513 else
515 obstack_printf (&result->mem_pool, "U%08X",
516 cmfile->token.val.ucs4);
517 obstack_1grow (&result->mem_pool, '\0');
518 from_name = (char *) obstack_finish (&result->mem_pool);
520 to_name = NULL;
522 state = 3;
523 continue;
525 case 3:
526 /* We have two possibilities: We can see an ellipsis or an
527 encoding value. */
528 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
529 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
530 || nowtok == tok_ellipsis2_2)
532 ellipsis = nowtok;
533 if (nowtok == tok_ellipsis4_2)
535 step = 2;
536 nowtok = tok_ellipsis4;
538 else if (nowtok == tok_ellipsis2_2)
540 step = 2;
541 nowtok = tok_ellipsis2;
543 state = 4;
544 continue;
546 /* FALLTHROUGH */
548 case 5:
549 if (nowtok != tok_charcode)
551 lr_error (cmfile, _("syntax error in %s definition: %s"),
552 "CHARMAP", _("invalid encoding given"));
554 lr_ignore_rest (cmfile, 0);
556 state = 2;
557 continue;
560 if (now->val.charcode.nbytes < result->mb_cur_min)
561 lr_error (cmfile, _("too few bytes in character encoding"));
562 else if (now->val.charcode.nbytes > result->mb_cur_max)
563 lr_error (cmfile, _("too many bytes in character encoding"));
564 else
565 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
566 now->val.charcode.bytes, from_name, to_name,
567 ellipsis != tok_ellipsis2, step);
569 /* Ignore trailing comment silently. */
570 lr_ignore_rest (cmfile, 0);
572 from_name = NULL;
573 to_name = NULL;
574 ellipsis = tok_none;
575 step = 1;
577 state = 2;
578 continue;
580 case 4:
581 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
583 lr_error (cmfile, _("syntax error in %s definition: %s"),
584 "CHARMAP",
585 _("no symbolic name given for end of range"));
587 lr_ignore_rest (cmfile, 0);
588 continue;
591 /* Copy the to-name in a safe place. */
592 if (nowtok == tok_bsymbol)
593 to_name = (char *) obstack_copy0 (&result->mem_pool,
594 cmfile->token.val.str.startmb,
595 cmfile->token.val.str.lenmb);
596 else
598 obstack_printf (&result->mem_pool, "U%08X",
599 cmfile->token.val.ucs4);
600 obstack_1grow (&result->mem_pool, '\0');
601 to_name = (char *) obstack_finish (&result->mem_pool);
604 state = 5;
605 continue;
607 case 90:
608 if (nowtok != expected_tok)
609 lr_error (cmfile, _("\
610 %1$s: definition does not end with `END %1$s'"), expected_str);
612 lr_ignore_rest (cmfile, nowtok == expected_tok);
613 state = 91;
614 continue;
616 case 91:
617 /* Waiting for WIDTH... */
618 if (nowtok == tok_eol)
619 /* Ignore empty lines. */
620 continue;
622 if (nowtok == tok_width_default)
624 state = 92;
625 continue;
628 if (nowtok == tok_width)
630 lr_ignore_rest (cmfile, 1);
631 state = 93;
632 continue;
635 if (nowtok == tok_width_variable)
637 lr_ignore_rest (cmfile, 1);
638 state = 98;
639 continue;
642 lr_error (cmfile, _("\
643 only WIDTH definitions are allowed to follow the CHARMAP definition"));
645 lr_ignore_rest (cmfile, 0);
646 continue;
648 case 92:
649 if (nowtok != tok_number)
650 lr_error (cmfile, _("value for %s must be an integer"),
651 "WIDTH_DEFAULT");
652 else
653 result->width_default = now->val.num;
655 lr_ignore_rest (cmfile, nowtok == tok_number);
657 state = 91;
658 continue;
660 case 93:
661 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
662 "%s...%s %d\n". */
663 if (nowtok == tok_eol)
664 /* ignore empty lines. */
665 continue;
667 if (nowtok == tok_end)
669 expected_tok = tok_width;
670 expected_str = "WIDTH";
671 state = 90;
672 continue;
675 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
677 lr_error (cmfile, _("syntax error in %s definition: %s"),
678 "WIDTH", _("no symbolic name given"));
680 lr_ignore_rest (cmfile, 0);
681 continue;
684 if (from_name != NULL)
685 obstack_free (&result->mem_pool, from_name);
687 if (nowtok == tok_bsymbol)
688 from_name = (char *) obstack_copy0 (&result->mem_pool,
689 now->val.str.startmb,
690 now->val.str.lenmb);
691 else
693 obstack_printf (&result->mem_pool, "U%08X",
694 cmfile->token.val.ucs4);
695 obstack_1grow (&result->mem_pool, '\0');
696 from_name = (char *) obstack_finish (&result->mem_pool);
699 to_name = NULL;
701 state = 94;
702 continue;
704 case 94:
705 if (nowtok == tok_ellipsis3)
707 state = 95;
708 continue;
711 case 96:
712 if (nowtok != tok_number)
713 lr_error (cmfile, _("value for %s must be an integer"),
714 "WIDTH");
715 else
717 /* Store width for chars. */
718 new_width (cmfile, result, from_name, to_name, now->val.num);
720 from_name = NULL;
721 to_name = NULL;
724 lr_ignore_rest (cmfile, nowtok == tok_number);
726 state = 93;
727 continue;
729 case 95:
730 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
732 lr_error (cmfile, _("syntax error in %s definition: %s"),
733 "WIDTH", _("no symbolic name given for end of range"));
735 lr_ignore_rest (cmfile, 0);
737 state = 93;
738 continue;
741 if (nowtok == tok_bsymbol)
742 to_name = (char *) obstack_copy0 (&result->mem_pool,
743 now->val.str.startmb,
744 now->val.str.lenmb);
745 else
747 obstack_printf (&result->mem_pool, "U%08X",
748 cmfile->token.val.ucs4);
749 obstack_1grow (&result->mem_pool, '\0');
750 to_name = (char *) obstack_finish (&result->mem_pool);
753 state = 96;
754 continue;
756 case 98:
757 /* We now expect `END WIDTH_VARIABLE' or lines of the format
758 "%s\n" or "%s...%s\n". */
759 if (nowtok == tok_eol)
760 /* ignore empty lines. */
761 continue;
763 if (nowtok == tok_end)
765 expected_tok = tok_width_variable;
766 expected_str = "WIDTH_VARIABLE";
767 state = 90;
768 continue;
771 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
773 lr_error (cmfile, _("syntax error in %s definition: %s"),
774 "WIDTH_VARIABLE", _("no symbolic name given"));
776 lr_ignore_rest (cmfile, 0);
778 continue;
781 if (from_name != NULL)
782 obstack_free (&result->mem_pool, from_name);
784 if (nowtok == tok_bsymbol)
785 from_name = (char *) obstack_copy0 (&result->mem_pool,
786 now->val.str.startmb,
787 now->val.str.lenmb);
788 else
790 obstack_printf (&result->mem_pool, "U%08X",
791 cmfile->token.val.ucs4);
792 obstack_1grow (&result->mem_pool, '\0');
793 from_name = (char *) obstack_finish (&result->mem_pool);
795 to_name = NULL;
797 state = 99;
798 continue;
800 case 99:
801 if (nowtok == tok_ellipsis3)
802 state = 100;
804 /* Store info. */
805 from_name = NULL;
807 /* Warn */
808 state = 98;
809 continue;
811 case 100:
812 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
814 lr_error (cmfile, _("syntax error in %s definition: %s"),
815 "WIDTH_VARIABLE",
816 _("no symbolic name given for end of range"));
817 lr_ignore_rest (cmfile, 0);
818 continue;
821 if (nowtok == tok_bsymbol)
822 to_name = (char *) obstack_copy0 (&result->mem_pool,
823 now->val.str.startmb,
824 now->val.str.lenmb);
825 else
827 obstack_printf (&result->mem_pool, "U%08X",
828 cmfile->token.val.ucs4);
829 obstack_1grow (&result->mem_pool, '\0');
830 to_name = (char *) obstack_finish (&result->mem_pool);
833 /* XXX Enter value into table. */
835 lr_ignore_rest (cmfile, 1);
837 state = 98;
838 continue;
840 default:
841 record_error (5, 0, _("%s: error in state machine"),
842 __FILE__);
843 /* NOTREACHED */
845 break;
848 if (state != 91)
849 record_error (0, 0, _("%s: premature end of file"),
850 cmfile->fname);
852 lr_close (cmfile);
854 return result;
858 static void
859 new_width (struct linereader *cmfile, struct charmap_t *result,
860 const char *from, const char *to, unsigned long int width)
862 struct charseq *from_val;
863 struct charseq *to_val;
865 from_val = charmap_find_value (result, from, strlen (from));
866 if (from_val == NULL)
868 lr_error (cmfile, _("unknown character `%s'"), from);
869 return;
872 if (to == NULL)
873 to_val = from_val;
874 else
876 to_val = charmap_find_value (result, to, strlen (to));
877 if (to_val == NULL)
879 lr_error (cmfile, _("unknown character `%s'"), to);
880 return;
883 /* Make sure the number of bytes for the end points of the range
884 is correct. */
885 if (from_val->nbytes != to_val->nbytes)
887 lr_error (cmfile, _("\
888 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
889 from_val->nbytes, to_val->nbytes);
890 return;
894 if (result->nwidth_rules >= result->nwidth_rules_max)
896 size_t new_size = result->nwidth_rules + 32;
897 struct width_rule *new_rules =
898 (struct width_rule *) obstack_alloc (&result->mem_pool,
899 (new_size
900 * sizeof (struct width_rule)));
902 memcpy (new_rules, result->width_rules,
903 result->nwidth_rules_max * sizeof (struct width_rule));
905 result->width_rules = new_rules;
906 result->nwidth_rules_max = new_size;
909 result->width_rules[result->nwidth_rules].from = from_val;
910 result->width_rules[result->nwidth_rules].to = to_val;
911 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
912 ++result->nwidth_rules;
916 struct charseq *
917 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
919 void *result;
921 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
922 < 0 ? NULL : (struct charseq *) result);
926 static void
927 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
928 size_t nbytes, unsigned char *bytes,
929 const char *from, const char *to,
930 int decimal_ellipsis, int step)
932 hash_table *ht = &cm->char_table;
933 hash_table *bt = &cm->byte_table;
934 struct obstack *ob = &cm->mem_pool;
935 char *from_end;
936 char *to_end;
937 const char *cp;
938 int prefix_len, len1, len2;
939 unsigned int from_nr, to_nr, cnt;
940 struct charseq *newp;
942 len1 = strlen (from);
944 if (to == NULL)
946 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
947 newp->nbytes = nbytes;
948 memcpy (newp->bytes, bytes, nbytes);
949 newp->name = from;
951 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
952 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
954 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
955 xxxx and xxxxxxxx are hexadecimal numbers. In this case
956 we use the value of xxxx or xxxxxxxx as the UCS4 value of
957 this character and we don't have to consult the repertoire
958 map.
960 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
961 and xxxxxxxx also give the code point in UCS4 but this must
962 be in the private, i.e., unassigned, area. This should be
963 used for characters which do not (yet) have an equivalent
964 in ISO 10646 and Unicode. */
965 char *endp;
967 errno = 0;
968 newp->ucs4 = strtoul (from + 1, &endp, 16);
969 if (endp - from != len1
970 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
971 || newp->ucs4 >= 0x80000000)
972 /* This wasn't successful. Signal this name cannot be a
973 correct UCS value. */
974 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
977 insert_entry (ht, from, len1, newp);
978 insert_entry (bt, newp->bytes, nbytes, newp);
979 /* Please note that it isn't a bug if a symbol is defined more
980 than once. All later definitions are simply discarded. */
981 return;
984 /* We have a range: the names must have names with equal prefixes
985 and an equal number of digits, where the second number is greater
986 or equal than the first. */
987 len2 = strlen (to);
989 if (len1 != len2)
991 illegal_range:
992 lr_error (lr, _("invalid names for character range"));
993 return;
996 cp = &from[len1 - 1];
997 if (decimal_ellipsis)
998 while (isdigit (*cp) && cp >= from)
999 --cp;
1000 else
1001 while (isxdigit (*cp) && cp >= from)
1003 if (!isdigit (*cp) && !isupper (*cp))
1004 lr_error (lr, _("\
1005 hexadecimal range format should use only capital characters"));
1006 --cp;
1009 prefix_len = (cp - from) + 1;
1011 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1012 goto illegal_range;
1014 errno = 0;
1015 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1016 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1017 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1018 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1019 && errno == ERANGE)
1020 || *to_end != '\0')
1022 lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
1023 return;
1026 if (from_nr > to_nr)
1028 lr_error (lr, _("upper limit in range is smaller than lower limit"));
1029 return;
1032 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1034 char *name_end;
1035 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1036 prefix_len, from, len1 - prefix_len, cnt);
1037 obstack_1grow (ob, '\0');
1038 name_end = obstack_finish (ob);
1040 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1041 newp->nbytes = nbytes;
1042 memcpy (newp->bytes, bytes, nbytes);
1043 newp->name = name_end;
1045 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1046 if ((name_end[0] == 'U' || name_end[0] == 'P')
1047 && (len1 == 5 || len1 == 9))
1049 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1050 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1051 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1052 this character and we don't have to consult the repertoire
1053 map.
1055 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1056 and xxxxxxxx also give the code point in UCS4 but this must
1057 be in the private, i.e., unassigned, area. This should be
1058 used for characters which do not (yet) have an equivalent
1059 in ISO 10646 and Unicode. */
1060 char *endp;
1062 errno = 0;
1063 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1064 if (endp - name_end != len1
1065 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1066 || newp->ucs4 >= 0x80000000)
1067 /* This wasn't successful. Signal this name cannot be a
1068 correct UCS value. */
1069 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1072 insert_entry (ht, name_end, len1, newp);
1073 insert_entry (bt, newp->bytes, nbytes, newp);
1074 /* Please note we don't examine the return value since it is no error
1075 if we have two definitions for a symbol. */
1077 /* Increment the value in the byte sequence. */
1078 if (++bytes[nbytes - 1] == '\0')
1080 int b = nbytes - 2;
1083 if (b < 0)
1085 lr_error (lr,
1086 _("resulting bytes for range not representable."));
1087 return;
1089 while (++bytes[b--] == 0);
1095 struct charseq *
1096 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1097 size_t nbytes)
1099 void *result;
1101 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1102 < 0 ? NULL : (struct charseq *) result);