Updated to fedora-glibc-20050302T1820
[glibc.git] / locale / programs / charmap.c
blobb8aa1aafc46458e7d90691db4d36108d7baa4b53
1 /* Copyright (C) 1996, 1998-2002, 2003, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <error.h>
33 #include "localedef.h"
34 #include "linereader.h"
35 #include "charmap.h"
36 #include "charmap-dir.h"
38 #include <assert.h>
41 /* Define the lookup function. */
42 #include "charmap-kw.h"
/* Forward declarations of the helpers private to this file.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);

static void new_width (struct linereader *cmfile,
                       struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);

static void charmap_new_char (struct linereader *lr,
                              struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the character map that was loaded
   fails the ASCII compatibility test.  */
bool enc_not_ascii_compatible;


/* NOTE(review): not referenced anywhere in this file; presumably needed
   by a header when NEED_NULL_POINTER is defined -- confirm.  */
#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
64 static struct linereader *
65 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
67 FILE *fp;
69 fp = charmap_open (directory, name);
70 if (fp == NULL)
71 return NULL;
72 else
74 size_t dlen = strlen (directory);
75 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
76 size_t nlen = strlen (name);
77 char *pathname;
78 char *p;
80 pathname = alloca (dlen + add_slash + nlen + 1);
81 p = stpcpy (pathname, directory);
82 if (add_slash)
83 *p++ = '/';
84 stpcpy (p, name);
86 return lr_create (fp, pathname, hf);
90 struct charmap_t *
91 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
93 struct charmap_t *result = NULL;
95 if (filename != NULL)
97 struct linereader *cmfile;
99 /* First try the name as found in the parameter. */
100 cmfile = lr_open (filename, charmap_hash);
101 if (cmfile == NULL)
103 /* No successful. So start looking through the directories
104 in the I18NPATH if this is a simple name. */
105 if (strchr (filename, '/') == NULL)
107 char *i18npath = getenv ("I18NPATH");
108 if (i18npath != NULL && *i18npath != '\0')
110 const size_t pathlen = strlen (i18npath);
111 char i18npathbuf[pathlen + 1];
112 char path[pathlen + sizeof ("/charmaps")];
113 char *next;
114 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
116 while (cmfile == NULL
117 && (next = strsep (&i18npath, ":")) != NULL)
119 stpcpy (stpcpy (path, next), "/charmaps");
120 cmfile = cmlr_open (path, filename, charmap_hash);
122 if (cmfile == NULL)
123 /* Try without the "/charmaps" part. */
124 cmfile = cmlr_open (next, filename, charmap_hash);
128 if (cmfile == NULL)
129 /* Try the default directory. */
130 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
134 if (cmfile != NULL)
136 result = parse_charmap (cmfile, verbose, be_quiet);
138 if (result == NULL && !be_quiet)
139 WITH_CUR_LOCALE (error (0, errno, _("\
140 character map file `%s' not found"), filename));
144 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
146 /* OK, one more try. We also accept the names given to the
147 character sets in the files. Sometimes they differ from the
148 file name. */
149 CHARMAP_DIR *dir;
151 dir = charmap_opendir (CHARMAP_PATH);
152 if (dir != NULL)
154 const char *dirent;
156 while ((dirent = charmap_readdir (dir)) != NULL)
158 char **aliases;
159 char **p;
160 int found;
162 aliases = charmap_aliases (CHARMAP_PATH, dirent);
163 found = 0;
164 for (p = aliases; *p; p++)
165 if (strcasecmp (*p, filename) == 0)
167 found = 1;
168 break;
170 charmap_free_aliases (aliases);
172 if (found)
174 struct linereader *cmfile;
176 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
177 if (cmfile != NULL)
178 result = parse_charmap (cmfile, verbose, be_quiet);
180 break;
184 charmap_closedir (dir);
188 if (result == NULL && DEFAULT_CHARMAP != NULL)
190 struct linereader *cmfile;
192 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
193 if (cmfile != NULL)
194 result = parse_charmap (cmfile, verbose, be_quiet);
196 if (result == NULL)
197 WITH_CUR_LOCALE (error (4, errno, _("\
198 default character map file `%s' not found"), DEFAULT_CHARMAP));
201 if (result != NULL && result->code_set_name == NULL)
202 /* The input file does not specify a code set name. This
203 shouldn't happen but we should cope with it. */
204 result->code_set_name = basename (filename);
206 /* Test of ASCII compatibility of locale encoding.
208 Verify that the encoding to be used in a locale is ASCII compatible,
209 at least for the graphic characters, excluding the control characters,
210 '$' and '@'. This constraint comes from an ISO C 99 restriction.
212 ISO C 99 section 7.17.(2) (about wchar_t):
213 the null character shall have the code value zero and each member of
214 the basic character set shall have a code value equal to its value
215 when used as the lone character in an integer character constant.
216 ISO C 99 section 5.2.1.(3):
217 Both the basic source and basic execution character sets shall have
218 the following members: the 26 uppercase letters of the Latin alphabet
219 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
220 the 26 lowercase letters of the Latin alphabet
221 a b c d e f g h i j k l m n o p q r s t u v w x y z
222 the 10 decimal digits
223 0 1 2 3 4 5 6 7 8 9
224 the following 29 graphic characters
225 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
226 the space character, and control characters representing horizontal
227 tab, vertical tab, and form feed.
229 Therefore, for all members of the "basic character set", the 'char' code
230 must have the same value as the 'wchar_t' code, which in glibc is the
231 same as the Unicode code, which for all of the enumerated characters
232 is identical to the ASCII code. */
233 if (result != NULL && use_default)
235 static const char basic_charset[] =
237 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
238 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
239 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
240 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
241 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
242 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
243 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
244 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
246 int failed = 0;
247 const char *p = basic_charset;
251 struct charseq *seq = charmap_find_symbol (result, p, 1);
253 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
254 failed = 1;
256 while (*p++ != '\0');
258 if (failed)
260 WITH_CUR_LOCALE (fprintf (stderr, _("\
261 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
262 result->code_set_name));
263 enc_not_ascii_compatible = true;
267 return result;
271 static struct charmap_t *
272 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
274 struct charmap_t *result;
275 int state;
276 enum token_t expected_tok = tok_error;
277 const char *expected_str = NULL;
278 char *from_name = NULL;
279 char *to_name = NULL;
280 enum token_t ellipsis = 0;
281 int step = 1;
283 /* We don't want symbolic names in string to be translated. */
284 cmfile->translate_strings = 0;
286 /* Allocate room for result. */
287 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
288 memset (result, '\0', sizeof (struct charmap_t));
289 /* The default DEFAULT_WIDTH is 1. */
290 result->width_default = 1;
292 #define obstack_chunk_alloc malloc
293 #define obstack_chunk_free free
294 obstack_init (&result->mem_pool);
296 if (init_hash (&result->char_table, 256)
297 || init_hash (&result->byte_table, 256))
299 free (result);
300 return NULL;
303 /* We use a state machine to describe the charmap description file
304 format. */
305 state = 1;
306 while (1)
308 /* What's on? */
309 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
310 enum token_t nowtok = now->tok;
311 struct token *arg;
313 if (nowtok == tok_eof)
314 break;
316 switch (state)
318 case 1:
319 /* The beginning. We expect the special declarations, EOL or
320 `CHARMAP'. */
321 if (nowtok == tok_eol)
322 /* Ignore empty lines. */
323 continue;
325 if (nowtok == tok_charmap)
327 from_name = NULL;
328 to_name = NULL;
330 /* We have to set up the real work. Fill in some
331 default values. */
332 if (result->mb_cur_max == 0)
333 result->mb_cur_max = 1;
334 if (result->mb_cur_min == 0)
335 result->mb_cur_min = result->mb_cur_max;
336 if (result->mb_cur_min > result->mb_cur_max)
338 if (!be_quiet)
339 WITH_CUR_LOCALE (error (0, 0, _("\
340 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
341 cmfile->fname));
343 result->mb_cur_min = result->mb_cur_max;
346 lr_ignore_rest (cmfile, 1);
348 state = 2;
349 continue;
352 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
353 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
354 && nowtok != tok_comment_char && nowtok != tok_g0esc
355 && nowtok != tok_g1esc && nowtok != tok_g2esc
356 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
357 && nowtok != tok_include)
359 lr_error (cmfile, _("syntax error in prolog: %s"),
360 _("invalid definition"));
362 lr_ignore_rest (cmfile, 0);
363 continue;
366 /* We know that we need an argument. */
367 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
369 switch (nowtok)
371 case tok_code_set_name:
372 case tok_repertoiremap:
373 if (arg->tok != tok_ident && arg->tok != tok_string)
375 badarg:
376 lr_error (cmfile, _("syntax error in prolog: %s"),
377 _("bad argument"));
379 lr_ignore_rest (cmfile, 0);
380 continue;
383 if (nowtok == tok_code_set_name)
384 result->code_set_name = obstack_copy0 (&result->mem_pool,
385 arg->val.str.startmb,
386 arg->val.str.lenmb);
387 else
388 result->repertoiremap = obstack_copy0 (&result->mem_pool,
389 arg->val.str.startmb,
390 arg->val.str.lenmb);
392 lr_ignore_rest (cmfile, 1);
393 continue;
395 case tok_mb_cur_max:
396 case tok_mb_cur_min:
397 if (arg->tok != tok_number)
398 goto badarg;
400 if (verbose
401 && ((nowtok == tok_mb_cur_max
402 && result->mb_cur_max != 0)
403 || (nowtok == tok_mb_cur_max
404 && result->mb_cur_max != 0)))
405 lr_error (cmfile, _("duplicate definition of <%s>"),
406 nowtok == tok_mb_cur_min
407 ? "mb_cur_min" : "mb_cur_max");
409 if (arg->val.num < 1)
411 lr_error (cmfile,
412 _("value for <%s> must be 1 or greater"),
413 nowtok == tok_mb_cur_min
414 ? "mb_cur_min" : "mb_cur_max");
416 lr_ignore_rest (cmfile, 0);
417 continue;
419 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
420 && (int) arg->val.num < result->mb_cur_min)
421 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
422 && (int) arg->val.num > result->mb_cur_max))
424 lr_error (cmfile, _("\
425 value of <%s> must be greater or equal than the value of <%s>"),
426 "mb_cur_max", "mb_cur_min");
428 lr_ignore_rest (cmfile, 0);
429 continue;
432 if (nowtok == tok_mb_cur_max)
433 result->mb_cur_max = arg->val.num;
434 else
435 result->mb_cur_min = arg->val.num;
437 lr_ignore_rest (cmfile, 1);
438 continue;
440 case tok_escape_char:
441 case tok_comment_char:
442 if (arg->tok != tok_ident)
443 goto badarg;
445 if (arg->val.str.lenmb != 1)
447 lr_error (cmfile, _("\
448 argument to <%s> must be a single character"),
449 nowtok == tok_escape_char ? "escape_char"
450 : "comment_char");
452 lr_ignore_rest (cmfile, 0);
453 continue;
456 if (nowtok == tok_escape_char)
457 cmfile->escape_char = *arg->val.str.startmb;
458 else
459 cmfile->comment_char = *arg->val.str.startmb;
461 lr_ignore_rest (cmfile, 1);
462 continue;
464 case tok_g0esc:
465 case tok_g1esc:
466 case tok_g2esc:
467 case tok_g3esc:
468 case tok_escseq:
469 lr_ignore_rest (cmfile, 0); /* XXX */
470 continue;
472 case tok_include:
473 lr_error (cmfile, _("\
474 character sets with locking states are not supported"));
475 exit (4);
477 default:
478 /* Cannot happen. */
479 assert (! "Should not happen");
481 break;
483 case 2:
484 /* We have seen `CHARMAP' and now are in the body. Each line
485 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
486 if (nowtok == tok_eol)
487 /* Ignore empty lines. */
488 continue;
490 if (nowtok == tok_end)
492 expected_tok = tok_charmap;
493 expected_str = "CHARMAP";
494 state = 90;
495 continue;
498 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
500 lr_error (cmfile, _("syntax error in %s definition: %s"),
501 "CHARMAP", _("no symbolic name given"));
503 lr_ignore_rest (cmfile, 0);
504 continue;
507 /* If the previous line was not completely correct free the
508 used memory. */
509 if (from_name != NULL)
510 obstack_free (&result->mem_pool, from_name);
512 if (nowtok == tok_bsymbol)
513 from_name = (char *) obstack_copy0 (&result->mem_pool,
514 now->val.str.startmb,
515 now->val.str.lenmb);
516 else
518 obstack_printf (&result->mem_pool, "U%08X",
519 cmfile->token.val.ucs4);
520 obstack_1grow (&result->mem_pool, '\0');
521 from_name = (char *) obstack_finish (&result->mem_pool);
523 to_name = NULL;
525 state = 3;
526 continue;
528 case 3:
529 /* We have two possibilities: We can see an ellipsis or an
530 encoding value. */
531 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
532 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
533 || nowtok == tok_ellipsis2_2)
535 ellipsis = nowtok;
536 if (nowtok == tok_ellipsis4_2)
538 step = 2;
539 nowtok = tok_ellipsis4;
541 else if (nowtok == tok_ellipsis2_2)
543 step = 2;
544 nowtok = tok_ellipsis2;
546 state = 4;
547 continue;
549 /* FALLTHROUGH */
551 case 5:
552 if (nowtok != tok_charcode)
554 lr_error (cmfile, _("syntax error in %s definition: %s"),
555 "CHARMAP", _("invalid encoding given"));
557 lr_ignore_rest (cmfile, 0);
559 state = 2;
560 continue;
563 if (now->val.charcode.nbytes < result->mb_cur_min)
564 lr_error (cmfile, _("too few bytes in character encoding"));
565 else if (now->val.charcode.nbytes > result->mb_cur_max)
566 lr_error (cmfile, _("too many bytes in character encoding"));
567 else
568 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
569 now->val.charcode.bytes, from_name, to_name,
570 ellipsis != tok_ellipsis2, step);
572 /* Ignore trailing comment silently. */
573 lr_ignore_rest (cmfile, 0);
575 from_name = NULL;
576 to_name = NULL;
577 ellipsis = tok_none;
578 step = 1;
580 state = 2;
581 continue;
583 case 4:
584 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
586 lr_error (cmfile, _("syntax error in %s definition: %s"),
587 "CHARMAP",
588 _("no symbolic name given for end of range"));
590 lr_ignore_rest (cmfile, 0);
591 continue;
594 /* Copy the to-name in a safe place. */
595 if (nowtok == tok_bsymbol)
596 to_name = (char *) obstack_copy0 (&result->mem_pool,
597 cmfile->token.val.str.startmb,
598 cmfile->token.val.str.lenmb);
599 else
601 obstack_printf (&result->mem_pool, "U%08X",
602 cmfile->token.val.ucs4);
603 obstack_1grow (&result->mem_pool, '\0');
604 to_name = (char *) obstack_finish (&result->mem_pool);
607 state = 5;
608 continue;
610 case 90:
611 if (nowtok != expected_tok)
612 lr_error (cmfile, _("\
613 `%1$s' definition does not end with `END %1$s'"), expected_str);
615 lr_ignore_rest (cmfile, nowtok == expected_tok);
616 state = 91;
617 continue;
619 case 91:
620 /* Waiting for WIDTH... */
621 if (nowtok == tok_eol)
622 /* Ignore empty lines. */
623 continue;
625 if (nowtok == tok_width_default)
627 state = 92;
628 continue;
631 if (nowtok == tok_width)
633 lr_ignore_rest (cmfile, 1);
634 state = 93;
635 continue;
638 if (nowtok == tok_width_variable)
640 lr_ignore_rest (cmfile, 1);
641 state = 98;
642 continue;
645 lr_error (cmfile, _("\
646 only WIDTH definitions are allowed to follow the CHARMAP definition"));
648 lr_ignore_rest (cmfile, 0);
649 continue;
651 case 92:
652 if (nowtok != tok_number)
653 lr_error (cmfile, _("value for %s must be an integer"),
654 "WIDTH_DEFAULT");
655 else
656 result->width_default = now->val.num;
658 lr_ignore_rest (cmfile, nowtok == tok_number);
660 state = 91;
661 continue;
663 case 93:
664 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
665 "%s...%s %d\n". */
666 if (nowtok == tok_eol)
667 /* ignore empty lines. */
668 continue;
670 if (nowtok == tok_end)
672 expected_tok = tok_width;
673 expected_str = "WIDTH";
674 state = 90;
675 continue;
678 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
680 lr_error (cmfile, _("syntax error in %s definition: %s"),
681 "WIDTH", _("no symbolic name given"));
683 lr_ignore_rest (cmfile, 0);
684 continue;
687 if (from_name != NULL)
688 obstack_free (&result->mem_pool, from_name);
690 if (nowtok == tok_bsymbol)
691 from_name = (char *) obstack_copy0 (&result->mem_pool,
692 now->val.str.startmb,
693 now->val.str.lenmb);
694 else
696 obstack_printf (&result->mem_pool, "U%08X",
697 cmfile->token.val.ucs4);
698 obstack_1grow (&result->mem_pool, '\0');
699 from_name = (char *) obstack_finish (&result->mem_pool);
702 to_name = NULL;
704 state = 94;
705 continue;
707 case 94:
708 if (nowtok == tok_ellipsis3)
710 state = 95;
711 continue;
714 case 96:
715 if (nowtok != tok_number)
716 lr_error (cmfile, _("value for %s must be an integer"),
717 "WIDTH");
718 else
720 /* Store width for chars. */
721 new_width (cmfile, result, from_name, to_name, now->val.num);
723 from_name = NULL;
724 to_name = NULL;
727 lr_ignore_rest (cmfile, nowtok == tok_number);
729 state = 93;
730 continue;
732 case 95:
733 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
735 lr_error (cmfile, _("syntax error in %s definition: %s"),
736 "WIDTH", _("no symbolic name given for end of range"));
738 lr_ignore_rest (cmfile, 0);
740 state = 93;
741 continue;
744 if (nowtok == tok_bsymbol)
745 to_name = (char *) obstack_copy0 (&result->mem_pool,
746 now->val.str.startmb,
747 now->val.str.lenmb);
748 else
750 obstack_printf (&result->mem_pool, "U%08X",
751 cmfile->token.val.ucs4);
752 obstack_1grow (&result->mem_pool, '\0');
753 to_name = (char *) obstack_finish (&result->mem_pool);
756 state = 96;
757 continue;
759 case 98:
760 /* We now expect `END WIDTH_VARIABLE' or lines of the format
761 "%s\n" or "%s...%s\n". */
762 if (nowtok == tok_eol)
763 /* ignore empty lines. */
764 continue;
766 if (nowtok == tok_end)
768 expected_tok = tok_width_variable;
769 expected_str = "WIDTH_VARIABLE";
770 state = 90;
771 continue;
774 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
776 lr_error (cmfile, _("syntax error in %s definition: %s"),
777 "WIDTH_VARIABLE", _("no symbolic name given"));
779 lr_ignore_rest (cmfile, 0);
781 continue;
784 if (from_name != NULL)
785 obstack_free (&result->mem_pool, from_name);
787 if (nowtok == tok_bsymbol)
788 from_name = (char *) obstack_copy0 (&result->mem_pool,
789 now->val.str.startmb,
790 now->val.str.lenmb);
791 else
793 obstack_printf (&result->mem_pool, "U%08X",
794 cmfile->token.val.ucs4);
795 obstack_1grow (&result->mem_pool, '\0');
796 from_name = (char *) obstack_finish (&result->mem_pool);
798 to_name = NULL;
800 state = 99;
801 continue;
803 case 99:
804 if (nowtok == tok_ellipsis3)
805 state = 100;
807 /* Store info. */
808 from_name = NULL;
810 /* Warn */
811 state = 98;
812 continue;
814 case 100:
815 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
817 lr_error (cmfile, _("syntax error in %s definition: %s"),
818 "WIDTH_VARIABLE",
819 _("no symbolic name given for end of range"));
820 lr_ignore_rest (cmfile, 0);
821 continue;
824 if (nowtok == tok_bsymbol)
825 to_name = (char *) obstack_copy0 (&result->mem_pool,
826 now->val.str.startmb,
827 now->val.str.lenmb);
828 else
830 obstack_printf (&result->mem_pool, "U%08X",
831 cmfile->token.val.ucs4);
832 obstack_1grow (&result->mem_pool, '\0');
833 to_name = (char *) obstack_finish (&result->mem_pool);
836 /* XXX Enter value into table. */
838 lr_ignore_rest (cmfile, 1);
840 state = 98;
841 continue;
843 default:
844 WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
845 __FILE__));
846 /* NOTREACHED */
848 break;
851 if (state != 91 && !be_quiet)
852 WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
853 cmfile->fname));
855 lr_close (cmfile);
857 return result;
861 static void
862 new_width (struct linereader *cmfile, struct charmap_t *result,
863 const char *from, const char *to, unsigned long int width)
865 struct charseq *from_val;
866 struct charseq *to_val;
868 from_val = charmap_find_value (result, from, strlen (from));
869 if (from_val == NULL)
871 lr_error (cmfile, _("unknown character `%s'"), from);
872 return;
875 if (to == NULL)
876 to_val = from_val;
877 else
879 to_val = charmap_find_value (result, to, strlen (to));
880 if (to_val == NULL)
882 lr_error (cmfile, _("unknown character `%s'"), to);
883 return;
886 /* Make sure the number of bytes for the end points of the range
887 is correct. */
888 if (from_val->nbytes != to_val->nbytes)
890 lr_error (cmfile, _("\
891 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
892 from_val->nbytes, to_val->nbytes);
893 return;
897 if (result->nwidth_rules >= result->nwidth_rules_max)
899 size_t new_size = result->nwidth_rules + 32;
900 struct width_rule *new_rules =
901 (struct width_rule *) obstack_alloc (&result->mem_pool,
902 (new_size
903 * sizeof (struct width_rule)));
905 memcpy (new_rules, result->width_rules,
906 result->nwidth_rules_max * sizeof (struct width_rule));
908 result->width_rules = new_rules;
909 result->nwidth_rules_max = new_size;
912 result->width_rules[result->nwidth_rules].from = from_val;
913 result->width_rules[result->nwidth_rules].to = to_val;
914 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
915 ++result->nwidth_rules;
919 struct charseq *
920 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
922 void *result;
924 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
925 < 0 ? NULL : (struct charseq *) result);
929 static void
930 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
931 size_t nbytes, unsigned char *bytes,
932 const char *from, const char *to,
933 int decimal_ellipsis, int step)
935 hash_table *ht = &cm->char_table;
936 hash_table *bt = &cm->byte_table;
937 struct obstack *ob = &cm->mem_pool;
938 char *from_end;
939 char *to_end;
940 const char *cp;
941 int prefix_len, len1, len2;
942 unsigned int from_nr, to_nr, cnt;
943 struct charseq *newp;
945 len1 = strlen (from);
947 if (to == NULL)
949 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
950 newp->nbytes = nbytes;
951 memcpy (newp->bytes, bytes, nbytes);
952 newp->name = from;
954 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
955 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
957 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
958 xxxx and xxxxxxxx are hexadecimal numbers. In this case
959 we use the value of xxxx or xxxxxxxx as the UCS4 value of
960 this character and we don't have to consult the repertoire
961 map.
963 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
964 and xxxxxxxx also give the code point in UCS4 but this must
965 be in the private, i.e., unassigned, area. This should be
966 used for characters which do not (yet) have an equivalent
967 in ISO 10646 and Unicode. */
968 char *endp;
970 errno = 0;
971 newp->ucs4 = strtoul (from + 1, &endp, 16);
972 if (endp - from != len1
973 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
974 || newp->ucs4 >= 0x80000000)
975 /* This wasn't successful. Signal this name cannot be a
976 correct UCS value. */
977 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
980 insert_entry (ht, from, len1, newp);
981 insert_entry (bt, newp->bytes, nbytes, newp);
982 /* Please note that it isn't a bug if a symbol is defined more
983 than once. All later definitions are simply discarded. */
984 return;
987 /* We have a range: the names must have names with equal prefixes
988 and an equal number of digits, where the second number is greater
989 or equal than the first. */
990 len2 = strlen (to);
992 if (len1 != len2)
994 illegal_range:
995 lr_error (lr, _("invalid names for character range"));
996 return;
999 cp = &from[len1 - 1];
1000 if (decimal_ellipsis)
1001 while (isdigit (*cp) && cp >= from)
1002 --cp;
1003 else
1004 while (isxdigit (*cp) && cp >= from)
1006 if (!isdigit (*cp) && !isupper (*cp))
1007 lr_error (lr, _("\
1008 hexadecimal range format should use only capital characters"));
1009 --cp;
1012 prefix_len = (cp - from) + 1;
1014 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1015 goto illegal_range;
1017 errno = 0;
1018 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1019 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1020 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1021 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1022 && errno == ERANGE)
1023 || *to_end != '\0')
1025 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1026 return;
1029 if (from_nr > to_nr)
1031 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1032 return;
1035 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1037 char *name_end;
1038 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1039 prefix_len, from, len1 - prefix_len, cnt);
1040 obstack_1grow (ob, '\0');
1041 name_end = obstack_finish (ob);
1043 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1044 newp->nbytes = nbytes;
1045 memcpy (newp->bytes, bytes, nbytes);
1046 newp->name = name_end;
1048 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1049 if ((name_end[0] == 'U' || name_end[0] == 'P')
1050 && (len1 == 5 || len1 == 9))
1052 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1053 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1054 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1055 this character and we don't have to consult the repertoire
1056 map.
1058 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1059 and xxxxxxxx also give the code point in UCS4 but this must
1060 be in the private, i.e., unassigned, area. This should be
1061 used for characters which do not (yet) have an equivalent
1062 in ISO 10646 and Unicode. */
1063 char *endp;
1065 errno = 0;
1066 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1067 if (endp - name_end != len1
1068 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1069 || newp->ucs4 >= 0x80000000)
1070 /* This wasn't successful. Signal this name cannot be a
1071 correct UCS value. */
1072 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1075 insert_entry (ht, name_end, len1, newp);
1076 insert_entry (bt, newp->bytes, nbytes, newp);
1077 /* Please note we don't examine the return value since it is no error
1078 if we have two definitions for a symbol. */
1080 /* Increment the value in the byte sequence. */
1081 if (++bytes[nbytes - 1] == '\0')
1083 int b = nbytes - 2;
1086 if (b < 0)
1088 lr_error (lr,
1089 _("resulting bytes for range not representable."));
1090 return;
1092 while (++bytes[b--] == 0);
1098 struct charseq *
1099 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1100 size_t nbytes)
1102 void *result;
1104 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1105 < 0 ? NULL : (struct charseq *) result);