Update.
[glibc.git] / locale / programs / charmap.c
blob5d2441d17a6c2f5ad971af35f2bbb81ddfe48ecd
1 /* Copyright (C) 1996,1998,1999,2000,2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <limits.h>
28 #include <obstack.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
33 #include "error.h"
34 #include "linereader.h"
35 #include "charmap.h"
36 #include "charmap-dir.h"
37 #include "repertoire.h"
39 #include <assert.h>
42 /* Define the lookup function. */
43 #include "charmap-kw.h"
/* Allocation helper; declared here, defined elsewhere in the program.  */
extern void *xmalloc (size_t __n);


/* Forward declarations of the file-local helpers defined below.  */

/* Parse an already opened charmap file into a new charmap object.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);

/* Record a WIDTH rule for the characters FROM ... TO.  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);

/* Enter one character, or a range of characters, into the charmap.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              int nbytes, char *bytes, const char *from,
                              const char *to, int decimal_ellipsis, int step);


#ifdef NEED_NULL_POINTER
static const char *null_pointer;
#endif
63 static struct linereader *
64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
66 FILE *fp;
68 fp = charmap_open (directory, name);
69 if (fp == NULL)
70 return NULL;
71 else
73 size_t dlen = strlen (directory);
74 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
75 size_t nlen = strlen (name);
76 char *pathname;
77 char *p;
79 pathname = alloca (dlen + add_slash + nlen + 1);
80 p = stpcpy (pathname, directory);
81 if (add_slash)
82 *p++ = '/';
83 stpcpy (p, name);
85 return lr_create (fp, pathname, hf);
89 struct charmap_t *
90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 char path[strlen (i18npath) + sizeof ("/charmaps")];
110 char *next;
111 i18npath = strdupa (i18npath);
113 while (cmfile == NULL
114 && (next = strsep (&i18npath, ":")) != NULL)
116 stpcpy (stpcpy (path, next), "/charmaps");
117 cmfile = cmlr_open (path, filename, charmap_hash);
119 if (cmfile == NULL)
121 /* Try without the "/charmaps" part. */
122 cmfile = cmlr_open (next, filename, charmap_hash);
127 if (cmfile == NULL)
129 /* Try the default directory. */
130 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
135 if (cmfile != NULL)
137 result = parse_charmap (cmfile, verbose, be_quiet);
139 if (result == NULL && !be_quiet)
140 error (0, errno, _("character map file `%s' not found"), filename);
144 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
146 /* OK, one more try. We also accept the names given to the
147 character sets in the files. Sometimes they differ from the
148 file name. */
149 CHARMAP_DIR *dir;
151 dir = charmap_opendir (CHARMAP_PATH);
152 if (dir != NULL)
154 const char *dirent;
156 while ((dirent = charmap_readdir (dir)) != NULL)
158 char **aliases;
159 char **p;
160 int found;
162 aliases = charmap_aliases (CHARMAP_PATH, dirent);
163 found = 0;
164 for (p = aliases; *p; p++)
165 if (strcasecmp (*p, filename) == 0)
167 found = 1;
168 break;
170 charmap_free_aliases (aliases);
172 if (found)
174 struct linereader *cmfile;
176 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
177 if (cmfile != NULL)
178 result = parse_charmap (cmfile, verbose, be_quiet);
180 break;
184 charmap_closedir (dir);
188 if (result == NULL && DEFAULT_CHARMAP != NULL)
190 struct linereader *cmfile;
192 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
193 if (cmfile != NULL)
194 result = parse_charmap (cmfile, verbose, be_quiet);
196 if (result == NULL)
197 error (4, errno, _("default character map file `%s' not found"),
198 DEFAULT_CHARMAP);
201 /* Test of ASCII compatibility of locale encoding.
203 Verify that the encoding to be used in a locale is ASCII compatible,
204 at least for the graphic characters, excluding the control characters,
205 '$' and '@'. This constraint comes from an ISO C 99 restriction.
207 ISO C 99 section 7.17.(2) (about wchar_t):
208 the null character shall have the code value zero and each member of
209 the basic character set shall have a code value equal to its value
210 when used as the lone character in an integer character constant.
211 ISO C 99 section 5.2.1.(3):
212 Both the basic source and basic execution character sets shall have
213 the following members: the 26 uppercase letters of the Latin alphabet
214 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
215 the 26 lowercase letters of the Latin alphabet
216 a b c d e f g h i j k l m n o p q r s t u v w x y z
217 the 10 decimal digits
218 0 1 2 3 4 5 6 7 8 9
219 the following 29 graphic characters
220 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
221 the space character, and control characters representing horizontal
222 tab, vertical tab, and form feed.
224 Therefore, for all members of the "basic character set", the 'char' code
225 must have the same value as the 'wchar_t' code, which in glibc is the
226 same as the Unicode code, which for all of the enumerated characters
227 is identical to the ASCII code. */
228 if (result != NULL && use_default)
230 static const char basic_charset[] =
232 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
233 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
234 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
235 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
236 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
237 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
238 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
239 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
241 int failed = 0;
242 const char *p = basic_charset;
246 struct charseq * seq = charmap_find_symbol (result, p, 1);
248 if (seq == NULL || seq->ucs4 != *p)
249 failed = 1;
251 while (*p++ != '\0');
253 if (failed)
254 fprintf (stderr, _("\
255 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
256 result->code_set_name);
259 return result;
263 static struct charmap_t *
264 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
266 struct charmap_t *result;
267 int state;
268 enum token_t expected_tok = tok_error;
269 const char *expected_str = NULL;
270 char *from_name = NULL;
271 char *to_name = NULL;
272 enum token_t ellipsis = 0;
273 int step = 1;
275 /* We don't want symbolic names in string to be translated. */
276 cmfile->translate_strings = 0;
278 /* Allocate room for result. */
279 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
280 memset (result, '\0', sizeof (struct charmap_t));
281 /* The default DEFAULT_WIDTH is 1. */
282 result->width_default = 1;
284 #define obstack_chunk_alloc malloc
285 #define obstack_chunk_free free
286 obstack_init (&result->mem_pool);
288 if (init_hash (&result->char_table, 256)
289 || init_hash (&result->byte_table, 256))
291 free (result);
292 return NULL;
295 /* We use a state machine to describe the charmap description file
296 format. */
297 state = 1;
298 while (1)
300 /* What's on? */
301 struct token *now = lr_token (cmfile, NULL, NULL, verbose);
302 enum token_t nowtok = now->tok;
303 struct token *arg;
305 if (nowtok == tok_eof)
306 break;
308 switch (state)
310 case 1:
311 /* The beginning. We expect the special declarations, EOL or
312 `CHARMAP'. */
313 if (nowtok == tok_eol)
314 /* Ignore empty lines. */
315 continue;
317 if (nowtok == tok_charmap)
319 from_name = NULL;
320 to_name = NULL;
322 /* We have to set up the real work. Fill in some
323 default values. */
324 if (result->mb_cur_max == 0)
325 result->mb_cur_max = 1;
326 if (result->mb_cur_min == 0)
327 result->mb_cur_min = result->mb_cur_max;
328 if (result->mb_cur_min > result->mb_cur_max)
330 if (!be_quiet)
331 error (0, 0, _("\
332 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
333 cmfile->fname);
335 result->mb_cur_min = result->mb_cur_max;
338 lr_ignore_rest (cmfile, 1);
340 state = 2;
341 continue;
344 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
345 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
346 && nowtok != tok_comment_char && nowtok != tok_g0esc
347 && nowtok != tok_g1esc && nowtok != tok_g2esc
348 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
349 && nowtok != tok_include)
351 lr_error (cmfile, _("syntax error in prolog: %s"),
352 _("invalid definition"));
354 lr_ignore_rest (cmfile, 0);
355 continue;
358 /* We know that we need an argument. */
359 arg = lr_token (cmfile, NULL, NULL, verbose);
361 switch (nowtok)
363 case tok_code_set_name:
364 case tok_repertoiremap:
365 if (arg->tok != tok_ident && arg->tok != tok_string)
367 badarg:
368 lr_error (cmfile, _("syntax error in prolog: %s"),
369 _("bad argument"));
371 lr_ignore_rest (cmfile, 0);
372 continue;
375 if (nowtok == tok_code_set_name)
376 result->code_set_name = obstack_copy0 (&result->mem_pool,
377 arg->val.str.startmb,
378 arg->val.str.lenmb);
379 else
380 result->repertoiremap = obstack_copy0 (&result->mem_pool,
381 arg->val.str.startmb,
382 arg->val.str.lenmb);
384 lr_ignore_rest (cmfile, 1);
385 continue;
387 case tok_mb_cur_max:
388 case tok_mb_cur_min:
389 if (arg->tok != tok_number)
390 goto badarg;
392 if (verbose
393 && ((nowtok == tok_mb_cur_max
394 && result->mb_cur_max != 0)
395 || (nowtok == tok_mb_cur_max
396 && result->mb_cur_max != 0)))
397 lr_error (cmfile, _("duplicate definition of <%s>"),
398 nowtok == tok_mb_cur_min
399 ? "mb_cur_min" : "mb_cur_max");
401 if (arg->val.num < 1)
403 lr_error (cmfile,
404 _("value for <%s> must be 1 or greater"),
405 nowtok == tok_mb_cur_min
406 ? "mb_cur_min" : "mb_cur_max");
408 lr_ignore_rest (cmfile, 0);
409 continue;
411 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
412 && (int) arg->val.num < result->mb_cur_min)
413 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
414 && (int) arg->val.num > result->mb_cur_max))
416 lr_error (cmfile, _("\
417 value of <%s> must be greater or equal than the value of <%s>"),
418 "mb_cur_max", "mb_cur_min");
420 lr_ignore_rest (cmfile, 0);
421 continue;
424 if (nowtok == tok_mb_cur_max)
425 result->mb_cur_max = arg->val.num;
426 else
427 result->mb_cur_min = arg->val.num;
429 lr_ignore_rest (cmfile, 1);
430 continue;
432 case tok_escape_char:
433 case tok_comment_char:
434 if (arg->tok != tok_ident)
435 goto badarg;
437 if (arg->val.str.lenmb != 1)
439 lr_error (cmfile, _("\
440 argument to <%s> must be a single character"),
441 nowtok == tok_escape_char ? "escape_char"
442 : "comment_char");
444 lr_ignore_rest (cmfile, 0);
445 continue;
448 if (nowtok == tok_escape_char)
449 cmfile->escape_char = *arg->val.str.startmb;
450 else
451 cmfile->comment_char = *arg->val.str.startmb;
453 lr_ignore_rest (cmfile, 1);
454 continue;
456 case tok_g0esc:
457 case tok_g1esc:
458 case tok_g2esc:
459 case tok_g3esc:
460 case tok_escseq:
461 lr_ignore_rest (cmfile, 0); /* XXX */
462 continue;
464 case tok_include:
465 lr_error (cmfile, _("\
466 character sets with locking states are not supported"));
467 exit (4);
469 default:
470 /* Cannot happen. */
471 assert (! "Should not happen");
473 break;
475 case 2:
476 /* We have seen `CHARMAP' and now are in the body. Each line
477 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
478 if (nowtok == tok_eol)
479 /* Ignore empty lines. */
480 continue;
482 if (nowtok == tok_end)
484 expected_tok = tok_charmap;
485 expected_str = "CHARMAP";
486 state = 90;
487 continue;
490 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
492 lr_error (cmfile, _("syntax error in %s definition: %s"),
493 "CHARMAP", _("no symbolic name given"));
495 lr_ignore_rest (cmfile, 0);
496 continue;
499 /* If the previous line was not completely correct free the
500 used memory. */
501 if (from_name != NULL)
502 obstack_free (&result->mem_pool, from_name);
504 if (nowtok == tok_bsymbol)
505 from_name = (char *) obstack_copy0 (&result->mem_pool,
506 now->val.str.startmb,
507 now->val.str.lenmb);
508 else
510 obstack_printf (&result->mem_pool, "U%08X",
511 cmfile->token.val.ucs4);
512 obstack_1grow (&result->mem_pool, '\0');
513 from_name = (char *) obstack_finish (&result->mem_pool);
515 to_name = NULL;
517 state = 3;
518 continue;
520 case 3:
521 /* We have two possibilities: We can see an ellipsis or an
522 encoding value. */
523 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
524 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
525 || nowtok == tok_ellipsis2_2)
527 ellipsis = nowtok;
528 if (nowtok == tok_ellipsis4_2)
530 step = 2;
531 nowtok = tok_ellipsis4;
533 else if (nowtok == tok_ellipsis2_2)
535 step = 2;
536 nowtok = tok_ellipsis2;
538 state = 4;
539 continue;
541 /* FALLTHROUGH */
543 case 5:
544 if (nowtok != tok_charcode)
546 lr_error (cmfile, _("syntax error in %s definition: %s"),
547 "CHARMAP", _("invalid encoding given"));
549 lr_ignore_rest (cmfile, 0);
551 state = 2;
552 continue;
555 if (now->val.charcode.nbytes < result->mb_cur_min)
556 lr_error (cmfile, _("too few bytes in character encoding"));
557 else if (now->val.charcode.nbytes > result->mb_cur_max)
558 lr_error (cmfile, _("too many bytes in character encoding"));
559 else
560 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
561 now->val.charcode.bytes, from_name, to_name,
562 ellipsis != tok_ellipsis2, step);
564 /* Ignore trailing comment silently. */
565 lr_ignore_rest (cmfile, 0);
567 from_name = NULL;
568 to_name = NULL;
569 ellipsis = tok_none;
570 step = 1;
572 state = 2;
573 continue;
575 case 4:
576 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
578 lr_error (cmfile, _("syntax error in %s definition: %s"),
579 "CHARMAP",
580 _("no symbolic name given for end of range"));
582 lr_ignore_rest (cmfile, 0);
583 continue;
586 /* Copy the to-name in a safe place. */
587 if (nowtok == tok_bsymbol)
588 to_name = (char *) obstack_copy0 (&result->mem_pool,
589 cmfile->token.val.str.startmb,
590 cmfile->token.val.str.lenmb);
591 else
593 obstack_printf (&result->mem_pool, "U%08X",
594 cmfile->token.val.ucs4);
595 obstack_1grow (&result->mem_pool, '\0');
596 to_name = (char *) obstack_finish (&result->mem_pool);
599 state = 5;
600 continue;
602 case 90:
603 if (nowtok != expected_tok)
604 lr_error (cmfile, _("\
605 `%1$s' definition does not end with `END %1$s'"), expected_str);
607 lr_ignore_rest (cmfile, nowtok == expected_tok);
608 state = 91;
609 continue;
611 case 91:
612 /* Waiting for WIDTH... */
613 if (nowtok == tok_eol)
614 /* Ignore empty lines. */
615 continue;
617 if (nowtok == tok_width_default)
619 state = 92;
620 continue;
623 if (nowtok == tok_width)
625 lr_ignore_rest (cmfile, 1);
626 state = 93;
627 continue;
630 if (nowtok == tok_width_variable)
632 lr_ignore_rest (cmfile, 1);
633 state = 98;
634 continue;
637 lr_error (cmfile, _("\
638 only WIDTH definitions are allowed to follow the CHARMAP definition"));
640 lr_ignore_rest (cmfile, 0);
641 continue;
643 case 92:
644 if (nowtok != tok_number)
645 lr_error (cmfile, _("value for %s must be an integer"),
646 "WIDTH_DEFAULT");
647 else
648 result->width_default = now->val.num;
650 lr_ignore_rest (cmfile, nowtok == tok_number);
652 state = 91;
653 continue;
655 case 93:
656 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
657 "%s...%s %d\n". */
658 if (nowtok == tok_eol)
659 /* ignore empty lines. */
660 continue;
662 if (nowtok == tok_end)
664 expected_tok = tok_width;
665 expected_str = "WIDTH";
666 state = 90;
667 continue;
670 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
672 lr_error (cmfile, _("syntax error in %s definition: %s"),
673 "WIDTH", _("no symbolic name given"));
675 lr_ignore_rest (cmfile, 0);
676 continue;
679 if (from_name != NULL)
680 obstack_free (&result->mem_pool, from_name);
682 if (nowtok == tok_bsymbol)
683 from_name = (char *) obstack_copy0 (&result->mem_pool,
684 now->val.str.startmb,
685 now->val.str.lenmb);
686 else
688 obstack_printf (&result->mem_pool, "U%08X",
689 cmfile->token.val.ucs4);
690 obstack_1grow (&result->mem_pool, '\0');
691 from_name = (char *) obstack_finish (&result->mem_pool);
694 to_name = NULL;
696 state = 94;
697 continue;
699 case 94:
700 if (nowtok == tok_ellipsis3)
702 state = 95;
703 continue;
706 case 96:
707 if (nowtok != tok_number)
708 lr_error (cmfile, _("value for %s must be an integer"),
709 "WIDTH");
710 else
712 /* Store width for chars. */
713 new_width (cmfile, result, from_name, to_name, now->val.num);
715 from_name = NULL;
716 to_name = NULL;
719 lr_ignore_rest (cmfile, nowtok == tok_number);
721 state = 93;
722 continue;
724 case 95:
725 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
727 lr_error (cmfile, _("syntax error in %s definition: %s"),
728 "WIDTH", _("no symbolic name given for end of range"));
730 lr_ignore_rest (cmfile, 0);
732 state = 93;
733 continue;
736 if (nowtok == tok_bsymbol)
737 to_name = (char *) obstack_copy0 (&result->mem_pool,
738 now->val.str.startmb,
739 now->val.str.lenmb);
740 else
742 obstack_printf (&result->mem_pool, "U%08X",
743 cmfile->token.val.ucs4);
744 obstack_1grow (&result->mem_pool, '\0');
745 to_name = (char *) obstack_finish (&result->mem_pool);
748 state = 96;
749 continue;
751 case 98:
752 /* We now expect `END WIDTH_VARIABLE' or lines of the format
753 "%s\n" or "%s...%s\n". */
754 if (nowtok == tok_eol)
755 /* ignore empty lines. */
756 continue;
758 if (nowtok == tok_end)
760 expected_tok = tok_width_variable;
761 expected_str = "WIDTH_VARIABLE";
762 state = 90;
763 continue;
766 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
768 lr_error (cmfile, _("syntax error in %s definition: %s"),
769 "WIDTH_VARIABLE", _("no symbolic name given"));
771 lr_ignore_rest (cmfile, 0);
773 continue;
776 if (from_name != NULL)
777 obstack_free (&result->mem_pool, from_name);
779 if (nowtok == tok_bsymbol)
780 from_name = (char *) obstack_copy0 (&result->mem_pool,
781 now->val.str.startmb,
782 now->val.str.lenmb);
783 else
785 obstack_printf (&result->mem_pool, "U%08X",
786 cmfile->token.val.ucs4);
787 obstack_1grow (&result->mem_pool, '\0');
788 from_name = (char *) obstack_finish (&result->mem_pool);
790 to_name = NULL;
792 state = 99;
793 continue;
795 case 99:
796 if (nowtok == tok_ellipsis3)
797 state = 100;
799 /* Store info. */
800 from_name = NULL;
802 /* Warn */
803 state = 98;
804 continue;
806 case 100:
807 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
809 lr_error (cmfile, _("syntax error in %s definition: %s"),
810 "WIDTH_VARIABLE",
811 _("no symbolic name given for end of range"));
812 lr_ignore_rest (cmfile, 0);
813 continue;
816 if (nowtok == tok_bsymbol)
817 to_name = (char *) obstack_copy0 (&result->mem_pool,
818 now->val.str.startmb,
819 now->val.str.lenmb);
820 else
822 obstack_printf (&result->mem_pool, "U%08X",
823 cmfile->token.val.ucs4);
824 obstack_1grow (&result->mem_pool, '\0');
825 to_name = (char *) obstack_finish (&result->mem_pool);
828 /* XXX Enter value into table. */
830 lr_ignore_rest (cmfile, 1);
832 state = 98;
833 continue;
835 default:
836 error (5, 0, _("%s: error in state machine"), __FILE__);
837 /* NOTREACHED */
839 break;
842 if (state != 91 && !be_quiet)
843 error (0, 0, _("%s: premature end of file"), cmfile->fname);
845 lr_close (cmfile);
847 return result;
851 static void
852 new_width (struct linereader *cmfile, struct charmap_t *result,
853 const char *from, const char *to, unsigned long int width)
855 struct charseq *from_val;
856 struct charseq *to_val;
858 from_val = charmap_find_value (result, from, strlen (from));
859 if (from_val == NULL)
861 lr_error (cmfile, _("unknown character `%s'"), from);
862 return;
865 if (to == NULL)
866 to_val = from_val;
867 else
869 to_val = charmap_find_value (result, to, strlen (to));
870 if (to_val == NULL)
872 lr_error (cmfile, _("unknown character `%s'"), to);
873 return;
877 if (result->nwidth_rules >= result->nwidth_rules_max)
879 size_t new_size = result->nwidth_rules + 32;
880 struct width_rule *new_rules =
881 (struct width_rule *) obstack_alloc (&result->mem_pool,
882 (new_size
883 * sizeof (struct width_rule)));
885 memcpy (new_rules, result->width_rules,
886 result->nwidth_rules_max * sizeof (struct width_rule));
888 result->width_rules = new_rules;
889 result->nwidth_rules_max = new_size;
892 result->width_rules[result->nwidth_rules].from = from_val;
893 result->width_rules[result->nwidth_rules].to = to_val;
894 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
895 ++result->nwidth_rules;
899 struct charseq *
900 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
902 void *result;
904 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
905 < 0 ? NULL : (struct charseq *) result);
909 static void
910 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
911 int nbytes, char *bytes, const char *from, const char *to,
912 int decimal_ellipsis, int step)
914 hash_table *ht = &cm->char_table;
915 hash_table *bt = &cm->byte_table;
916 struct obstack *ob = &cm->mem_pool;
917 char *from_end;
918 char *to_end;
919 const char *cp;
920 int prefix_len, len1, len2;
921 unsigned int from_nr, to_nr, cnt;
922 struct charseq *newp;
924 len1 = strlen (from);
926 if (to == NULL)
928 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
929 newp->nbytes = nbytes;
930 memcpy (newp->bytes, bytes, nbytes);
931 newp->name = from;
933 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
934 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
936 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
937 xxxx and xxxxxxxx are hexadecimal numbers. In this case
938 we use the value of xxxx or xxxxxxxx as the UCS4 value of
939 this character and we don't have to consult the repertoire
940 map.
942 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
943 and xxxxxxxx also give the code point in UCS4 but this must
944 be in the private, i.e., unassigned, area. This should be
945 used for characters which do not (yet) have an equivalent
946 in ISO 10646 and Unicode. */
947 char *endp;
949 errno = 0;
950 newp->ucs4 = strtoul (from + 1, &endp, 16);
951 if (endp - from != len1
952 || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
953 || newp->ucs4 >= 0x80000000)
954 /* This wasn't successful. Signal this name cannot be a
955 correct UCS value. */
956 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
959 insert_entry (ht, from, len1, newp);
960 insert_entry (bt, newp->bytes, nbytes, newp);
961 /* Please note that it isn't a bug if a symbol is defined more
962 than once. All later definitions are simply discarded. */
963 return;
966 /* We have a range: the names must have names with equal prefixes
967 and an equal number of digits, where the second number is greater
968 or equal than the first. */
969 len2 = strlen (to);
971 if (len1 != len2)
973 illegal_range:
974 lr_error (lr, _("invalid names for character range"));
975 return;
978 cp = &from[len1 - 1];
979 if (decimal_ellipsis)
980 while (isdigit (*cp) && cp >= from)
981 --cp;
982 else
983 while (isxdigit (*cp) && cp >= from)
985 if (!isdigit (*cp) && !isupper (*cp))
986 lr_error (lr, _("\
987 hexadecimal range format should use only capital characters"));
988 --cp;
991 prefix_len = (cp - from) + 1;
993 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
994 goto illegal_range;
996 errno = 0;
997 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
998 if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
999 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1000 decimal_ellipsis ? 10 : 16)) == ULONG_MAX
1001 && errno == ERANGE)
1002 || *to_end != '\0')
1004 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1005 return;
1008 if (from_nr > to_nr)
1010 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1011 return;
1014 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1016 char *name_end;
1017 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1018 prefix_len, from, len1 - prefix_len, cnt);
1019 obstack_1grow (ob, '\0');
1020 name_end = obstack_finish (ob);
1022 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1023 newp->nbytes = nbytes;
1024 memcpy (newp->bytes, bytes, nbytes);
1025 newp->name = name_end;
1027 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1028 if ((name_end[0] == 'U' || name_end[0] == 'P')
1029 && (len1 == 5 || len1 == 9))
1031 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1032 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1033 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1034 this character and we don't have to consult the repertoire
1035 map.
1037 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1038 and xxxxxxxx also give the code point in UCS4 but this must
1039 be in the private, i.e., unassigned, area. This should be
1040 used for characters which do not (yet) have an equivalent
1041 in ISO 10646 and Unicode. */
1042 char *endp;
1044 errno = 0;
1045 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1046 if (endp - name_end != len1
1047 || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
1048 || newp->ucs4 >= 0x80000000)
1049 /* This wasn't successful. Signal this name cannot be a
1050 correct UCS value. */
1051 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1054 insert_entry (ht, name_end, len1, newp);
1055 insert_entry (bt, newp->bytes, nbytes, newp);
1056 /* Please note we don't examine the return value since it is no error
1057 if we have two definitions for a symbol. */
1059 /* Increment the value in the byte sequence. */
1060 if (++bytes[nbytes - 1] == '\0')
1062 int b = nbytes - 2;
1065 if (b < 0)
1067 lr_error (lr,
1068 _("resulting bytes for range not representable."));
1069 return;
1071 while (++bytes[b--] == 0);
1077 struct charseq *
1078 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1079 size_t nbytes)
1081 void *result;
1083 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1084 < 0 ? NULL : (struct charseq *) result);