Update.
[glibc.git] / locale / programs / charmap.c
blob7840242d8d0638a6341ba60175ba79aff2e89aa5
1 /* Copyright (C) 1996,1998,1999,2000,2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <limits.h>
28 #include <obstack.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
33 #include "error.h"
34 #include "linereader.h"
35 #include "charmap.h"
36 #include "charmap-dir.h"
37 #include "repertoire.h"
39 #include <assert.h>
42 /* Define the lookup function. */
43 #include "charmap-kw.h"
46 extern void *xmalloc (size_t __n);
48 /* Prototypes for local functions. */
49 static struct charmap_t *parse_charmap (struct linereader *cmfile,
50 int verbose, int be_quiet);
51 static void new_width (struct linereader *cmfile, struct charmap_t *result,
52 const char *from, const char *to,
53 unsigned long int width);
54 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
55 int nbytes, char *bytes, const char *from,
56 const char *to, int decimal_ellipsis, int step);
59 #ifdef NEED_NULL_POINTER
60 static const char *null_pointer;
61 #endif
63 static struct linereader *
64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
66 FILE *fp;
68 fp = charmap_open (directory, name);
69 if (fp == NULL)
70 return NULL;
71 else
73 size_t dlen = strlen (directory);
74 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
75 size_t nlen = strlen (name);
76 char *pathname;
77 char *p;
79 pathname = alloca (dlen + add_slash + nlen + 1);
80 p = stpcpy (pathname, directory);
81 if (add_slash)
82 *p++ = '/';
83 stpcpy (p, name);
85 return lr_create (fp, pathname, hf);
89 struct charmap_t *
90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 char path[strlen (i18npath) + sizeof ("/charmaps")];
110 char *next;
111 i18npath = strdupa (i18npath);
113 while (cmfile == NULL
114 && (next = strsep (&i18npath, ":")) != NULL)
116 stpcpy (stpcpy (path, next), "/charmaps");
117 cmfile = cmlr_open (path, filename, charmap_hash);
119 if (cmfile == NULL)
120 /* Try without the "/charmaps" part. */
121 cmfile = cmlr_open (next, filename, charmap_hash);
125 if (cmfile == NULL)
126 /* Try the default directory. */
127 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
131 if (cmfile != NULL)
133 result = parse_charmap (cmfile, verbose, be_quiet);
135 if (result == NULL && !be_quiet)
136 error (0, errno, _("character map file `%s' not found"), filename);
140 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
142 /* OK, one more try. We also accept the names given to the
143 character sets in the files. Sometimes they differ from the
144 file name. */
145 CHARMAP_DIR *dir;
147 dir = charmap_opendir (CHARMAP_PATH);
148 if (dir != NULL)
150 const char *dirent;
152 while ((dirent = charmap_readdir (dir)) != NULL)
154 char **aliases;
155 char **p;
156 int found;
158 aliases = charmap_aliases (CHARMAP_PATH, dirent);
159 found = 0;
160 for (p = aliases; *p; p++)
161 if (strcasecmp (*p, filename) == 0)
163 found = 1;
164 break;
166 charmap_free_aliases (aliases);
168 if (found)
170 struct linereader *cmfile;
172 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
173 if (cmfile != NULL)
174 result = parse_charmap (cmfile, verbose, be_quiet);
176 break;
180 charmap_closedir (dir);
184 if (result == NULL && DEFAULT_CHARMAP != NULL)
186 struct linereader *cmfile;
188 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
189 if (cmfile != NULL)
190 result = parse_charmap (cmfile, verbose, be_quiet);
192 if (result == NULL)
193 error (4, errno, _("default character map file `%s' not found"),
194 DEFAULT_CHARMAP);
197 /* Test of ASCII compatibility of locale encoding.
199 Verify that the encoding to be used in a locale is ASCII compatible,
200 at least for the graphic characters, excluding the control characters,
201 '$' and '@'. This constraint comes from an ISO C 99 restriction.
203 ISO C 99 section 7.17.(2) (about wchar_t):
204 the null character shall have the code value zero and each member of
205 the basic character set shall have a code value equal to its value
206 when used as the lone character in an integer character constant.
207 ISO C 99 section 5.2.1.(3):
208 Both the basic source and basic execution character sets shall have
209 the following members: the 26 uppercase letters of the Latin alphabet
210 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
211 the 26 lowercase letters of the Latin alphabet
212 a b c d e f g h i j k l m n o p q r s t u v w x y z
213 the 10 decimal digits
214 0 1 2 3 4 5 6 7 8 9
215 the following 29 graphic characters
216 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
217 the space character, and control characters representing horizontal
218 tab, vertical tab, and form feed.
220 Therefore, for all members of the "basic character set", the 'char' code
221 must have the same value as the 'wchar_t' code, which in glibc is the
222 same as the Unicode code, which for all of the enumerated characters
223 is identical to the ASCII code. */
224 if (result != NULL && use_default)
226 static const char basic_charset[] =
228 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
229 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
230 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
231 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
232 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
233 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
234 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
235 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
237 int failed = 0;
238 const char *p = basic_charset;
242 struct charseq * seq = charmap_find_symbol (result, p, 1);
244 if (seq == NULL || seq->ucs4 != *p)
245 failed = 1;
247 while (*p++ != '\0');
249 if (failed)
250 fprintf (stderr, _("\
251 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
252 result->code_set_name);
255 return result;
259 static struct charmap_t *
260 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
262 struct charmap_t *result;
263 int state;
264 enum token_t expected_tok = tok_error;
265 const char *expected_str = NULL;
266 char *from_name = NULL;
267 char *to_name = NULL;
268 enum token_t ellipsis = 0;
269 int step = 1;
271 /* We don't want symbolic names in string to be translated. */
272 cmfile->translate_strings = 0;
274 /* Allocate room for result. */
275 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
276 memset (result, '\0', sizeof (struct charmap_t));
277 /* The default DEFAULT_WIDTH is 1. */
278 result->width_default = 1;
280 #define obstack_chunk_alloc malloc
281 #define obstack_chunk_free free
282 obstack_init (&result->mem_pool);
284 if (init_hash (&result->char_table, 256)
285 || init_hash (&result->byte_table, 256))
287 free (result);
288 return NULL;
291 /* We use a state machine to describe the charmap description file
292 format. */
293 state = 1;
294 while (1)
296 /* What's on? */
297 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
298 enum token_t nowtok = now->tok;
299 struct token *arg;
301 if (nowtok == tok_eof)
302 break;
304 switch (state)
306 case 1:
307 /* The beginning. We expect the special declarations, EOL or
308 `CHARMAP'. */
309 if (nowtok == tok_eol)
310 /* Ignore empty lines. */
311 continue;
313 if (nowtok == tok_charmap)
315 from_name = NULL;
316 to_name = NULL;
318 /* We have to set up the real work. Fill in some
319 default values. */
320 if (result->mb_cur_max == 0)
321 result->mb_cur_max = 1;
322 if (result->mb_cur_min == 0)
323 result->mb_cur_min = result->mb_cur_max;
324 if (result->mb_cur_min > result->mb_cur_max)
326 if (!be_quiet)
327 error (0, 0, _("\
328 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
329 cmfile->fname);
331 result->mb_cur_min = result->mb_cur_max;
334 lr_ignore_rest (cmfile, 1);
336 state = 2;
337 continue;
340 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
341 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
342 && nowtok != tok_comment_char && nowtok != tok_g0esc
343 && nowtok != tok_g1esc && nowtok != tok_g2esc
344 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
345 && nowtok != tok_include)
347 lr_error (cmfile, _("syntax error in prolog: %s"),
348 _("invalid definition"));
350 lr_ignore_rest (cmfile, 0);
351 continue;
354 /* We know that we need an argument. */
355 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
357 switch (nowtok)
359 case tok_code_set_name:
360 case tok_repertoiremap:
361 if (arg->tok != tok_ident && arg->tok != tok_string)
363 badarg:
364 lr_error (cmfile, _("syntax error in prolog: %s"),
365 _("bad argument"));
367 lr_ignore_rest (cmfile, 0);
368 continue;
371 if (nowtok == tok_code_set_name)
372 result->code_set_name = obstack_copy0 (&result->mem_pool,
373 arg->val.str.startmb,
374 arg->val.str.lenmb);
375 else
376 result->repertoiremap = obstack_copy0 (&result->mem_pool,
377 arg->val.str.startmb,
378 arg->val.str.lenmb);
380 lr_ignore_rest (cmfile, 1);
381 continue;
383 case tok_mb_cur_max:
384 case tok_mb_cur_min:
385 if (arg->tok != tok_number)
386 goto badarg;
388 if (verbose
389 && ((nowtok == tok_mb_cur_max
390 && result->mb_cur_max != 0)
391 || (nowtok == tok_mb_cur_max
392 && result->mb_cur_max != 0)))
393 lr_error (cmfile, _("duplicate definition of <%s>"),
394 nowtok == tok_mb_cur_min
395 ? "mb_cur_min" : "mb_cur_max");
397 if (arg->val.num < 1)
399 lr_error (cmfile,
400 _("value for <%s> must be 1 or greater"),
401 nowtok == tok_mb_cur_min
402 ? "mb_cur_min" : "mb_cur_max");
404 lr_ignore_rest (cmfile, 0);
405 continue;
407 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
408 && (int) arg->val.num < result->mb_cur_min)
409 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
410 && (int) arg->val.num > result->mb_cur_max))
412 lr_error (cmfile, _("\
413 value of <%s> must be greater or equal than the value of <%s>"),
414 "mb_cur_max", "mb_cur_min");
416 lr_ignore_rest (cmfile, 0);
417 continue;
420 if (nowtok == tok_mb_cur_max)
421 result->mb_cur_max = arg->val.num;
422 else
423 result->mb_cur_min = arg->val.num;
425 lr_ignore_rest (cmfile, 1);
426 continue;
428 case tok_escape_char:
429 case tok_comment_char:
430 if (arg->tok != tok_ident)
431 goto badarg;
433 if (arg->val.str.lenmb != 1)
435 lr_error (cmfile, _("\
436 argument to <%s> must be a single character"),
437 nowtok == tok_escape_char ? "escape_char"
438 : "comment_char");
440 lr_ignore_rest (cmfile, 0);
441 continue;
444 if (nowtok == tok_escape_char)
445 cmfile->escape_char = *arg->val.str.startmb;
446 else
447 cmfile->comment_char = *arg->val.str.startmb;
449 lr_ignore_rest (cmfile, 1);
450 continue;
452 case tok_g0esc:
453 case tok_g1esc:
454 case tok_g2esc:
455 case tok_g3esc:
456 case tok_escseq:
457 lr_ignore_rest (cmfile, 0); /* XXX */
458 continue;
460 case tok_include:
461 lr_error (cmfile, _("\
462 character sets with locking states are not supported"));
463 exit (4);
465 default:
466 /* Cannot happen. */
467 assert (! "Should not happen");
469 break;
471 case 2:
472 /* We have seen `CHARMAP' and now are in the body. Each line
473 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
474 if (nowtok == tok_eol)
475 /* Ignore empty lines. */
476 continue;
478 if (nowtok == tok_end)
480 expected_tok = tok_charmap;
481 expected_str = "CHARMAP";
482 state = 90;
483 continue;
486 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
488 lr_error (cmfile, _("syntax error in %s definition: %s"),
489 "CHARMAP", _("no symbolic name given"));
491 lr_ignore_rest (cmfile, 0);
492 continue;
495 /* If the previous line was not completely correct free the
496 used memory. */
497 if (from_name != NULL)
498 obstack_free (&result->mem_pool, from_name);
500 if (nowtok == tok_bsymbol)
501 from_name = (char *) obstack_copy0 (&result->mem_pool,
502 now->val.str.startmb,
503 now->val.str.lenmb);
504 else
506 obstack_printf (&result->mem_pool, "U%08X",
507 cmfile->token.val.ucs4);
508 obstack_1grow (&result->mem_pool, '\0');
509 from_name = (char *) obstack_finish (&result->mem_pool);
511 to_name = NULL;
513 state = 3;
514 continue;
516 case 3:
517 /* We have two possibilities: We can see an ellipsis or an
518 encoding value. */
519 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
520 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
521 || nowtok == tok_ellipsis2_2)
523 ellipsis = nowtok;
524 if (nowtok == tok_ellipsis4_2)
526 step = 2;
527 nowtok = tok_ellipsis4;
529 else if (nowtok == tok_ellipsis2_2)
531 step = 2;
532 nowtok = tok_ellipsis2;
534 state = 4;
535 continue;
537 /* FALLTHROUGH */
539 case 5:
540 if (nowtok != tok_charcode)
542 lr_error (cmfile, _("syntax error in %s definition: %s"),
543 "CHARMAP", _("invalid encoding given"));
545 lr_ignore_rest (cmfile, 0);
547 state = 2;
548 continue;
551 if (now->val.charcode.nbytes < result->mb_cur_min)
552 lr_error (cmfile, _("too few bytes in character encoding"));
553 else if (now->val.charcode.nbytes > result->mb_cur_max)
554 lr_error (cmfile, _("too many bytes in character encoding"));
555 else
556 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
557 now->val.charcode.bytes, from_name, to_name,
558 ellipsis != tok_ellipsis2, step);
560 /* Ignore trailing comment silently. */
561 lr_ignore_rest (cmfile, 0);
563 from_name = NULL;
564 to_name = NULL;
565 ellipsis = tok_none;
566 step = 1;
568 state = 2;
569 continue;
571 case 4:
572 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
574 lr_error (cmfile, _("syntax error in %s definition: %s"),
575 "CHARMAP",
576 _("no symbolic name given for end of range"));
578 lr_ignore_rest (cmfile, 0);
579 continue;
582 /* Copy the to-name in a safe place. */
583 if (nowtok == tok_bsymbol)
584 to_name = (char *) obstack_copy0 (&result->mem_pool,
585 cmfile->token.val.str.startmb,
586 cmfile->token.val.str.lenmb);
587 else
589 obstack_printf (&result->mem_pool, "U%08X",
590 cmfile->token.val.ucs4);
591 obstack_1grow (&result->mem_pool, '\0');
592 to_name = (char *) obstack_finish (&result->mem_pool);
595 state = 5;
596 continue;
598 case 90:
599 if (nowtok != expected_tok)
600 lr_error (cmfile, _("\
601 `%1$s' definition does not end with `END %1$s'"), expected_str);
603 lr_ignore_rest (cmfile, nowtok == expected_tok);
604 state = 91;
605 continue;
607 case 91:
608 /* Waiting for WIDTH... */
609 if (nowtok == tok_eol)
610 /* Ignore empty lines. */
611 continue;
613 if (nowtok == tok_width_default)
615 state = 92;
616 continue;
619 if (nowtok == tok_width)
621 lr_ignore_rest (cmfile, 1);
622 state = 93;
623 continue;
626 if (nowtok == tok_width_variable)
628 lr_ignore_rest (cmfile, 1);
629 state = 98;
630 continue;
633 lr_error (cmfile, _("\
634 only WIDTH definitions are allowed to follow the CHARMAP definition"));
636 lr_ignore_rest (cmfile, 0);
637 continue;
639 case 92:
640 if (nowtok != tok_number)
641 lr_error (cmfile, _("value for %s must be an integer"),
642 "WIDTH_DEFAULT");
643 else
644 result->width_default = now->val.num;
646 lr_ignore_rest (cmfile, nowtok == tok_number);
648 state = 91;
649 continue;
651 case 93:
652 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
653 "%s...%s %d\n". */
654 if (nowtok == tok_eol)
655 /* ignore empty lines. */
656 continue;
658 if (nowtok == tok_end)
660 expected_tok = tok_width;
661 expected_str = "WIDTH";
662 state = 90;
663 continue;
666 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
668 lr_error (cmfile, _("syntax error in %s definition: %s"),
669 "WIDTH", _("no symbolic name given"));
671 lr_ignore_rest (cmfile, 0);
672 continue;
675 if (from_name != NULL)
676 obstack_free (&result->mem_pool, from_name);
678 if (nowtok == tok_bsymbol)
679 from_name = (char *) obstack_copy0 (&result->mem_pool,
680 now->val.str.startmb,
681 now->val.str.lenmb);
682 else
684 obstack_printf (&result->mem_pool, "U%08X",
685 cmfile->token.val.ucs4);
686 obstack_1grow (&result->mem_pool, '\0');
687 from_name = (char *) obstack_finish (&result->mem_pool);
690 to_name = NULL;
692 state = 94;
693 continue;
695 case 94:
696 if (nowtok == tok_ellipsis3)
698 state = 95;
699 continue;
702 case 96:
703 if (nowtok != tok_number)
704 lr_error (cmfile, _("value for %s must be an integer"),
705 "WIDTH");
706 else
708 /* Store width for chars. */
709 new_width (cmfile, result, from_name, to_name, now->val.num);
711 from_name = NULL;
712 to_name = NULL;
715 lr_ignore_rest (cmfile, nowtok == tok_number);
717 state = 93;
718 continue;
720 case 95:
721 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
723 lr_error (cmfile, _("syntax error in %s definition: %s"),
724 "WIDTH", _("no symbolic name given for end of range"));
726 lr_ignore_rest (cmfile, 0);
728 state = 93;
729 continue;
732 if (nowtok == tok_bsymbol)
733 to_name = (char *) obstack_copy0 (&result->mem_pool,
734 now->val.str.startmb,
735 now->val.str.lenmb);
736 else
738 obstack_printf (&result->mem_pool, "U%08X",
739 cmfile->token.val.ucs4);
740 obstack_1grow (&result->mem_pool, '\0');
741 to_name = (char *) obstack_finish (&result->mem_pool);
744 state = 96;
745 continue;
747 case 98:
748 /* We now expect `END WIDTH_VARIABLE' or lines of the format
749 "%s\n" or "%s...%s\n". */
750 if (nowtok == tok_eol)
751 /* ignore empty lines. */
752 continue;
754 if (nowtok == tok_end)
756 expected_tok = tok_width_variable;
757 expected_str = "WIDTH_VARIABLE";
758 state = 90;
759 continue;
762 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
764 lr_error (cmfile, _("syntax error in %s definition: %s"),
765 "WIDTH_VARIABLE", _("no symbolic name given"));
767 lr_ignore_rest (cmfile, 0);
769 continue;
772 if (from_name != NULL)
773 obstack_free (&result->mem_pool, from_name);
775 if (nowtok == tok_bsymbol)
776 from_name = (char *) obstack_copy0 (&result->mem_pool,
777 now->val.str.startmb,
778 now->val.str.lenmb);
779 else
781 obstack_printf (&result->mem_pool, "U%08X",
782 cmfile->token.val.ucs4);
783 obstack_1grow (&result->mem_pool, '\0');
784 from_name = (char *) obstack_finish (&result->mem_pool);
786 to_name = NULL;
788 state = 99;
789 continue;
791 case 99:
792 if (nowtok == tok_ellipsis3)
793 state = 100;
795 /* Store info. */
796 from_name = NULL;
798 /* Warn */
799 state = 98;
800 continue;
802 case 100:
803 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
805 lr_error (cmfile, _("syntax error in %s definition: %s"),
806 "WIDTH_VARIABLE",
807 _("no symbolic name given for end of range"));
808 lr_ignore_rest (cmfile, 0);
809 continue;
812 if (nowtok == tok_bsymbol)
813 to_name = (char *) obstack_copy0 (&result->mem_pool,
814 now->val.str.startmb,
815 now->val.str.lenmb);
816 else
818 obstack_printf (&result->mem_pool, "U%08X",
819 cmfile->token.val.ucs4);
820 obstack_1grow (&result->mem_pool, '\0');
821 to_name = (char *) obstack_finish (&result->mem_pool);
824 /* XXX Enter value into table. */
826 lr_ignore_rest (cmfile, 1);
828 state = 98;
829 continue;
831 default:
832 error (5, 0, _("%s: error in state machine"), __FILE__);
833 /* NOTREACHED */
835 break;
838 if (state != 91 && !be_quiet)
839 error (0, 0, _("%s: premature end of file"), cmfile->fname);
841 lr_close (cmfile);
843 return result;
847 static void
848 new_width (struct linereader *cmfile, struct charmap_t *result,
849 const char *from, const char *to, unsigned long int width)
851 struct charseq *from_val;
852 struct charseq *to_val;
854 from_val = charmap_find_value (result, from, strlen (from));
855 if (from_val == NULL)
857 lr_error (cmfile, _("unknown character `%s'"), from);
858 return;
861 if (to == NULL)
862 to_val = from_val;
863 else
865 to_val = charmap_find_value (result, to, strlen (to));
866 if (to_val == NULL)
868 lr_error (cmfile, _("unknown character `%s'"), to);
869 return;
873 if (result->nwidth_rules >= result->nwidth_rules_max)
875 size_t new_size = result->nwidth_rules + 32;
876 struct width_rule *new_rules =
877 (struct width_rule *) obstack_alloc (&result->mem_pool,
878 (new_size
879 * sizeof (struct width_rule)));
881 memcpy (new_rules, result->width_rules,
882 result->nwidth_rules_max * sizeof (struct width_rule));
884 result->width_rules = new_rules;
885 result->nwidth_rules_max = new_size;
888 result->width_rules[result->nwidth_rules].from = from_val;
889 result->width_rules[result->nwidth_rules].to = to_val;
890 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
891 ++result->nwidth_rules;
895 struct charseq *
896 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
898 void *result;
900 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
901 < 0 ? NULL : (struct charseq *) result);
905 static void
906 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
907 int nbytes, char *bytes, const char *from, const char *to,
908 int decimal_ellipsis, int step)
910 hash_table *ht = &cm->char_table;
911 hash_table *bt = &cm->byte_table;
912 struct obstack *ob = &cm->mem_pool;
913 char *from_end;
914 char *to_end;
915 const char *cp;
916 int prefix_len, len1, len2;
917 unsigned int from_nr, to_nr, cnt;
918 struct charseq *newp;
920 len1 = strlen (from);
922 if (to == NULL)
924 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
925 newp->nbytes = nbytes;
926 memcpy (newp->bytes, bytes, nbytes);
927 newp->name = from;
929 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
930 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
932 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
933 xxxx and xxxxxxxx are hexadecimal numbers. In this case
934 we use the value of xxxx or xxxxxxxx as the UCS4 value of
935 this character and we don't have to consult the repertoire
936 map.
938 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
939 and xxxxxxxx also give the code point in UCS4 but this must
940 be in the private, i.e., unassigned, area. This should be
941 used for characters which do not (yet) have an equivalent
942 in ISO 10646 and Unicode. */
943 char *endp;
945 errno = 0;
946 newp->ucs4 = strtoul (from + 1, &endp, 16);
947 if (endp - from != len1
948 || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
949 || newp->ucs4 >= 0x80000000)
950 /* This wasn't successful. Signal this name cannot be a
951 correct UCS value. */
952 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
955 insert_entry (ht, from, len1, newp);
956 insert_entry (bt, newp->bytes, nbytes, newp);
957 /* Please note that it isn't a bug if a symbol is defined more
958 than once. All later definitions are simply discarded. */
959 return;
962 /* We have a range: the names must have names with equal prefixes
963 and an equal number of digits, where the second number is greater
964 or equal than the first. */
965 len2 = strlen (to);
967 if (len1 != len2)
969 illegal_range:
970 lr_error (lr, _("invalid names for character range"));
971 return;
974 cp = &from[len1 - 1];
975 if (decimal_ellipsis)
976 while (isdigit (*cp) && cp >= from)
977 --cp;
978 else
979 while (isxdigit (*cp) && cp >= from)
981 if (!isdigit (*cp) && !isupper (*cp))
982 lr_error (lr, _("\
983 hexadecimal range format should use only capital characters"));
984 --cp;
987 prefix_len = (cp - from) + 1;
989 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
990 goto illegal_range;
992 errno = 0;
993 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
994 if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
995 || ((to_nr = strtoul (&to[prefix_len], &to_end,
996 decimal_ellipsis ? 10 : 16)) == ULONG_MAX
997 && errno == ERANGE)
998 || *to_end != '\0')
1000 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1001 return;
1004 if (from_nr > to_nr)
1006 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1007 return;
1010 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1012 char *name_end;
1013 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1014 prefix_len, from, len1 - prefix_len, cnt);
1015 obstack_1grow (ob, '\0');
1016 name_end = obstack_finish (ob);
1018 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1019 newp->nbytes = nbytes;
1020 memcpy (newp->bytes, bytes, nbytes);
1021 newp->name = name_end;
1023 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1024 if ((name_end[0] == 'U' || name_end[0] == 'P')
1025 && (len1 == 5 || len1 == 9))
1027 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1028 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1029 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1030 this character and we don't have to consult the repertoire
1031 map.
1033 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1034 and xxxxxxxx also give the code point in UCS4 but this must
1035 be in the private, i.e., unassigned, area. This should be
1036 used for characters which do not (yet) have an equivalent
1037 in ISO 10646 and Unicode. */
1038 char *endp;
1040 errno = 0;
1041 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1042 if (endp - name_end != len1
1043 || (newp->ucs4 == ULONG_MAX && errno == ERANGE)
1044 || newp->ucs4 >= 0x80000000)
1045 /* This wasn't successful. Signal this name cannot be a
1046 correct UCS value. */
1047 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1050 insert_entry (ht, name_end, len1, newp);
1051 insert_entry (bt, newp->bytes, nbytes, newp);
1052 /* Please note we don't examine the return value since it is no error
1053 if we have two definitions for a symbol. */
1055 /* Increment the value in the byte sequence. */
1056 if (++bytes[nbytes - 1] == '\0')
1058 int b = nbytes - 2;
1061 if (b < 0)
1063 lr_error (lr,
1064 _("resulting bytes for range not representable."));
1065 return;
1067 while (++bytes[b--] == 0);
1073 struct charseq *
1074 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1075 size_t nbytes)
1077 void *result;
1079 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1080 < 0 ? NULL : (struct charseq *) result);