2.5-18.1
[glibc.git] / locale / programs / charmap.c
blob52a69de4b04a593c9195b3eb1703f4bb47912776
1 /* Copyright (C) 1996, 1998-2004,2005, 2006 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License version 2 as
7 published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <ctype.h>
23 #include <errno.h>
24 #include <libintl.h>
25 #include <limits.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <error.h>
31 #include "localedef.h"
32 #include "linereader.h"
33 #include "charmap.h"
34 #include "charmap-dir.h"
36 #include <assert.h>
39 /* Define the lookup function. */
40 #include "charmap-kw.h"
43 /* Prototypes for local functions. */
/* Parses an already opened charmap description; defined later in this
   file.  */
44 static struct charmap_t *parse_charmap (struct linereader *cmfile,
45 int verbose, int be_quiet);
/* Records one WIDTH rule for the character FROM or the range FROM..TO.  */
46 static void new_width (struct linereader *cmfile, struct charmap_t *result,
47 const char *from, const char *to,
48 unsigned long int width);
/* Enters one character, or a whole range when TO is not NULL, into the
   name and byte-sequence tables of CM.  */
49 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
50 size_t nbytes, unsigned char *bytes,
51 const char *from, const char *to,
52 int decimal_ellipsis, int step);
/* Set to true by charmap_read when the loaded charmap fails its ASCII
   compatibility test.  Nothing in this file ever resets it.  */
55 bool enc_not_ascii_compatible;
/* NOTE(review): null_pointer is not referenced in the visible part of
   this file; presumably it is used by code pulled in when
   NEED_NULL_POINTER is defined -- confirm.  */
58 #ifdef NEED_NULL_POINTER
59 static const char *null_pointer;
60 #endif
62 static struct linereader *
63 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
65 FILE *fp;
67 fp = charmap_open (directory, name);
68 if (fp == NULL)
69 return NULL;
70 else
72 size_t dlen = strlen (directory);
73 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
74 size_t nlen = strlen (name);
75 char *pathname;
76 char *p;
78 pathname = alloca (dlen + add_slash + nlen + 1);
79 p = stpcpy (pathname, directory);
80 if (add_slash)
81 *p++ = '/';
82 stpcpy (p, name);
84 return lr_create (fp, pathname, hf);
88 struct charmap_t *
89 charmap_read (const char *filename, int verbose, int error_not_found,
90 int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
112 char *next;
113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
121 if (cmfile == NULL)
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
127 if (cmfile == NULL)
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
133 if (cmfile != NULL)
134 result = parse_charmap (cmfile, verbose, be_quiet);
136 if (result == NULL && error_not_found)
137 WITH_CUR_LOCALE (error (0, errno, _("\
138 character map file `%s' not found"), filename));
141 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
143 /* OK, one more try. We also accept the names given to the
144 character sets in the files. Sometimes they differ from the
145 file name. */
146 CHARMAP_DIR *dir;
148 dir = charmap_opendir (CHARMAP_PATH);
149 if (dir != NULL)
151 const char *dirent;
153 while ((dirent = charmap_readdir (dir)) != NULL)
155 char **aliases;
156 char **p;
157 int found;
159 aliases = charmap_aliases (CHARMAP_PATH, dirent);
160 found = 0;
161 for (p = aliases; *p; p++)
162 if (strcasecmp (*p, filename) == 0)
164 found = 1;
165 break;
167 charmap_free_aliases (aliases);
169 if (found)
171 struct linereader *cmfile;
173 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
174 if (cmfile != NULL)
175 result = parse_charmap (cmfile, verbose, be_quiet);
177 break;
181 charmap_closedir (dir);
185 if (result == NULL && DEFAULT_CHARMAP != NULL)
187 struct linereader *cmfile;
189 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
190 if (cmfile != NULL)
191 result = parse_charmap (cmfile, verbose, be_quiet);
193 if (result == NULL)
194 WITH_CUR_LOCALE (error (4, errno, _("\
195 default character map file `%s' not found"), DEFAULT_CHARMAP));
198 if (result != NULL && result->code_set_name == NULL)
199 /* The input file does not specify a code set name. This
200 shouldn't happen but we should cope with it. */
201 result->code_set_name = basename (filename);
203 /* Test of ASCII compatibility of locale encoding.
205 Verify that the encoding to be used in a locale is ASCII compatible,
206 at least for the graphic characters, excluding the control characters,
207 '$' and '@'. This constraint comes from an ISO C 99 restriction.
209 ISO C 99 section 7.17.(2) (about wchar_t):
210 the null character shall have the code value zero and each member of
211 the basic character set shall have a code value equal to its value
212 when used as the lone character in an integer character constant.
213 ISO C 99 section 5.2.1.(3):
214 Both the basic source and basic execution character sets shall have
215 the following members: the 26 uppercase letters of the Latin alphabet
216 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
217 the 26 lowercase letters of the Latin alphabet
218 a b c d e f g h i j k l m n o p q r s t u v w x y z
219 the 10 decimal digits
220 0 1 2 3 4 5 6 7 8 9
221 the following 29 graphic characters
222 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
223 the space character, and control characters representing horizontal
224 tab, vertical tab, and form feed.
226 Therefore, for all members of the "basic character set", the 'char' code
227 must have the same value as the 'wchar_t' code, which in glibc is the
228 same as the Unicode code, which for all of the enumerated characters
229 is identical to the ASCII code. */
230 if (result != NULL && use_default)
232 static const char basic_charset[] =
234 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
235 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
236 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
237 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
238 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
239 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
240 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
241 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
243 int failed = 0;
244 const char *p = basic_charset;
248 struct charseq *seq = charmap_find_symbol (result, p, 1);
250 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
251 failed = 1;
253 while (*p++ != '\0');
255 if (failed)
257 WITH_CUR_LOCALE (fprintf (stderr, _("\
258 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
259 result->code_set_name));
260 enc_not_ascii_compatible = true;
264 return result;
268 static struct charmap_t *
269 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
271 struct charmap_t *result;
272 int state;
273 enum token_t expected_tok = tok_error;
274 const char *expected_str = NULL;
275 char *from_name = NULL;
276 char *to_name = NULL;
277 enum token_t ellipsis = 0;
278 int step = 1;
280 /* We don't want symbolic names in string to be translated. */
281 cmfile->translate_strings = 0;
283 /* Allocate room for result. */
284 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
285 memset (result, '\0', sizeof (struct charmap_t));
286 /* The default DEFAULT_WIDTH is 1. */
287 result->width_default = 1;
289 #define obstack_chunk_alloc malloc
290 #define obstack_chunk_free free
291 obstack_init (&result->mem_pool);
293 if (init_hash (&result->char_table, 256)
294 || init_hash (&result->byte_table, 256))
296 free (result);
297 return NULL;
300 /* We use a state machine to describe the charmap description file
301 format. */
302 state = 1;
303 while (1)
305 /* What's on? */
306 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
307 enum token_t nowtok = now->tok;
308 struct token *arg;
310 if (nowtok == tok_eof)
311 break;
313 switch (state)
315 case 1:
316 /* The beginning. We expect the special declarations, EOL or
317 `CHARMAP'. */
318 if (nowtok == tok_eol)
319 /* Ignore empty lines. */
320 continue;
322 if (nowtok == tok_charmap)
324 from_name = NULL;
325 to_name = NULL;
327 /* We have to set up the real work. Fill in some
328 default values. */
329 if (result->mb_cur_max == 0)
330 result->mb_cur_max = 1;
331 if (result->mb_cur_min == 0)
332 result->mb_cur_min = result->mb_cur_max;
333 if (result->mb_cur_min > result->mb_cur_max)
335 if (!be_quiet)
336 WITH_CUR_LOCALE (error (0, 0, _("\
337 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
338 cmfile->fname));
340 result->mb_cur_min = result->mb_cur_max;
343 lr_ignore_rest (cmfile, 1);
345 state = 2;
346 continue;
349 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
350 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
351 && nowtok != tok_comment_char && nowtok != tok_g0esc
352 && nowtok != tok_g1esc && nowtok != tok_g2esc
353 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
354 && nowtok != tok_include)
356 lr_error (cmfile, _("syntax error in prolog: %s"),
357 _("invalid definition"));
359 lr_ignore_rest (cmfile, 0);
360 continue;
363 /* We know that we need an argument. */
364 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
366 switch (nowtok)
368 case tok_code_set_name:
369 case tok_repertoiremap:
370 if (arg->tok != tok_ident && arg->tok != tok_string)
372 badarg:
373 lr_error (cmfile, _("syntax error in prolog: %s"),
374 _("bad argument"));
376 lr_ignore_rest (cmfile, 0);
377 continue;
380 if (nowtok == tok_code_set_name)
381 result->code_set_name = obstack_copy0 (&result->mem_pool,
382 arg->val.str.startmb,
383 arg->val.str.lenmb);
384 else
385 result->repertoiremap = obstack_copy0 (&result->mem_pool,
386 arg->val.str.startmb,
387 arg->val.str.lenmb);
389 lr_ignore_rest (cmfile, 1);
390 continue;
392 case tok_mb_cur_max:
393 case tok_mb_cur_min:
394 if (arg->tok != tok_number)
395 goto badarg;
397 if (verbose
398 && ((nowtok == tok_mb_cur_max
399 && result->mb_cur_max != 0)
400 || (nowtok == tok_mb_cur_max
401 && result->mb_cur_max != 0)))
402 lr_error (cmfile, _("duplicate definition of <%s>"),
403 nowtok == tok_mb_cur_min
404 ? "mb_cur_min" : "mb_cur_max");
406 if (arg->val.num < 1)
408 lr_error (cmfile,
409 _("value for <%s> must be 1 or greater"),
410 nowtok == tok_mb_cur_min
411 ? "mb_cur_min" : "mb_cur_max");
413 lr_ignore_rest (cmfile, 0);
414 continue;
416 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
417 && (int) arg->val.num < result->mb_cur_min)
418 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
419 && (int) arg->val.num > result->mb_cur_max))
421 lr_error (cmfile, _("\
422 value of <%s> must be greater or equal than the value of <%s>"),
423 "mb_cur_max", "mb_cur_min");
425 lr_ignore_rest (cmfile, 0);
426 continue;
429 if (nowtok == tok_mb_cur_max)
430 result->mb_cur_max = arg->val.num;
431 else
432 result->mb_cur_min = arg->val.num;
434 lr_ignore_rest (cmfile, 1);
435 continue;
437 case tok_escape_char:
438 case tok_comment_char:
439 if (arg->tok != tok_ident)
440 goto badarg;
442 if (arg->val.str.lenmb != 1)
444 lr_error (cmfile, _("\
445 argument to <%s> must be a single character"),
446 nowtok == tok_escape_char ? "escape_char"
447 : "comment_char");
449 lr_ignore_rest (cmfile, 0);
450 continue;
453 if (nowtok == tok_escape_char)
454 cmfile->escape_char = *arg->val.str.startmb;
455 else
456 cmfile->comment_char = *arg->val.str.startmb;
458 lr_ignore_rest (cmfile, 1);
459 continue;
461 case tok_g0esc:
462 case tok_g1esc:
463 case tok_g2esc:
464 case tok_g3esc:
465 case tok_escseq:
466 lr_ignore_rest (cmfile, 0); /* XXX */
467 continue;
469 case tok_include:
470 lr_error (cmfile, _("\
471 character sets with locking states are not supported"));
472 exit (4);
474 default:
475 /* Cannot happen. */
476 assert (! "Should not happen");
478 break;
480 case 2:
481 /* We have seen `CHARMAP' and now are in the body. Each line
482 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
483 if (nowtok == tok_eol)
484 /* Ignore empty lines. */
485 continue;
487 if (nowtok == tok_end)
489 expected_tok = tok_charmap;
490 expected_str = "CHARMAP";
491 state = 90;
492 continue;
495 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
497 lr_error (cmfile, _("syntax error in %s definition: %s"),
498 "CHARMAP", _("no symbolic name given"));
500 lr_ignore_rest (cmfile, 0);
501 continue;
504 /* If the previous line was not completely correct free the
505 used memory. */
506 if (from_name != NULL)
507 obstack_free (&result->mem_pool, from_name);
509 if (nowtok == tok_bsymbol)
510 from_name = (char *) obstack_copy0 (&result->mem_pool,
511 now->val.str.startmb,
512 now->val.str.lenmb);
513 else
515 obstack_printf (&result->mem_pool, "U%08X",
516 cmfile->token.val.ucs4);
517 obstack_1grow (&result->mem_pool, '\0');
518 from_name = (char *) obstack_finish (&result->mem_pool);
520 to_name = NULL;
522 state = 3;
523 continue;
525 case 3:
526 /* We have two possibilities: We can see an ellipsis or an
527 encoding value. */
528 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
529 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
530 || nowtok == tok_ellipsis2_2)
532 ellipsis = nowtok;
533 if (nowtok == tok_ellipsis4_2)
535 step = 2;
536 nowtok = tok_ellipsis4;
538 else if (nowtok == tok_ellipsis2_2)
540 step = 2;
541 nowtok = tok_ellipsis2;
543 state = 4;
544 continue;
546 /* FALLTHROUGH */
548 case 5:
549 if (nowtok != tok_charcode)
551 lr_error (cmfile, _("syntax error in %s definition: %s"),
552 "CHARMAP", _("invalid encoding given"));
554 lr_ignore_rest (cmfile, 0);
556 state = 2;
557 continue;
560 if (now->val.charcode.nbytes < result->mb_cur_min)
561 lr_error (cmfile, _("too few bytes in character encoding"));
562 else if (now->val.charcode.nbytes > result->mb_cur_max)
563 lr_error (cmfile, _("too many bytes in character encoding"));
564 else
565 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
566 now->val.charcode.bytes, from_name, to_name,
567 ellipsis != tok_ellipsis2, step);
569 /* Ignore trailing comment silently. */
570 lr_ignore_rest (cmfile, 0);
572 from_name = NULL;
573 to_name = NULL;
574 ellipsis = tok_none;
575 step = 1;
577 state = 2;
578 continue;
580 case 4:
581 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
583 lr_error (cmfile, _("syntax error in %s definition: %s"),
584 "CHARMAP",
585 _("no symbolic name given for end of range"));
587 lr_ignore_rest (cmfile, 0);
588 continue;
591 /* Copy the to-name in a safe place. */
592 if (nowtok == tok_bsymbol)
593 to_name = (char *) obstack_copy0 (&result->mem_pool,
594 cmfile->token.val.str.startmb,
595 cmfile->token.val.str.lenmb);
596 else
598 obstack_printf (&result->mem_pool, "U%08X",
599 cmfile->token.val.ucs4);
600 obstack_1grow (&result->mem_pool, '\0');
601 to_name = (char *) obstack_finish (&result->mem_pool);
604 state = 5;
605 continue;
607 case 90:
608 if (nowtok != expected_tok)
609 lr_error (cmfile, _("\
610 `%1$s' definition does not end with `END %1$s'"), expected_str);
612 lr_ignore_rest (cmfile, nowtok == expected_tok);
613 state = 91;
614 continue;
616 case 91:
617 /* Waiting for WIDTH... */
618 if (nowtok == tok_eol)
619 /* Ignore empty lines. */
620 continue;
622 if (nowtok == tok_width_default)
624 state = 92;
625 continue;
628 if (nowtok == tok_width)
630 lr_ignore_rest (cmfile, 1);
631 state = 93;
632 continue;
635 if (nowtok == tok_width_variable)
637 lr_ignore_rest (cmfile, 1);
638 state = 98;
639 continue;
642 lr_error (cmfile, _("\
643 only WIDTH definitions are allowed to follow the CHARMAP definition"));
645 lr_ignore_rest (cmfile, 0);
646 continue;
648 case 92:
649 if (nowtok != tok_number)
650 lr_error (cmfile, _("value for %s must be an integer"),
651 "WIDTH_DEFAULT");
652 else
653 result->width_default = now->val.num;
655 lr_ignore_rest (cmfile, nowtok == tok_number);
657 state = 91;
658 continue;
660 case 93:
661 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
662 "%s...%s %d\n". */
663 if (nowtok == tok_eol)
664 /* ignore empty lines. */
665 continue;
667 if (nowtok == tok_end)
669 expected_tok = tok_width;
670 expected_str = "WIDTH";
671 state = 90;
672 continue;
675 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
677 lr_error (cmfile, _("syntax error in %s definition: %s"),
678 "WIDTH", _("no symbolic name given"));
680 lr_ignore_rest (cmfile, 0);
681 continue;
684 if (from_name != NULL)
685 obstack_free (&result->mem_pool, from_name);
687 if (nowtok == tok_bsymbol)
688 from_name = (char *) obstack_copy0 (&result->mem_pool,
689 now->val.str.startmb,
690 now->val.str.lenmb);
691 else
693 obstack_printf (&result->mem_pool, "U%08X",
694 cmfile->token.val.ucs4);
695 obstack_1grow (&result->mem_pool, '\0');
696 from_name = (char *) obstack_finish (&result->mem_pool);
699 to_name = NULL;
701 state = 94;
702 continue;
704 case 94:
705 if (nowtok == tok_ellipsis3)
707 state = 95;
708 continue;
711 case 96:
712 if (nowtok != tok_number)
713 lr_error (cmfile, _("value for %s must be an integer"),
714 "WIDTH");
715 else
717 /* Store width for chars. */
718 new_width (cmfile, result, from_name, to_name, now->val.num);
720 from_name = NULL;
721 to_name = NULL;
724 lr_ignore_rest (cmfile, nowtok == tok_number);
726 state = 93;
727 continue;
729 case 95:
730 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
732 lr_error (cmfile, _("syntax error in %s definition: %s"),
733 "WIDTH", _("no symbolic name given for end of range"));
735 lr_ignore_rest (cmfile, 0);
737 state = 93;
738 continue;
741 if (nowtok == tok_bsymbol)
742 to_name = (char *) obstack_copy0 (&result->mem_pool,
743 now->val.str.startmb,
744 now->val.str.lenmb);
745 else
747 obstack_printf (&result->mem_pool, "U%08X",
748 cmfile->token.val.ucs4);
749 obstack_1grow (&result->mem_pool, '\0');
750 to_name = (char *) obstack_finish (&result->mem_pool);
753 state = 96;
754 continue;
756 case 98:
757 /* We now expect `END WIDTH_VARIABLE' or lines of the format
758 "%s\n" or "%s...%s\n". */
759 if (nowtok == tok_eol)
760 /* ignore empty lines. */
761 continue;
763 if (nowtok == tok_end)
765 expected_tok = tok_width_variable;
766 expected_str = "WIDTH_VARIABLE";
767 state = 90;
768 continue;
771 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
773 lr_error (cmfile, _("syntax error in %s definition: %s"),
774 "WIDTH_VARIABLE", _("no symbolic name given"));
776 lr_ignore_rest (cmfile, 0);
778 continue;
781 if (from_name != NULL)
782 obstack_free (&result->mem_pool, from_name);
784 if (nowtok == tok_bsymbol)
785 from_name = (char *) obstack_copy0 (&result->mem_pool,
786 now->val.str.startmb,
787 now->val.str.lenmb);
788 else
790 obstack_printf (&result->mem_pool, "U%08X",
791 cmfile->token.val.ucs4);
792 obstack_1grow (&result->mem_pool, '\0');
793 from_name = (char *) obstack_finish (&result->mem_pool);
795 to_name = NULL;
797 state = 99;
798 continue;
800 case 99:
801 if (nowtok == tok_ellipsis3)
802 state = 100;
804 /* Store info. */
805 from_name = NULL;
807 /* Warn */
808 state = 98;
809 continue;
811 case 100:
812 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
814 lr_error (cmfile, _("syntax error in %s definition: %s"),
815 "WIDTH_VARIABLE",
816 _("no symbolic name given for end of range"));
817 lr_ignore_rest (cmfile, 0);
818 continue;
821 if (nowtok == tok_bsymbol)
822 to_name = (char *) obstack_copy0 (&result->mem_pool,
823 now->val.str.startmb,
824 now->val.str.lenmb);
825 else
827 obstack_printf (&result->mem_pool, "U%08X",
828 cmfile->token.val.ucs4);
829 obstack_1grow (&result->mem_pool, '\0');
830 to_name = (char *) obstack_finish (&result->mem_pool);
833 /* XXX Enter value into table. */
835 lr_ignore_rest (cmfile, 1);
837 state = 98;
838 continue;
840 default:
841 WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
842 __FILE__));
843 /* NOTREACHED */
845 break;
848 if (state != 91 && !be_quiet)
849 WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
850 cmfile->fname));
852 lr_close (cmfile);
854 return result;
858 static void
859 new_width (struct linereader *cmfile, struct charmap_t *result,
860 const char *from, const char *to, unsigned long int width)
862 struct charseq *from_val;
863 struct charseq *to_val;
865 from_val = charmap_find_value (result, from, strlen (from));
866 if (from_val == NULL)
868 lr_error (cmfile, _("unknown character `%s'"), from);
869 return;
872 if (to == NULL)
873 to_val = from_val;
874 else
876 to_val = charmap_find_value (result, to, strlen (to));
877 if (to_val == NULL)
879 lr_error (cmfile, _("unknown character `%s'"), to);
880 return;
883 /* Make sure the number of bytes for the end points of the range
884 is correct. */
885 if (from_val->nbytes != to_val->nbytes)
887 lr_error (cmfile, _("\
888 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
889 from_val->nbytes, to_val->nbytes);
890 return;
894 if (result->nwidth_rules >= result->nwidth_rules_max)
896 size_t new_size = result->nwidth_rules + 32;
897 struct width_rule *new_rules =
898 (struct width_rule *) obstack_alloc (&result->mem_pool,
899 (new_size
900 * sizeof (struct width_rule)));
902 memcpy (new_rules, result->width_rules,
903 result->nwidth_rules_max * sizeof (struct width_rule));
905 result->width_rules = new_rules;
906 result->nwidth_rules_max = new_size;
909 result->width_rules[result->nwidth_rules].from = from_val;
910 result->width_rules[result->nwidth_rules].to = to_val;
911 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
912 ++result->nwidth_rules;
916 struct charseq *
917 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
919 void *result;
921 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
922 < 0 ? NULL : (struct charseq *) result);
926 static void
927 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
928 size_t nbytes, unsigned char *bytes,
929 const char *from, const char *to,
930 int decimal_ellipsis, int step)
932 hash_table *ht = &cm->char_table;
933 hash_table *bt = &cm->byte_table;
934 struct obstack *ob = &cm->mem_pool;
935 char *from_end;
936 char *to_end;
937 const char *cp;
938 int prefix_len, len1, len2;
939 unsigned int from_nr, to_nr, cnt;
940 struct charseq *newp;
942 len1 = strlen (from);
944 if (to == NULL)
946 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
947 newp->nbytes = nbytes;
948 memcpy (newp->bytes, bytes, nbytes);
949 newp->name = from;
951 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
952 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
954 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
955 xxxx and xxxxxxxx are hexadecimal numbers. In this case
956 we use the value of xxxx or xxxxxxxx as the UCS4 value of
957 this character and we don't have to consult the repertoire
958 map.
960 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
961 and xxxxxxxx also give the code point in UCS4 but this must
962 be in the private, i.e., unassigned, area. This should be
963 used for characters which do not (yet) have an equivalent
964 in ISO 10646 and Unicode. */
965 char *endp;
967 errno = 0;
968 newp->ucs4 = strtoul (from + 1, &endp, 16);
969 if (endp - from != len1
970 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
971 || newp->ucs4 >= 0x80000000)
972 /* This wasn't successful. Signal this name cannot be a
973 correct UCS value. */
974 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
977 insert_entry (ht, from, len1, newp);
978 insert_entry (bt, newp->bytes, nbytes, newp);
979 /* Please note that it isn't a bug if a symbol is defined more
980 than once. All later definitions are simply discarded. */
981 return;
984 /* We have a range: the names must have names with equal prefixes
985 and an equal number of digits, where the second number is greater
986 or equal than the first. */
987 len2 = strlen (to);
989 if (len1 != len2)
991 illegal_range:
992 lr_error (lr, _("invalid names for character range"));
993 return;
996 cp = &from[len1 - 1];
997 if (decimal_ellipsis)
998 while (isdigit (*cp) && cp >= from)
999 --cp;
1000 else
1001 while (isxdigit (*cp) && cp >= from)
1003 if (!isdigit (*cp) && !isupper (*cp))
1004 lr_error (lr, _("\
1005 hexadecimal range format should use only capital characters"));
1006 --cp;
1009 prefix_len = (cp - from) + 1;
1011 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1012 goto illegal_range;
1014 errno = 0;
1015 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1016 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1017 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1018 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1019 && errno == ERANGE)
1020 || *to_end != '\0')
1022 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1023 return;
1026 if (from_nr > to_nr)
1028 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1029 return;
1032 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1034 char *name_end;
1035 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1036 prefix_len, from, len1 - prefix_len, cnt);
1037 obstack_1grow (ob, '\0');
1038 name_end = obstack_finish (ob);
1040 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1041 newp->nbytes = nbytes;
1042 memcpy (newp->bytes, bytes, nbytes);
1043 newp->name = name_end;
1045 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1046 if ((name_end[0] == 'U' || name_end[0] == 'P')
1047 && (len1 == 5 || len1 == 9))
1049 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1050 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1051 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1052 this character and we don't have to consult the repertoire
1053 map.
1055 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1056 and xxxxxxxx also give the code point in UCS4 but this must
1057 be in the private, i.e., unassigned, area. This should be
1058 used for characters which do not (yet) have an equivalent
1059 in ISO 10646 and Unicode. */
1060 char *endp;
1062 errno = 0;
1063 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1064 if (endp - name_end != len1
1065 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1066 || newp->ucs4 >= 0x80000000)
1067 /* This wasn't successful. Signal this name cannot be a
1068 correct UCS value. */
1069 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1072 insert_entry (ht, name_end, len1, newp);
1073 insert_entry (bt, newp->bytes, nbytes, newp);
1074 /* Please note we don't examine the return value since it is no error
1075 if we have two definitions for a symbol. */
1077 /* Increment the value in the byte sequence. */
1078 if (++bytes[nbytes - 1] == '\0')
1080 int b = nbytes - 2;
1083 if (b < 0)
1085 lr_error (lr,
1086 _("resulting bytes for range not representable."));
1087 return;
1089 while (++bytes[b--] == 0);
1095 struct charseq *
1096 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1097 size_t nbytes)
1099 void *result;
1101 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1102 < 0 ? NULL : (struct charseq *) result);