(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[glibc.git] / locale / programs / charmap.c
blob8dbac6f5b90480f790312361341398c14d349027
1 /* Copyright (C) 1996, 1998-2002, 2003, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <ctype.h>
25 #include <errno.h>
26 #include <libintl.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <error.h>
33 #include "localedef.h"
34 #include "linereader.h"
35 #include "charmap.h"
36 #include "charmap-dir.h"
38 #include <assert.h>
41 /* Define the lookup function. */
42 #include "charmap-kw.h"
45 /* Prototypes for local functions. */
/* parse_charmap: runs the charmap description state machine over CMFILE
   and returns the resulting charmap (defined later in this file).  */
46 static struct charmap_t *parse_charmap (struct linereader *cmfile,
47 int verbose, int be_quiet);
/* new_width: records one WIDTH rule for the character FROM or the range
   FROM...TO in RESULT; unknown names are reported through CMFILE.  */
48 static void new_width (struct linereader *cmfile, struct charmap_t *result,
49 const char *from, const char *to,
50 unsigned long int width);
/* charmap_new_char: enters the character FROM, or every character of the
   range FROM...TO, with the NBYTES long encoding BYTES into CM.  */
51 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
52 int nbytes, char *bytes, const char *from,
53 const char *to, int decimal_ellipsis, int step);
/* Set to true by charmap_read when the charmap it loaded fails the
   ASCII compatibility test for the basic character set.  */
56 bool enc_not_ascii_compatible;
/* NOTE(review): null_pointer is not referenced anywhere in this file;
   presumably it is used by the code pulled in from "charmap-kw.h" when
   NEED_NULL_POINTER is defined -- confirm.  */
59 #ifdef NEED_NULL_POINTER
60 static const char *null_pointer;
61 #endif
63 static struct linereader *
64 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
66 FILE *fp;
68 fp = charmap_open (directory, name);
69 if (fp == NULL)
70 return NULL;
71 else
73 size_t dlen = strlen (directory);
74 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
75 size_t nlen = strlen (name);
76 char *pathname;
77 char *p;
79 pathname = alloca (dlen + add_slash + nlen + 1);
80 p = stpcpy (pathname, directory);
81 if (add_slash)
82 *p++ = '/';
83 stpcpy (p, name);
85 return lr_create (fp, pathname, hf);
89 struct charmap_t *
90 charmap_read (const char *filename, int verbose, int be_quiet, int use_default)
92 struct charmap_t *result = NULL;
94 if (filename != NULL)
96 struct linereader *cmfile;
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
112 char *next;
113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
121 if (cmfile == NULL)
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
127 if (cmfile == NULL)
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
133 if (cmfile != NULL)
135 result = parse_charmap (cmfile, verbose, be_quiet);
137 if (result == NULL && !be_quiet)
138 WITH_CUR_LOCALE (error (0, errno, _("\
139 character map file `%s' not found"), filename));
143 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
145 /* OK, one more try. We also accept the names given to the
146 character sets in the files. Sometimes they differ from the
147 file name. */
148 CHARMAP_DIR *dir;
150 dir = charmap_opendir (CHARMAP_PATH);
151 if (dir != NULL)
153 const char *dirent;
155 while ((dirent = charmap_readdir (dir)) != NULL)
157 char **aliases;
158 char **p;
159 int found;
161 aliases = charmap_aliases (CHARMAP_PATH, dirent);
162 found = 0;
163 for (p = aliases; *p; p++)
164 if (strcasecmp (*p, filename) == 0)
166 found = 1;
167 break;
169 charmap_free_aliases (aliases);
171 if (found)
173 struct linereader *cmfile;
175 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
176 if (cmfile != NULL)
177 result = parse_charmap (cmfile, verbose, be_quiet);
179 break;
183 charmap_closedir (dir);
187 if (result == NULL && DEFAULT_CHARMAP != NULL)
189 struct linereader *cmfile;
191 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
192 if (cmfile != NULL)
193 result = parse_charmap (cmfile, verbose, be_quiet);
195 if (result == NULL)
196 WITH_CUR_LOCALE (error (4, errno, _("\
197 default character map file `%s' not found"), DEFAULT_CHARMAP));
200 if (result != NULL && result->code_set_name == NULL)
201 /* The input file does not specify a code set name. This
202 shouldn't happen but we should cope with it. */
203 result->code_set_name = basename (filename);
205 /* Test of ASCII compatibility of locale encoding.
207 Verify that the encoding to be used in a locale is ASCII compatible,
208 at least for the graphic characters, excluding the control characters,
209 '$' and '@'. This constraint comes from an ISO C 99 restriction.
211 ISO C 99 section 7.17.(2) (about wchar_t):
212 the null character shall have the code value zero and each member of
213 the basic character set shall have a code value equal to its value
214 when used as the lone character in an integer character constant.
215 ISO C 99 section 5.2.1.(3):
216 Both the basic source and basic execution character sets shall have
217 the following members: the 26 uppercase letters of the Latin alphabet
218 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
219 the 26 lowercase letters of the Latin alphabet
220 a b c d e f g h i j k l m n o p q r s t u v w x y z
221 the 10 decimal digits
222 0 1 2 3 4 5 6 7 8 9
223 the following 29 graphic characters
224 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
225 the space character, and control characters representing horizontal
226 tab, vertical tab, and form feed.
228 Therefore, for all members of the "basic character set", the 'char' code
229 must have the same value as the 'wchar_t' code, which in glibc is the
230 same as the Unicode code, which for all of the enumerated characters
231 is identical to the ASCII code. */
232 if (result != NULL && use_default)
234 static const char basic_charset[] =
236 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
237 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
238 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
239 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
240 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
241 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
242 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
243 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
245 int failed = 0;
246 const char *p = basic_charset;
250 struct charseq *seq = charmap_find_symbol (result, p, 1);
252 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
253 failed = 1;
255 while (*p++ != '\0');
257 if (failed)
259 WITH_CUR_LOCALE (fprintf (stderr, _("\
260 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
261 result->code_set_name));
262 enc_not_ascii_compatible = true;
266 return result;
270 static struct charmap_t *
271 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
273 struct charmap_t *result;
274 int state;
275 enum token_t expected_tok = tok_error;
276 const char *expected_str = NULL;
277 char *from_name = NULL;
278 char *to_name = NULL;
279 enum token_t ellipsis = 0;
280 int step = 1;
282 /* We don't want symbolic names in string to be translated. */
283 cmfile->translate_strings = 0;
285 /* Allocate room for result. */
286 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
287 memset (result, '\0', sizeof (struct charmap_t));
288 /* The default DEFAULT_WIDTH is 1. */
289 result->width_default = 1;
291 #define obstack_chunk_alloc malloc
292 #define obstack_chunk_free free
293 obstack_init (&result->mem_pool);
295 if (init_hash (&result->char_table, 256)
296 || init_hash (&result->byte_table, 256))
298 free (result);
299 return NULL;
302 /* We use a state machine to describe the charmap description file
303 format. */
304 state = 1;
305 while (1)
307 /* What's on? */
308 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
309 enum token_t nowtok = now->tok;
310 struct token *arg;
312 if (nowtok == tok_eof)
313 break;
315 switch (state)
317 case 1:
318 /* The beginning. We expect the special declarations, EOL or
319 `CHARMAP'. */
320 if (nowtok == tok_eol)
321 /* Ignore empty lines. */
322 continue;
324 if (nowtok == tok_charmap)
326 from_name = NULL;
327 to_name = NULL;
329 /* We have to set up the real work. Fill in some
330 default values. */
331 if (result->mb_cur_max == 0)
332 result->mb_cur_max = 1;
333 if (result->mb_cur_min == 0)
334 result->mb_cur_min = result->mb_cur_max;
335 if (result->mb_cur_min > result->mb_cur_max)
337 if (!be_quiet)
338 WITH_CUR_LOCALE (error (0, 0, _("\
339 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
340 cmfile->fname));
342 result->mb_cur_min = result->mb_cur_max;
345 lr_ignore_rest (cmfile, 1);
347 state = 2;
348 continue;
351 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
352 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
353 && nowtok != tok_comment_char && nowtok != tok_g0esc
354 && nowtok != tok_g1esc && nowtok != tok_g2esc
355 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
356 && nowtok != tok_include)
358 lr_error (cmfile, _("syntax error in prolog: %s"),
359 _("invalid definition"));
361 lr_ignore_rest (cmfile, 0);
362 continue;
365 /* We know that we need an argument. */
366 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
368 switch (nowtok)
370 case tok_code_set_name:
371 case tok_repertoiremap:
372 if (arg->tok != tok_ident && arg->tok != tok_string)
374 badarg:
375 lr_error (cmfile, _("syntax error in prolog: %s"),
376 _("bad argument"));
378 lr_ignore_rest (cmfile, 0);
379 continue;
382 if (nowtok == tok_code_set_name)
383 result->code_set_name = obstack_copy0 (&result->mem_pool,
384 arg->val.str.startmb,
385 arg->val.str.lenmb);
386 else
387 result->repertoiremap = obstack_copy0 (&result->mem_pool,
388 arg->val.str.startmb,
389 arg->val.str.lenmb);
391 lr_ignore_rest (cmfile, 1);
392 continue;
394 case tok_mb_cur_max:
395 case tok_mb_cur_min:
396 if (arg->tok != tok_number)
397 goto badarg;
399 if (verbose
400 && ((nowtok == tok_mb_cur_max
401 && result->mb_cur_max != 0)
402 || (nowtok == tok_mb_cur_max
403 && result->mb_cur_max != 0)))
404 lr_error (cmfile, _("duplicate definition of <%s>"),
405 nowtok == tok_mb_cur_min
406 ? "mb_cur_min" : "mb_cur_max");
408 if (arg->val.num < 1)
410 lr_error (cmfile,
411 _("value for <%s> must be 1 or greater"),
412 nowtok == tok_mb_cur_min
413 ? "mb_cur_min" : "mb_cur_max");
415 lr_ignore_rest (cmfile, 0);
416 continue;
418 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
419 && (int) arg->val.num < result->mb_cur_min)
420 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
421 && (int) arg->val.num > result->mb_cur_max))
423 lr_error (cmfile, _("\
424 value of <%s> must be greater or equal than the value of <%s>"),
425 "mb_cur_max", "mb_cur_min");
427 lr_ignore_rest (cmfile, 0);
428 continue;
431 if (nowtok == tok_mb_cur_max)
432 result->mb_cur_max = arg->val.num;
433 else
434 result->mb_cur_min = arg->val.num;
436 lr_ignore_rest (cmfile, 1);
437 continue;
439 case tok_escape_char:
440 case tok_comment_char:
441 if (arg->tok != tok_ident)
442 goto badarg;
444 if (arg->val.str.lenmb != 1)
446 lr_error (cmfile, _("\
447 argument to <%s> must be a single character"),
448 nowtok == tok_escape_char ? "escape_char"
449 : "comment_char");
451 lr_ignore_rest (cmfile, 0);
452 continue;
455 if (nowtok == tok_escape_char)
456 cmfile->escape_char = *arg->val.str.startmb;
457 else
458 cmfile->comment_char = *arg->val.str.startmb;
460 lr_ignore_rest (cmfile, 1);
461 continue;
463 case tok_g0esc:
464 case tok_g1esc:
465 case tok_g2esc:
466 case tok_g3esc:
467 case tok_escseq:
468 lr_ignore_rest (cmfile, 0); /* XXX */
469 continue;
471 case tok_include:
472 lr_error (cmfile, _("\
473 character sets with locking states are not supported"));
474 exit (4);
476 default:
477 /* Cannot happen. */
478 assert (! "Should not happen");
480 break;
482 case 2:
483 /* We have seen `CHARMAP' and now are in the body. Each line
484 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
485 if (nowtok == tok_eol)
486 /* Ignore empty lines. */
487 continue;
489 if (nowtok == tok_end)
491 expected_tok = tok_charmap;
492 expected_str = "CHARMAP";
493 state = 90;
494 continue;
497 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
499 lr_error (cmfile, _("syntax error in %s definition: %s"),
500 "CHARMAP", _("no symbolic name given"));
502 lr_ignore_rest (cmfile, 0);
503 continue;
506 /* If the previous line was not completely correct free the
507 used memory. */
508 if (from_name != NULL)
509 obstack_free (&result->mem_pool, from_name);
511 if (nowtok == tok_bsymbol)
512 from_name = (char *) obstack_copy0 (&result->mem_pool,
513 now->val.str.startmb,
514 now->val.str.lenmb);
515 else
517 obstack_printf (&result->mem_pool, "U%08X",
518 cmfile->token.val.ucs4);
519 obstack_1grow (&result->mem_pool, '\0');
520 from_name = (char *) obstack_finish (&result->mem_pool);
522 to_name = NULL;
524 state = 3;
525 continue;
527 case 3:
528 /* We have two possibilities: We can see an ellipsis or an
529 encoding value. */
530 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
531 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
532 || nowtok == tok_ellipsis2_2)
534 ellipsis = nowtok;
535 if (nowtok == tok_ellipsis4_2)
537 step = 2;
538 nowtok = tok_ellipsis4;
540 else if (nowtok == tok_ellipsis2_2)
542 step = 2;
543 nowtok = tok_ellipsis2;
545 state = 4;
546 continue;
548 /* FALLTHROUGH */
550 case 5:
551 if (nowtok != tok_charcode)
553 lr_error (cmfile, _("syntax error in %s definition: %s"),
554 "CHARMAP", _("invalid encoding given"));
556 lr_ignore_rest (cmfile, 0);
558 state = 2;
559 continue;
562 if (now->val.charcode.nbytes < result->mb_cur_min)
563 lr_error (cmfile, _("too few bytes in character encoding"));
564 else if (now->val.charcode.nbytes > result->mb_cur_max)
565 lr_error (cmfile, _("too many bytes in character encoding"));
566 else
567 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
568 now->val.charcode.bytes, from_name, to_name,
569 ellipsis != tok_ellipsis2, step);
571 /* Ignore trailing comment silently. */
572 lr_ignore_rest (cmfile, 0);
574 from_name = NULL;
575 to_name = NULL;
576 ellipsis = tok_none;
577 step = 1;
579 state = 2;
580 continue;
582 case 4:
583 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
585 lr_error (cmfile, _("syntax error in %s definition: %s"),
586 "CHARMAP",
587 _("no symbolic name given for end of range"));
589 lr_ignore_rest (cmfile, 0);
590 continue;
593 /* Copy the to-name in a safe place. */
594 if (nowtok == tok_bsymbol)
595 to_name = (char *) obstack_copy0 (&result->mem_pool,
596 cmfile->token.val.str.startmb,
597 cmfile->token.val.str.lenmb);
598 else
600 obstack_printf (&result->mem_pool, "U%08X",
601 cmfile->token.val.ucs4);
602 obstack_1grow (&result->mem_pool, '\0');
603 to_name = (char *) obstack_finish (&result->mem_pool);
606 state = 5;
607 continue;
609 case 90:
610 if (nowtok != expected_tok)
611 lr_error (cmfile, _("\
612 `%1$s' definition does not end with `END %1$s'"), expected_str);
614 lr_ignore_rest (cmfile, nowtok == expected_tok);
615 state = 91;
616 continue;
618 case 91:
619 /* Waiting for WIDTH... */
620 if (nowtok == tok_eol)
621 /* Ignore empty lines. */
622 continue;
624 if (nowtok == tok_width_default)
626 state = 92;
627 continue;
630 if (nowtok == tok_width)
632 lr_ignore_rest (cmfile, 1);
633 state = 93;
634 continue;
637 if (nowtok == tok_width_variable)
639 lr_ignore_rest (cmfile, 1);
640 state = 98;
641 continue;
644 lr_error (cmfile, _("\
645 only WIDTH definitions are allowed to follow the CHARMAP definition"));
647 lr_ignore_rest (cmfile, 0);
648 continue;
650 case 92:
651 if (nowtok != tok_number)
652 lr_error (cmfile, _("value for %s must be an integer"),
653 "WIDTH_DEFAULT");
654 else
655 result->width_default = now->val.num;
657 lr_ignore_rest (cmfile, nowtok == tok_number);
659 state = 91;
660 continue;
662 case 93:
663 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
664 "%s...%s %d\n". */
665 if (nowtok == tok_eol)
666 /* ignore empty lines. */
667 continue;
669 if (nowtok == tok_end)
671 expected_tok = tok_width;
672 expected_str = "WIDTH";
673 state = 90;
674 continue;
677 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
679 lr_error (cmfile, _("syntax error in %s definition: %s"),
680 "WIDTH", _("no symbolic name given"));
682 lr_ignore_rest (cmfile, 0);
683 continue;
686 if (from_name != NULL)
687 obstack_free (&result->mem_pool, from_name);
689 if (nowtok == tok_bsymbol)
690 from_name = (char *) obstack_copy0 (&result->mem_pool,
691 now->val.str.startmb,
692 now->val.str.lenmb);
693 else
695 obstack_printf (&result->mem_pool, "U%08X",
696 cmfile->token.val.ucs4);
697 obstack_1grow (&result->mem_pool, '\0');
698 from_name = (char *) obstack_finish (&result->mem_pool);
701 to_name = NULL;
703 state = 94;
704 continue;
706 case 94:
707 if (nowtok == tok_ellipsis3)
709 state = 95;
710 continue;
713 case 96:
714 if (nowtok != tok_number)
715 lr_error (cmfile, _("value for %s must be an integer"),
716 "WIDTH");
717 else
719 /* Store width for chars. */
720 new_width (cmfile, result, from_name, to_name, now->val.num);
722 from_name = NULL;
723 to_name = NULL;
726 lr_ignore_rest (cmfile, nowtok == tok_number);
728 state = 93;
729 continue;
731 case 95:
732 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
734 lr_error (cmfile, _("syntax error in %s definition: %s"),
735 "WIDTH", _("no symbolic name given for end of range"));
737 lr_ignore_rest (cmfile, 0);
739 state = 93;
740 continue;
743 if (nowtok == tok_bsymbol)
744 to_name = (char *) obstack_copy0 (&result->mem_pool,
745 now->val.str.startmb,
746 now->val.str.lenmb);
747 else
749 obstack_printf (&result->mem_pool, "U%08X",
750 cmfile->token.val.ucs4);
751 obstack_1grow (&result->mem_pool, '\0');
752 to_name = (char *) obstack_finish (&result->mem_pool);
755 state = 96;
756 continue;
758 case 98:
759 /* We now expect `END WIDTH_VARIABLE' or lines of the format
760 "%s\n" or "%s...%s\n". */
761 if (nowtok == tok_eol)
762 /* ignore empty lines. */
763 continue;
765 if (nowtok == tok_end)
767 expected_tok = tok_width_variable;
768 expected_str = "WIDTH_VARIABLE";
769 state = 90;
770 continue;
773 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
775 lr_error (cmfile, _("syntax error in %s definition: %s"),
776 "WIDTH_VARIABLE", _("no symbolic name given"));
778 lr_ignore_rest (cmfile, 0);
780 continue;
783 if (from_name != NULL)
784 obstack_free (&result->mem_pool, from_name);
786 if (nowtok == tok_bsymbol)
787 from_name = (char *) obstack_copy0 (&result->mem_pool,
788 now->val.str.startmb,
789 now->val.str.lenmb);
790 else
792 obstack_printf (&result->mem_pool, "U%08X",
793 cmfile->token.val.ucs4);
794 obstack_1grow (&result->mem_pool, '\0');
795 from_name = (char *) obstack_finish (&result->mem_pool);
797 to_name = NULL;
799 state = 99;
800 continue;
802 case 99:
803 if (nowtok == tok_ellipsis3)
804 state = 100;
806 /* Store info. */
807 from_name = NULL;
809 /* Warn */
810 state = 98;
811 continue;
813 case 100:
814 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
816 lr_error (cmfile, _("syntax error in %s definition: %s"),
817 "WIDTH_VARIABLE",
818 _("no symbolic name given for end of range"));
819 lr_ignore_rest (cmfile, 0);
820 continue;
823 if (nowtok == tok_bsymbol)
824 to_name = (char *) obstack_copy0 (&result->mem_pool,
825 now->val.str.startmb,
826 now->val.str.lenmb);
827 else
829 obstack_printf (&result->mem_pool, "U%08X",
830 cmfile->token.val.ucs4);
831 obstack_1grow (&result->mem_pool, '\0');
832 to_name = (char *) obstack_finish (&result->mem_pool);
835 /* XXX Enter value into table. */
837 lr_ignore_rest (cmfile, 1);
839 state = 98;
840 continue;
842 default:
843 WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
844 __FILE__));
845 /* NOTREACHED */
847 break;
850 if (state != 91 && !be_quiet)
851 WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
852 cmfile->fname));
854 lr_close (cmfile);
856 return result;
860 static void
861 new_width (struct linereader *cmfile, struct charmap_t *result,
862 const char *from, const char *to, unsigned long int width)
864 struct charseq *from_val;
865 struct charseq *to_val;
867 from_val = charmap_find_value (result, from, strlen (from));
868 if (from_val == NULL)
870 lr_error (cmfile, _("unknown character `%s'"), from);
871 return;
874 if (to == NULL)
875 to_val = from_val;
876 else
878 to_val = charmap_find_value (result, to, strlen (to));
879 if (to_val == NULL)
881 lr_error (cmfile, _("unknown character `%s'"), to);
882 return;
885 /* Make sure the number of bytes for the end points of the range
886 is correct. */
887 if (from_val->nbytes != to_val->nbytes)
889 lr_error (cmfile, _("\
890 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
891 from_val->nbytes, to_val->nbytes);
892 return;
896 if (result->nwidth_rules >= result->nwidth_rules_max)
898 size_t new_size = result->nwidth_rules + 32;
899 struct width_rule *new_rules =
900 (struct width_rule *) obstack_alloc (&result->mem_pool,
901 (new_size
902 * sizeof (struct width_rule)));
904 memcpy (new_rules, result->width_rules,
905 result->nwidth_rules_max * sizeof (struct width_rule));
907 result->width_rules = new_rules;
908 result->nwidth_rules_max = new_size;
911 result->width_rules[result->nwidth_rules].from = from_val;
912 result->width_rules[result->nwidth_rules].to = to_val;
913 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
914 ++result->nwidth_rules;
918 struct charseq *
919 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
921 void *result;
923 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
924 < 0 ? NULL : (struct charseq *) result);
928 static void
929 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
930 int nbytes, char *bytes, const char *from, const char *to,
931 int decimal_ellipsis, int step)
933 hash_table *ht = &cm->char_table;
934 hash_table *bt = &cm->byte_table;
935 struct obstack *ob = &cm->mem_pool;
936 char *from_end;
937 char *to_end;
938 const char *cp;
939 int prefix_len, len1, len2;
940 unsigned int from_nr, to_nr, cnt;
941 struct charseq *newp;
943 len1 = strlen (from);
945 if (to == NULL)
947 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
948 newp->nbytes = nbytes;
949 memcpy (newp->bytes, bytes, nbytes);
950 newp->name = from;
952 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
953 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
955 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
956 xxxx and xxxxxxxx are hexadecimal numbers. In this case
957 we use the value of xxxx or xxxxxxxx as the UCS4 value of
958 this character and we don't have to consult the repertoire
959 map.
961 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
962 and xxxxxxxx also give the code point in UCS4 but this must
963 be in the private, i.e., unassigned, area. This should be
964 used for characters which do not (yet) have an equivalent
965 in ISO 10646 and Unicode. */
966 char *endp;
968 errno = 0;
969 newp->ucs4 = strtoul (from + 1, &endp, 16);
970 if (endp - from != len1
971 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
972 || newp->ucs4 >= 0x80000000)
973 /* This wasn't successful. Signal this name cannot be a
974 correct UCS value. */
975 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
978 insert_entry (ht, from, len1, newp);
979 insert_entry (bt, newp->bytes, nbytes, newp);
980 /* Please note that it isn't a bug if a symbol is defined more
981 than once. All later definitions are simply discarded. */
982 return;
985 /* We have a range: the names must have names with equal prefixes
986 and an equal number of digits, where the second number is greater
987 or equal than the first. */
988 len2 = strlen (to);
990 if (len1 != len2)
992 illegal_range:
993 lr_error (lr, _("invalid names for character range"));
994 return;
997 cp = &from[len1 - 1];
998 if (decimal_ellipsis)
999 while (isdigit (*cp) && cp >= from)
1000 --cp;
1001 else
1002 while (isxdigit (*cp) && cp >= from)
1004 if (!isdigit (*cp) && !isupper (*cp))
1005 lr_error (lr, _("\
1006 hexadecimal range format should use only capital characters"));
1007 --cp;
1010 prefix_len = (cp - from) + 1;
1012 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1013 goto illegal_range;
1015 errno = 0;
1016 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
1017 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
1018 || ((to_nr = strtoul (&to[prefix_len], &to_end,
1019 decimal_ellipsis ? 10 : 16)) == UINT_MAX
1020 && errno == ERANGE)
1021 || *to_end != '\0')
1023 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
1024 return;
1027 if (from_nr > to_nr)
1029 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1030 return;
1033 for (cnt = from_nr; cnt <= to_nr; cnt += step)
1035 char *name_end;
1036 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1037 prefix_len, from, len1 - prefix_len, cnt);
1038 obstack_1grow (ob, '\0');
1039 name_end = obstack_finish (ob);
1041 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1042 newp->nbytes = nbytes;
1043 memcpy (newp->bytes, bytes, nbytes);
1044 newp->name = name_end;
1046 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1047 if ((name_end[0] == 'U' || name_end[0] == 'P')
1048 && (len1 == 5 || len1 == 9))
1050 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1051 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1052 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1053 this character and we don't have to consult the repertoire
1054 map.
1056 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1057 and xxxxxxxx also give the code point in UCS4 but this must
1058 be in the private, i.e., unassigned, area. This should be
1059 used for characters which do not (yet) have an equivalent
1060 in ISO 10646 and Unicode. */
1061 char *endp;
1063 errno = 0;
1064 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
1065 if (endp - name_end != len1
1066 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
1067 || newp->ucs4 >= 0x80000000)
1068 /* This wasn't successful. Signal this name cannot be a
1069 correct UCS value. */
1070 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1073 insert_entry (ht, name_end, len1, newp);
1074 insert_entry (bt, newp->bytes, nbytes, newp);
1075 /* Please note we don't examine the return value since it is no error
1076 if we have two definitions for a symbol. */
1078 /* Increment the value in the byte sequence. */
1079 if (++bytes[nbytes - 1] == '\0')
1081 int b = nbytes - 2;
1084 if (b < 0)
1086 lr_error (lr,
1087 _("resulting bytes for range not representable."));
1088 return;
1090 while (++bytes[b--] == 0);
1096 struct charseq *
1097 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1098 size_t nbytes)
1100 void *result;
1102 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1103 < 0 ? NULL : (struct charseq *) result);