Update.
[glibc.git] / locale / programs / ld-ctype.c
bloba08095bbf588e24d63417797860f6f3b50ff0143
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <alloca.h>
25 #include <byteswap.h>
26 #include <endian.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <obstack.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <wctype.h>
34 #include <sys/uio.h>
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
42 #include "localedef.h"
44 #include <assert.h>
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  The constants are unsigned: `1 << 31' would shift into the
   sign bit of an int, which is undefined behavior.  */
# define _ISwspecial1   (1U << 29)
# define _ISwspecial2   (1U << 30)
# define _ISwspecial3   (1U << 31)
#endif


/* The bit used for representing a special class.  CLASS is one of the
   class tokens, counted relative to tok_upper.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Yield an lvalue for the entry of character VALUE in the table
   CTYPE->COLLECTION IDX (IDX is either empty or an `[n]' subscript).
   The lookup goes through find_idx, which may grow the table.  The
   expansion is parenthesized so it can be combined with any operator.  */
#define ELEM(ctype, collection, idx, value)                                   \
  (*find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,    \
              &ctype->collection##_act idx, value))


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Types to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */

/* One replacement string; the alternatives for a single source string
   are chained through NEXT.  */
struct translit_to_t
{
  uint32_t *str;

  struct translit_to_t *next;
};

/* One transliteration rule: the source string FROM, the chain of
   replacement strings TO, and the link to the next rule.  */
struct translit_t
{
  uint32_t *from;

  struct translit_to_t *to;

  struct translit_t *next;
};
94 /* The real definition of the struct for the LC_CTYPE locale. */
95 struct locale_ctype_t
97 uint32_t *charnames;
98 size_t charnames_max;
99 size_t charnames_act;
101 struct repertoire_t *repertoire;
103 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
104 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
105 size_t nr_charclass;
106 const char *classnames[MAX_NR_CHARCLASS];
107 uint32_t last_class_char;
108 uint32_t class256_collection[256];
109 uint32_t *class_collection;
110 size_t class_collection_max;
111 size_t class_collection_act;
112 uint32_t class_done;
114 struct charseq **mbdigits;
115 size_t mbdigits_act;
116 size_t mbdigits_max;
117 uint32_t *wcdigits;
118 size_t wcdigits_act;
119 size_t wcdigits_max;
121 struct charseq *mboutdigits[10];
122 uint32_t wcoutdigits[10];
123 size_t outdigits_act;
125 /* If the following number ever turns out to be too small simply
126 increase it. But I doubt it will. --drepper@gnu */
127 #define MAX_NR_CHARMAP 16
128 const char *mapnames[MAX_NR_CHARMAP];
129 uint32_t *map_collection[MAX_NR_CHARMAP];
130 uint32_t map256_collection[2][256];
131 size_t map_collection_max[MAX_NR_CHARMAP];
132 size_t map_collection_act[MAX_NR_CHARMAP];
133 size_t map_collection_nr;
134 size_t last_map_idx;
135 int tomap_done[MAX_NR_CHARMAP];
137 /* Transliteration information. */
138 const char *translit_copy_locale;
139 const char *translit_copy_repertoire;
140 struct translit_t *translit;
142 /* The arrays for the binary representation. */
143 uint32_t plane_size;
144 uint32_t plane_cnt;
145 char_class_t *ctype_b;
146 char_class32_t *ctype32_b;
147 uint32_t *names;
148 uint32_t **map;
149 uint32_t *class_name_ptr;
150 uint32_t *map_name_ptr;
151 unsigned char *width;
152 uint32_t mb_cur_max;
153 const char *codeset_name;
154 uint32_t translit_hash_size;
155 uint32_t translit_hash_layers;
156 uint32_t *translit_from_idx;
157 uint32_t *translit_from_tbl;
158 uint32_t *translit_to_idx;
159 uint32_t *translit_to_tbl;
160 size_t translit_idx_size;
161 size_t translit_from_tbl_size;
162 size_t translit_to_tbl_size;
164 struct obstack mem_pool;
/* Memory management hooks used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free


/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           struct charmap_t *charmap, int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, struct charmap_t *charmap);
/* IDX is declared as uint32_t so that the prototype agrees with the
   definition of find_idx.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Spelled-out digit names, indexed by digit value.  */
static const char *longnames[] =
{
  [0] = "zero",
  [1] = "one",
  [2] = "two",
  [3] = "three",
  [4] = "four",
  [5] = "five",
  [6] = "six",
  [7] = "seven",
  [8] = "eight",
  [9] = "nine"
};
/* The ASCII digits, indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
198 static void
199 ctype_startup (struct linereader *lr, struct localedef_t *locale,
200 struct charmap_t *charmap, int ignore_content)
202 unsigned int cnt;
203 struct locale_ctype_t *ctype;
205 if (!ignore_content)
207 /* Allocate the needed room. */
208 locale->categories[LC_CTYPE].ctype = ctype =
209 (struct locale_ctype_t *) xcalloc (1, sizeof (struct locale_ctype_t));
211 /* We have seen no names yet. */
212 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
213 ctype->charnames =
214 (unsigned int *) xmalloc (ctype->charnames_max
215 * sizeof (unsigned int));
216 for (cnt = 0; cnt < 256; ++cnt)
217 ctype->charnames[cnt] = cnt;
218 ctype->charnames_act = 256;
220 /* Fill character class information. */
221 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
222 /* The order of the following instructions determines the bit
223 positions! */
224 ctype_class_new (lr, ctype, "upper");
225 ctype_class_new (lr, ctype, "lower");
226 ctype_class_new (lr, ctype, "alpha");
227 ctype_class_new (lr, ctype, "digit");
228 ctype_class_new (lr, ctype, "xdigit");
229 ctype_class_new (lr, ctype, "space");
230 ctype_class_new (lr, ctype, "print");
231 ctype_class_new (lr, ctype, "graph");
232 ctype_class_new (lr, ctype, "blank");
233 ctype_class_new (lr, ctype, "cntrl");
234 ctype_class_new (lr, ctype, "punct");
235 ctype_class_new (lr, ctype, "alnum");
236 #ifdef PREDEFINED_CLASSES
237 /* The following are extensions from ISO 14652. */
238 ctype_class_new (lr, ctype, "left_to_right");
239 ctype_class_new (lr, ctype, "right_to_left");
240 ctype_class_new (lr, ctype, "num_terminator");
241 ctype_class_new (lr, ctype, "num_separator");
242 ctype_class_new (lr, ctype, "segment_separator");
243 ctype_class_new (lr, ctype, "block_separator");
244 ctype_class_new (lr, ctype, "direction_control");
245 ctype_class_new (lr, ctype, "sym_swap_layout");
246 ctype_class_new (lr, ctype, "char_shape_selector");
247 ctype_class_new (lr, ctype, "num_shape_selector");
248 ctype_class_new (lr, ctype, "non_spacing");
249 ctype_class_new (lr, ctype, "non_spacing_level3");
250 ctype_class_new (lr, ctype, "normal_connect");
251 ctype_class_new (lr, ctype, "r_connect");
252 ctype_class_new (lr, ctype, "no_connect");
253 ctype_class_new (lr, ctype, "no_connect-space");
254 ctype_class_new (lr, ctype, "vowel_connect");
255 #endif
257 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
258 ctype->class_collection
259 = (uint32_t *) xcalloc (sizeof (unsigned long int),
260 ctype->class_collection_max);
261 ctype->class_collection_act = 256;
263 /* Fill character map information. */
264 ctype->map_collection_nr = 0;
265 ctype->last_map_idx = MAX_NR_CHARMAP;
266 ctype_map_new (lr, ctype, "toupper", charmap);
267 ctype_map_new (lr, ctype, "tolower", charmap);
268 #ifdef PREDEFINED_CLASSES
269 ctype_map_new (lr, ctype, "tosymmetric", charmap);
270 #endif
272 /* Fill first 256 entries in `toXXX' arrays. */
273 for (cnt = 0; cnt < 256; ++cnt)
275 ctype->map_collection[0][cnt] = cnt;
276 ctype->map_collection[1][cnt] = cnt;
277 #ifdef PREDEFINED_CLASSES
278 ctype->map_collection[2][cnt] = cnt;
279 #endif
280 ctype->map256_collection[0][cnt] = cnt;
281 ctype->map256_collection[1][cnt] = cnt;
284 obstack_init (&ctype->mem_pool);
289 void
290 ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
292 /* See POSIX.2, table 2-6 for the meaning of the following table. */
293 #define NCLASS 12
294 static const struct
296 const char *name;
297 const char allow[NCLASS];
299 valid_table[NCLASS] =
301 /* The order is important. See token.h for more information.
302 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
303 { "upper", "--MX-XDDXXX-" },
304 { "lower", "--MX-XDDXXX-" },
305 { "alpha", "---X-XDDXXX-" },
306 { "digit", "XXX--XDDXXX-" },
307 { "xdigit", "-----XDDXXX-" },
308 { "space", "XXXXX------X" },
309 { "print", "---------X--" },
310 { "graph", "---------X--" },
311 { "blank", "XXXXXM-----X" },
312 { "cntrl", "XXXXX-XX--XX" },
313 { "punct", "XXXXX-DD-X-X" },
314 { "alnum", "-----XDDXXX-" }
316 size_t cnt;
317 int cls1, cls2;
318 uint32_t space_value;
319 struct charseq *space_seq;
320 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
321 int warned;
323 /* Now resolve copying and also handle completely missing definitions. */
324 if (ctype == NULL)
326 /* First see whether we were supposed to copy. If yes, find the
327 actual definition. */
328 if (locale->copy_name[LC_CTYPE] != NULL)
330 /* Find the copying locale. This has to happen transitively since
331 the locale we are copying from might also copying another one. */
332 struct localedef_t *from = locale;
335 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
336 from->repertoire_name, charmap);
337 while (from->categories[LC_CTYPE].ctype == NULL
338 && from->copy_name[LC_CTYPE] != NULL);
340 ctype = locale->categories[LC_CTYPE].ctype
341 = from->categories[LC_CTYPE].ctype;
344 /* If there is still no definition issue an warning and create an
345 empty one. */
346 if (ctype == NULL)
348 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
349 ctype_startup (NULL, locale, charmap, 0);
350 ctype = locale->categories[LC_CTYPE].ctype;
354 /* Set default value for classes not specified. */
355 set_class_defaults (ctype, charmap, ctype->repertoire);
357 /* Check according to table. */
358 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
360 uint32_t tmp = ctype->class_collection[cnt];
362 if (tmp != 0)
364 for (cls1 = 0; cls1 < NCLASS; ++cls1)
365 if ((tmp & _ISwbit (cls1)) != 0)
366 for (cls2 = 0; cls2 < NCLASS; ++cls2)
367 if (valid_table[cls1].allow[cls2] != '-')
369 int eq = (tmp & _ISwbit (cls2)) != 0;
370 switch (valid_table[cls1].allow[cls2])
372 case 'M':
373 if (!eq)
375 uint32_t value = ctype->charnames[cnt];
377 if (!be_quiet)
378 error (0, 0, _("\
379 character L'\\u%0*x' in class `%s' must be in class `%s'"),
380 value > 0xffff ? 8 : 4, value,
381 valid_table[cls1].name,
382 valid_table[cls2].name);
384 break;
386 case 'X':
387 if (eq)
389 uint32_t value = ctype->charnames[cnt];
391 if (!be_quiet)
392 error (0, 0, _("\
393 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
394 value > 0xffff ? 8 : 4, value,
395 valid_table[cls1].name,
396 valid_table[cls2].name);
398 break;
400 case 'D':
401 ctype->class_collection[cnt] |= _ISwbit (cls2);
402 break;
404 default:
405 error (5, 0, _("internal error in %s, line %u"),
406 __FUNCTION__, __LINE__);
412 for (cnt = 0; cnt < 256; ++cnt)
414 uint32_t tmp = ctype->class256_collection[cnt];
416 if (tmp != 0)
418 for (cls1 = 0; cls1 < NCLASS; ++cls1)
419 if ((tmp & _ISbit (cls1)) != 0)
420 for (cls2 = 0; cls2 < NCLASS; ++cls2)
421 if (valid_table[cls1].allow[cls2] != '-')
423 int eq = (tmp & _ISbit (cls2)) != 0;
424 switch (valid_table[cls1].allow[cls2])
426 case 'M':
427 if (!eq)
429 char buf[17];
431 sprintf (buf, "\\%o", cnt);
433 if (!be_quiet)
434 error (0, 0, _("\
435 character '%s' in class `%s' must be in class `%s'"),
436 buf, valid_table[cls1].name,
437 valid_table[cls2].name);
439 break;
441 case 'X':
442 if (eq)
444 char buf[17];
446 sprintf (buf, "\\%o", cnt);
448 if (!be_quiet)
449 error (0, 0, _("\
450 character '%s' in class `%s' must not be in class `%s'"),
451 buf, valid_table[cls1].name,
452 valid_table[cls2].name);
454 break;
456 case 'D':
457 ctype->class256_collection[cnt] |= _ISbit (cls2);
458 break;
460 default:
461 error (5, 0, _("internal error in %s, line %u"),
462 __FUNCTION__, __LINE__);
468 /* ... and now test <SP> as a special case. */
469 space_value = repertoire_find_value (ctype->repertoire, "SP", 2);
470 if (space_value == ILLEGAL_CHAR_VALUE)
472 if (!be_quiet)
473 error (0, 0, _("character <SP> not defined in character map"));
475 else if (((cnt = BITPOS (tok_space),
476 (ELEM (ctype, class_collection, , space_value)
477 & BITw (tok_space)) == 0)
478 || (cnt = BITPOS (tok_blank),
479 (ELEM (ctype, class_collection, , space_value)
480 & BITw (tok_blank)) == 0)))
482 if (!be_quiet)
483 error (0, 0, _("<SP> character not in class `%s'"),
484 valid_table[cnt].name);
486 else if (((cnt = BITPOS (tok_punct),
487 (ELEM (ctype, class_collection, , space_value)
488 & BITw (tok_punct)) != 0)
489 || (cnt = BITPOS (tok_graph),
490 (ELEM (ctype, class_collection, , space_value)
491 & BITw (tok_graph))
492 != 0)))
494 if (!be_quiet)
495 error (0, 0, _("<SP> character must not be in class `%s'"),
496 valid_table[cnt].name);
498 else
499 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
501 space_seq = charmap_find_value (charmap, "SP", 2);
502 if (space_seq == NULL || space_seq->nbytes != 1)
504 if (!be_quiet)
505 error (0, 0, _("character <SP> not defined in character map"));
507 else if (((cnt = BITPOS (tok_space),
508 (ctype->class256_collection[space_seq->bytes[0]]
509 & BIT (tok_space)) == 0)
510 || (cnt = BITPOS (tok_blank),
511 (ctype->class256_collection[space_seq->bytes[0]]
512 & BIT (tok_blank)) == 0)))
514 if (!be_quiet)
515 error (0, 0, _("<SP> character not in class `%s'"),
516 valid_table[cnt].name);
518 else if (((cnt = BITPOS (tok_punct),
519 (ctype->class256_collection[space_seq->bytes[0]]
520 & BIT (tok_punct)) != 0)
521 || (cnt = BITPOS (tok_graph),
522 (ctype->class256_collection[space_seq->bytes[0]]
523 & BIT (tok_graph)) != 0)))
525 if (!be_quiet)
526 error (0, 0, _("<SP> character must not be in class `%s'"),
527 valid_table[cnt].name);
529 else
530 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
532 /* Now that the tests are done make sure the name array contains all
533 characters which are handled in the WIDTH section of the
534 character set definition file. */
535 if (charmap->width_rules != NULL)
536 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
538 unsigned char bytes[charmap->mb_cur_max];
539 int nbytes = charmap->width_rules[cnt].from->nbytes;
541 /* We have the range of character for which the width is
542 specified described using byte sequences of the multibyte
543 charset. We have to convert this to UCS4 now. And we
544 cannot simply convert the beginning and the end of the
545 sequence, we have to iterate over the byte sequence and
546 convert it for every single character. */
547 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
549 while (nbytes < charmap->width_rules[cnt].to->nbytes
550 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
551 nbytes) <= 0)
553 /* Find the UCS value for `bytes'. */
554 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
555 nbytes);
556 int inner;
558 if (wch != ILLEGAL_CHAR_VALUE)
559 /* We are only interested in the side-effects of the
560 `find_idx' call. It will add appropriate entries in
561 the name array if this is necessary. */
562 (void) find_idx (ctype, NULL, NULL, NULL, wch);
564 /* "Increment" the bytes sequence. */
565 inner = nbytes - 1;
566 while (inner >= 0 && bytes[inner] == 0xff)
567 --inner;
569 if (inner < 0)
571 /* We have to extend the byte sequence. */
572 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
573 break;
575 bytes[0] = 1;
576 memset (&bytes[1], 0, nbytes);
577 ++nbytes;
579 else
581 ++bytes[inner];
582 while (++inner < nbytes)
583 bytes[inner] = 0;
588 /* There must be a multiple of 10 digits. */
589 if (ctype->mbdigits_act % 10 != 0)
591 assert (ctype->mbdigits_act == ctype->wcdigits_act);
592 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
593 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
594 error (0, 0, _("`digit' category has not entries in groups of ten"));
597 /* Check the input digits. There must be a multiple of ten available.
598 In each group it could be that one or the other character is missing.
599 In this case the whole group must be removed. */
600 cnt = 0;
601 while (cnt < ctype->mbdigits_act)
603 size_t inner;
604 for (inner = 0; inner < 10; ++inner)
605 if (ctype->mbdigits[cnt + inner] == NULL)
606 break;
608 if (inner == 10)
609 cnt += 10;
610 else
612 /* Remove the group. */
613 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
614 ((ctype->wcdigits_act - cnt - 10)
615 * sizeof (ctype->mbdigits[0])));
616 ctype->mbdigits_act -= 10;
620 /* If no input digits are given use the default. */
621 if (ctype->mbdigits_act == 0)
623 if (ctype->mbdigits_max == 0)
625 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
626 10 * sizeof (struct charseq *));
627 ctype->mbdigits_max = 10;
630 for (cnt = 0; cnt < 10; ++cnt)
632 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
633 digits + cnt, 1);
634 if (ctype->mbdigits[cnt] == NULL)
636 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
637 longnames[cnt],
638 strlen (longnames[cnt]));
639 if (ctype->mbdigits[cnt] == NULL)
641 /* Hum, this ain't good. */
642 error (0, 0, _("\
643 no input digits defined and none of the standard names in the charmap"));
645 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
646 sizeof (struct charseq) + 1);
648 /* This is better than nothing. */
649 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
650 ctype->mbdigits[cnt]->nbytes = 1;
655 ctype->mbdigits_act = 10;
658 /* Check the wide character input digits. There must be a multiple
659 of ten available. In each group it could be that one or the other
660 character is missing. In this case the whole group must be
661 removed. */
662 cnt = 0;
663 while (cnt < ctype->wcdigits_act)
665 size_t inner;
666 for (inner = 0; inner < 10; ++inner)
667 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
668 break;
670 if (inner == 10)
671 cnt += 10;
672 else
674 /* Remove the group. */
675 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
676 ((ctype->wcdigits_act - cnt - 10)
677 * sizeof (ctype->wcdigits[0])));
678 ctype->wcdigits_act -= 10;
682 /* If no input digits are given use the default. */
683 if (ctype->wcdigits_act == 0)
685 if (ctype->wcdigits_max == 0)
687 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
688 10 * sizeof (uint32_t));
689 ctype->wcdigits_max = 10;
692 for (cnt = 0; cnt < 10; ++cnt)
693 ctype->wcdigits[cnt] = L'0' + cnt;
695 ctype->mbdigits_act = 10;
698 /* Check the outdigits. */
699 warned = 0;
700 for (cnt = 0; cnt < 10; ++cnt)
701 if (ctype->mboutdigits[cnt] == NULL)
703 static struct charseq replace[2];
705 if (!warned)
707 error (0, 0, _("\
708 not all characters used in `outdigit' are available in the charmap"));
709 warned = 1;
712 replace[0].nbytes = 1;
713 replace[0].bytes[0] = '?';
714 replace[0].bytes[1] = '\0';
715 ctype->mboutdigits[cnt] = &replace[0];
718 warned = 0;
719 for (cnt = 0; cnt < 10; ++cnt)
720 if (ctype->wcoutdigits[cnt] == 0)
722 if (!warned)
724 error (0, 0, _("\
725 not all characters used in `outdigit' are available in the repertoire"));
726 warned = 1;
729 ctype->wcoutdigits[cnt] = L'?';
734 void
735 ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
736 const char *output_path)
738 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
739 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
740 + (ctype->map_collection_nr - 2));
741 struct iovec iov[2 + nelems + ctype->nr_charclass
742 + ctype->map_collection_nr];
743 struct locale_file data;
744 uint32_t idx[nelems + 1];
745 size_t elem, cnt, offset, total;
746 char *cp;
748 /* Now prepare the output: Find the sizes of the table we can use. */
749 allocate_arrays (ctype, charmap, ctype->repertoire);
751 data.magic = LIMAGIC (LC_CTYPE);
752 data.n = nelems;
753 iov[0].iov_base = (void *) &data;
754 iov[0].iov_len = sizeof (data);
756 iov[1].iov_base = (void *) idx;
757 iov[1].iov_len = sizeof (idx);
759 idx[0] = iov[0].iov_len + iov[1].iov_len;
760 offset = 0;
762 for (elem = 0; elem < nelems; ++elem)
764 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
765 switch (elem)
767 #define CTYPE_DATA(name, base, len) \
768 case _NL_ITEM_INDEX (name): \
769 iov[2 + elem + offset].iov_base = (base); \
770 iov[2 + elem + offset].iov_len = (len); \
771 if (elem + 1 < nelems) \
772 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
773 break
775 CTYPE_DATA (_NL_CTYPE_CLASS,
776 ctype->ctype_b,
777 (256 + 128) * sizeof (char_class_t));
779 CTYPE_DATA (_NL_CTYPE_TOUPPER,
780 ctype->map[0],
781 (ctype->plane_size * ctype->plane_cnt + 128)
782 * sizeof (uint32_t));
783 CTYPE_DATA (_NL_CTYPE_TOLOWER,
784 ctype->map[1],
785 (ctype->plane_size * ctype->plane_cnt + 128)
786 * sizeof (uint32_t));
788 CTYPE_DATA (_NL_CTYPE_CLASS32,
789 ctype->ctype32_b,
790 (ctype->plane_size * ctype->plane_cnt
791 * sizeof (char_class32_t)));
793 CTYPE_DATA (_NL_CTYPE_NAMES,
794 ctype->names, (ctype->plane_size * ctype->plane_cnt
795 * sizeof (uint32_t)));
797 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE,
798 &ctype->translit_hash_size, sizeof (uint32_t));
799 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS,
800 &ctype->translit_hash_layers, sizeof (uint32_t));
802 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
803 ctype->translit_from_idx,
804 ctype->translit_idx_size);
806 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
807 ctype->translit_from_tbl,
808 ctype->translit_from_tbl_size);
810 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
811 ctype->translit_to_idx,
812 ctype->translit_idx_size);
814 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
815 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
817 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
818 &ctype->plane_size, sizeof (uint32_t));
819 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
820 &ctype->plane_cnt, sizeof (uint32_t));
822 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
823 /* The class name array. */
824 total = 0;
825 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
827 iov[2 + elem + offset].iov_base
828 = (void *) ctype->classnames[cnt];
829 iov[2 + elem + offset].iov_len
830 = strlen (ctype->classnames[cnt]) + 1;
831 total += iov[2 + elem + offset].iov_len;
833 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
834 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
835 total += 1 + (4 - ((total + 1) % 4));
837 idx[elem + 1] = idx[elem] + total;
838 break;
840 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
841 /* The class name array. */
842 total = 0;
843 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
845 iov[2 + elem + offset].iov_base
846 = (void *) ctype->mapnames[cnt];
847 iov[2 + elem + offset].iov_len
848 = strlen (ctype->mapnames[cnt]) + 1;
849 total += iov[2 + elem + offset].iov_len;
851 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
852 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
853 total += 1 + (4 - ((total + 1) % 4));
855 idx[elem + 1] = idx[elem] + total;
856 break;
858 CTYPE_DATA (_NL_CTYPE_WIDTH,
859 ctype->width, ctype->plane_size * ctype->plane_cnt);
861 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
862 &ctype->mb_cur_max, sizeof (uint32_t));
864 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
865 total = strlen (ctype->codeset_name) + 1;
866 if (total % 4 == 0)
867 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
868 else
870 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
871 memset (mempcpy (iov[2 + elem + offset].iov_base,
872 ctype->codeset_name, total),
873 '\0', 4 - (total & 3));
874 total = (total + 3) & ~3;
876 iov[2 + elem + offset].iov_len = total;
877 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
878 break;
880 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
881 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
882 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
883 *(uint32_t *) iov[2 + elem + offset].iov_base =
884 ctype->mbdigits_act / 10;
885 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
886 break;
888 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
889 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
890 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
891 *(uint32_t *) iov[2 + elem + offset].iov_base =
892 ctype->wcdigits_act / 10;
893 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
894 break;
896 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
897 /* Compute the length of all possible characters. For INDIGITS
898 there might be more than one. We simply concatenate all of
899 them with a NUL byte following. The NUL byte wouldn't be
900 necessary but it makes it easier for the user. */
901 total = 0;
902 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
903 cnt < ctype->mbdigits_act; cnt += 10)
904 total += ctype->mbdigits[cnt]->nbytes + 1;
905 iov[2 + elem + offset].iov_base = (char *) alloca (total);
906 iov[2 + elem + offset].iov_len = total;
908 cp = iov[2 + elem + offset].iov_base;
909 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
910 cnt < ctype->mbdigits_act; cnt += 10)
912 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
913 ctype->mbdigits[cnt]->nbytes);
914 *cp++ = '\0';
916 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
917 break;
919 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
920 /* Compute the length of all possible characters. For INDIGITS
921 there might be more than one. We simply concatenate all of
922 them with a NUL byte following. The NUL byte wouldn't be
923 necessary but it makes it easier for the user. */
924 cnt = elem - _NL_CTYPE_OUTDIGIT0_MB;
925 total = ctype->mboutdigits[cnt]->nbytes + 1;
926 iov[2 + elem + offset].iov_base = (char *) alloca (total);
927 iov[2 + elem + offset].iov_len = total;
929 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
930 ctype->mbdigits[cnt]->bytes,
931 ctype->mbdigits[cnt]->nbytes) = '\0';
932 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
933 break;
935 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
936 total = ctype->wcdigits_act / 10;
938 iov[2 + elem + offset].iov_base =
939 (uint32_t *) alloca (total * sizeof (uint32_t));
940 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
942 for (cnt = elem - _NL_CTYPE_INDIGITS0_WC;
943 cnt < ctype->wcdigits_act; cnt += 10)
944 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
945 = ctype->wcdigits[cnt];
946 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
947 break;
949 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
950 cnt = elem - _NL_CTYPE_OUTDIGIT0_WC;
951 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
952 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
953 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
954 break;
956 default:
957 assert (! "unknown CTYPE element");
959 else
961 /* Handle extra maps. */
962 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) + 2;
964 iov[2 + elem + offset].iov_base = ctype->map[nr];
965 iov[2 + elem + offset].iov_len = ((ctype->plane_size
966 * ctype->plane_cnt + 128)
967 * sizeof (uint32_t));
969 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
973 assert (2 + elem + offset == (nelems + ctype->nr_charclass
974 + ctype->map_collection_nr + 2));
976 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
980 /* Local functions. */
981 static void
982 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
983 const char *name)
985 size_t cnt;
987 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
988 if (strcmp (ctype->classnames[cnt], name) == 0)
989 break;
991 if (cnt < ctype->nr_charclass)
993 lr_error (lr, _("character class `%s' already defined"), name);
994 return;
997 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
998 /* Exit code 2 is prescribed in P1003.2b. */
999 error (2, 0, _("\
1000 implementation limit: no more than %d character classes allowed"),
1001 MAX_NR_CHARCLASS);
1003 ctype->classnames[ctype->nr_charclass++] = name;
1007 static void
1008 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1009 const char *name, struct charmap_t *charmap)
1011 size_t max_chars = 0;
1012 size_t cnt;
1014 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1016 if (strcmp (ctype->mapnames[cnt], name) == 0)
1017 break;
1019 if (max_chars < ctype->map_collection_max[cnt])
1020 max_chars = ctype->map_collection_max[cnt];
1023 if (cnt < ctype->map_collection_nr)
1025 lr_error (lr, _("character map `%s' already defined"), name);
1026 return;
1029 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1030 /* Exit code 2 is prescribed in P1003.2b. */
1031 error (2, 0, _("\
1032 implementation limit: no more than %d character maps allowed"),
1033 MAX_NR_CHARMAP);
1035 ctype->mapnames[cnt] = name;
1037 if (max_chars == 0)
1038 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1039 else
1040 ctype->map_collection_max[cnt] = max_chars;
1042 ctype->map_collection[cnt] = (uint32_t *)
1043 xmalloc (sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1044 memset (ctype->map_collection[cnt], '\0',
1045 sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1046 ctype->map_collection_act[cnt] = 256;
1048 ++ctype->map_collection_nr;
1052 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1053 is possible if we only want to extend the name array. */
1054 static uint32_t *
1055 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1056 size_t *act, uint32_t idx)
1058 size_t cnt;
1060 if (idx < 256)
1061 return table == NULL ? NULL : &(*table)[idx];
1063 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1064 if (ctype->charnames[cnt] == idx)
1065 break;
1067 /* We have to distinguish two cases: the name is found or not. */
1068 if (cnt == ctype->charnames_act)
1070 /* Extend the name array. */
1071 if (ctype->charnames_act == ctype->charnames_max)
1073 ctype->charnames_max *= 2;
1074 ctype->charnames = (unsigned int *)
1075 xrealloc (ctype->charnames,
1076 sizeof (unsigned int) * ctype->charnames_max);
1078 ctype->charnames[ctype->charnames_act++] = idx;
1081 if (table == NULL)
1082 /* We have done everything we are asked to do. */
1083 return NULL;
1085 if (cnt >= *act)
1087 if (cnt >= *max)
1089 size_t old_max = *max;
1091 *max *= 2;
1092 while (*max <= cnt);
1094 *table =
1095 (uint32_t *) xrealloc (*table, *max * sizeof (unsigned long int));
1096 memset (&(*table)[old_max], '\0',
1097 (*max - old_max) * sizeof (uint32_t));
1100 *act = cnt;
1103 return &(*table)[cnt];
1107 static int
1108 get_character (struct token *now, struct charmap_t *charmap,
1109 struct repertoire_t *repertoire,
1110 struct charseq **seqp, uint32_t *wchp)
1112 if (now->tok == tok_bsymbol)
1114 /* This will hopefully be the normal case. */
1115 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1116 now->val.str.lenmb);
1117 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1118 now->val.str.lenmb);
1120 else if (now->tok == tok_ucs4)
1122 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1124 if (*seqp == NULL)
1126 /* Compute the value in the charmap from the UCS value. */
1127 const char *symbol = repertoire_find_symbol (repertoire,
1128 now->val.ucs4);
1130 if (symbol == NULL)
1131 *seqp = NULL;
1132 else
1133 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1135 if (*seqp == NULL)
1137 /* Insert a negative entry. */
1138 static const struct charseq negative
1139 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1140 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1141 *newp = now->val.ucs4;
1143 insert_entry (&repertoire->seq_table, newp, 4,
1144 (void *) &negative);
1146 else
1147 (*seqp)->ucs4 = now->val.ucs4;
1149 else if ((*seqp)->ucs4 != now->val.ucs4)
1150 *seqp = NULL;
1152 *wchp = now->val.ucs4;
1154 else if (now->tok == tok_charcode)
1156 /* We must map from the byte code to UCS4. */
1157 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1158 now->val.str.lenmb);
1160 if (*seqp == NULL)
1161 *wchp = ILLEGAL_CHAR_VALUE;
1162 else
1164 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1165 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1166 strlen ((*seqp)->name));
1167 *wchp = (*seqp)->ucs4;
1170 else
1171 return 1;
1173 return 0;
1177 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1178 static void
1179 charclass_symbolic_ellipsis (struct linereader *ldfile,
1180 struct locale_ctype_t *ctype,
1181 struct charmap_t *charmap,
1182 struct repertoire_t *repertoire,
1183 struct token *now,
1184 const char *last_str,
1185 unsigned long int class256_bit,
1186 unsigned long int class_bit, int base,
1187 int ignore_content, int handle_digits)
1189 const char *nowstr = now->val.str.startmb;
1190 char tmp[now->val.str.lenmb + 1];
1191 const char *cp;
1192 char *endp;
1193 unsigned long int from;
1194 unsigned long int to;
1196 /* We have to compute the ellipsis values using the symbolic names. */
1197 assert (last_str != NULL);
1199 if (strlen (last_str) != now->val.str.lenmb)
1201 invalid_range:
1202 lr_error (ldfile,
1203 _("`%s' and `%.*s' are no valid names for symbolic range"),
1204 last_str, now->val.str.lenmb, nowstr);
1205 return;
1208 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1209 /* Nothing to do, the names are the same. */
1210 return;
1212 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1215 errno = 0;
1216 from = strtoul (cp, &endp, base);
1217 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1218 goto invalid_range;
1220 to = strtoul (nowstr + (cp - last_str), &endp, base);
1221 if ((to == UINT_MAX && errno == ERANGE)
1222 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1223 goto invalid_range;
1225 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1226 if (!ignore_content)
1228 now->val.str.startmb = tmp;
1229 while (++from <= to)
1231 struct charseq *seq;
1232 uint32_t wch;
1234 sprintf (tmp, (base == 10 ? "%.*s%0*d" : "%.*s%0*X"), cp - last_str,
1235 last_str, now->val.str.lenmb - (cp - last_str), from);
1237 get_character (now, charmap, repertoire, &seq, &wch);
1239 if (seq != NULL && seq->nbytes == 1)
1240 /* Yep, we can store information about this byte sequence. */
1241 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1243 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1244 /* We have the UCS4 position. */
1245 *find_idx (ctype, &ctype->class_collection,
1246 &ctype->class_collection_max,
1247 &ctype->class_collection_act, wch) |= class_bit;
1249 if (handle_digits == 1)
1251 /* We must store the digit values. */
1252 if (ctype->mbdigits_act == ctype->mbdigits_max)
1254 ctype->mbdigits_max *= 2;
1255 ctype->mbdigits = xrealloc (ctype->mbdigits,
1256 (ctype->mbdigits_max
1257 * sizeof (char *)));
1258 ctype->wcdigits_max *= 2;
1259 ctype->wcdigits = xrealloc (ctype->wcdigits,
1260 (ctype->wcdigits_max
1261 * sizeof (uint32_t)));
1264 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1265 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1267 else if (handle_digits == 2)
1269 /* We must store the digit values. */
1270 if (ctype->outdigits_act >= 10)
1272 lr_error (ldfile, _("\
1273 %s: field `%s' does not contain exactly ten entries"),
1274 "LC_CTYPE", "outdigit");
1275 return;
1278 ctype->mboutdigits[ctype->outdigits_act] = seq;
1279 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1280 ++ctype->outdigits_act;
1287 /* Ellipsis like in `<U1234>..<U2345>'. */
1288 static void
1289 charclass_ucs4_ellipsis (struct linereader *ldfile,
1290 struct locale_ctype_t *ctype,
1291 struct charmap_t *charmap,
1292 struct repertoire_t *repertoire,
1293 struct token *now, uint32_t last_wch,
1294 unsigned long int class256_bit,
1295 unsigned long int class_bit, int ignore_content,
1296 int handle_digits)
1298 if (last_wch > now->val.ucs4)
1300 lr_error (ldfile, _("\
1301 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1302 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1303 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1304 return;
1307 if (!ignore_content)
1308 while (++last_wch <= now->val.ucs4)
1310 /* We have to find out whether there is a byte sequence corresponding
1311 to this UCS4 value. */
1312 struct charseq *seq = repertoire_find_seq (repertoire, last_wch);
1314 /* If this is the first time we look for this sequence create a new
1315 entry. */
1316 if (seq == NULL)
1318 /* Find the symbolic name for this UCS4 value. */
1319 const char *symbol = repertoire_find_symbol (repertoire, last_wch);
1320 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1321 *newp = last_wch;
1323 if (symbol != NULL)
1324 /* We have a name, now search the multibyte value. */
1325 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1327 if (seq == NULL)
1329 /* We have to create a fake entry. */
1330 static const struct charseq negative
1331 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1332 seq = (struct charseq *) &negative;
1334 else
1335 seq->ucs4 = last_wch;
1337 insert_entry (&repertoire->seq_table, newp, 4, seq);
1340 /* We have a name, now search the multibyte value. */
1341 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1342 /* Yep, we can store information about this byte sequence. */
1343 ctype->class256_collection[(size_t) seq->bytes[0]]
1344 |= class256_bit;
1346 /* And of course we have the UCS4 position. */
1347 if (class_bit != 0 && class_bit != 0)
1348 *find_idx (ctype, &ctype->class_collection,
1349 &ctype->class_collection_max,
1350 &ctype->class_collection_act, last_wch) |= class_bit;
1352 if (handle_digits == 1)
1354 /* We must store the digit values. */
1355 if (ctype->mbdigits_act == ctype->mbdigits_max)
1357 ctype->mbdigits_max *= 2;
1358 ctype->mbdigits = xrealloc (ctype->mbdigits,
1359 (ctype->mbdigits_max
1360 * sizeof (char *)));
1361 ctype->wcdigits_max *= 2;
1362 ctype->wcdigits = xrealloc (ctype->wcdigits,
1363 (ctype->wcdigits_max
1364 * sizeof (uint32_t)));
1367 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1368 ? seq : NULL);
1369 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1371 else if (handle_digits == 2)
1373 /* We must store the digit values. */
1374 if (ctype->outdigits_act >= 10)
1376 lr_error (ldfile, _("\
1377 %s: field `%s' does not contain exactly ten entries"),
1378 "LC_CTYPE", "outdigit");
1379 return;
1382 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1383 ? seq : NULL);
1384 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1385 ++ctype->outdigits_act;
1391 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1392 static void
1393 charclass_charcode_ellipsis (struct linereader *ldfile,
1394 struct locale_ctype_t *ctype,
1395 struct charmap_t *charmap,
1396 struct repertoire_t *repertoire,
1397 struct token *now, char *last_charcode,
1398 uint32_t last_charcode_len,
1399 unsigned long int class256_bit,
1400 unsigned long int class_bit, int ignore_content,
1401 int handle_digits)
1403 /* First check whether the to-value is larger. */
1404 if (now->val.charcode.nbytes != last_charcode_len)
1406 lr_error (ldfile, _("\
1407 start end end character sequence of range must have the same length"));
1408 return;
1411 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1413 lr_error (ldfile, _("\
1414 to-value character sequence is smaller than from-value sequence"));
1415 return;
1418 if (!ignore_content)
1422 /* Increment the byte sequence value. */
1423 struct charseq *seq;
1424 uint32_t wch;
1425 int i;
1427 for (i = last_charcode_len - 1; i >= 0; --i)
1428 if (++last_charcode[i] != 0)
1429 break;
1431 if (last_charcode_len == 1)
1432 /* Of course we have the charcode value. */
1433 ctype->class256_collection[(size_t) last_charcode[0]]
1434 |= class256_bit;
1436 /* Find the symbolic name. */
1437 seq = charmap_find_symbol (charmap, last_charcode,
1438 last_charcode_len);
1439 if (seq != NULL)
1441 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1442 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1443 strlen (seq->name));
1444 wch = seq->ucs4;
1446 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1447 *find_idx (ctype, &ctype->class_collection,
1448 &ctype->class_collection_max,
1449 &ctype->class_collection_act, wch) |= class_bit;
1451 else
1452 wch = ILLEGAL_CHAR_VALUE;
1454 if (handle_digits == 1)
1456 /* We must store the digit values. */
1457 if (ctype->mbdigits_act == ctype->mbdigits_max)
1459 ctype->mbdigits_max *= 2;
1460 ctype->mbdigits = xrealloc (ctype->mbdigits,
1461 (ctype->mbdigits_max
1462 * sizeof (char *)));
1463 ctype->wcdigits_max *= 2;
1464 ctype->wcdigits = xrealloc (ctype->wcdigits,
1465 (ctype->wcdigits_max
1466 * sizeof (uint32_t)));
1469 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1470 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1471 seq->nbytes = last_charcode_len;
1473 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1474 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1476 else if (handle_digits == 2)
1478 struct charseq *seq;
1479 /* We must store the digit values. */
1480 if (ctype->outdigits_act >= 10)
1482 lr_error (ldfile, _("\
1483 %s: field `%s' does not contain exactly ten entries"),
1484 "LC_CTYPE", "outdigit");
1485 return;
1488 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1489 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1490 seq->nbytes = last_charcode_len;
1492 ctype->mboutdigits[ctype->outdigits_act] = seq;
1493 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1494 ++ctype->outdigits_act;
1497 while (memcmp (last_charcode, now->val.charcode.bytes,
1498 last_charcode_len) != 0);
1503 /* Read one transliteration entry. */
1504 static uint32_t *
1505 read_widestring (struct linereader *ldfile, struct token *now,
1506 struct charmap_t *charmap, struct repertoire_t *repertoire)
1508 uint32_t *wstr;
1510 if (now->tok == tok_default_missing)
1511 /* The special name "" will denote this case. */
1512 wstr = (uint32_t *) L"";
1513 else if (now->tok == tok_bsymbol)
1515 /* Get the value from the repertoire. */
1516 wstr = xmalloc (2 * sizeof (uint32_t));
1517 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1518 now->val.str.lenmb);
1519 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1520 /* We cannot proceed, we don't know the UCS4 value. */
1521 return NULL;
1523 wstr[1] = 0;
1525 else if (now->tok == tok_ucs4)
1527 wstr = xmalloc (2 * sizeof (uint32_t));
1528 wstr[0] = now->val.ucs4;
1529 wstr[1] = 0;
1531 else if (now->tok == tok_charcode)
1533 /* Argh, we have to convert to the symbol name first and then to the
1534 UCS4 value. */
1535 struct charseq *seq = charmap_find_symbol (charmap,
1536 now->val.str.startmb,
1537 now->val.str.lenmb);
1538 if (seq == NULL)
1539 /* Cannot find the UCS4 value. */
1540 return NULL;
1542 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1543 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1544 strlen (seq->name));
1545 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1546 /* We cannot proceed, we don't know the UCS4 value. */
1547 return NULL;
1549 wstr = xmalloc (2 * sizeof (uint32_t));
1550 wstr[0] = seq->ucs4;
1551 wstr[1] = 0;
1553 else if (now->tok == tok_string)
1555 wstr = now->val.str.startwc;
1556 if (wstr[0] == 0)
1557 return NULL;
1559 else
1561 if (now->tok != tok_eol && now->tok != tok_eof)
1562 lr_ignore_rest (ldfile, 0);
1563 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1564 return (uint32_t *) -1l;
1567 return wstr;
1571 static void
1572 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1573 struct token *now, struct charmap_t *charmap,
1574 struct repertoire_t *repertoire)
1576 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1577 struct translit_t *result;
1578 struct translit_to_t **top;
1579 struct obstack *ob = &ctype->mem_pool;
1580 int first;
1581 int ignore;
1583 if (from_wstr == NULL)
1584 /* There is no valid from string. */
1585 return;
1587 result = (struct translit_t *) obstack_alloc (ob,
1588 sizeof (struct translit_t));
1589 result->from = from_wstr;
1590 result->next = NULL;
1591 result->to = NULL;
1592 top = &result->to;
1593 first = 1;
1594 ignore = 0;
1596 while (1)
1598 uint32_t *to_wstr;
1600 /* Next we have one or more transliterations. They are
1601 separated by semicolons. */
1602 now = lr_token (ldfile, charmap, repertoire);
1604 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1606 /* One string read. */
1607 const uint32_t zero = 0;
1609 if (!ignore)
1611 obstack_grow (ob, &zero, 4);
1612 to_wstr = obstack_finish (ob);
1614 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1615 (*top)->str = to_wstr;
1616 (*top)->next = NULL;
1619 if (now->tok == tok_eol)
1621 result->next = ctype->translit;
1622 ctype->translit = result;
1623 return;
1626 if (!ignore)
1627 top = &(*top)->next;
1628 ignore = 0;
1630 else
1632 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1633 if (to_wstr == (uint32_t *) -1l)
1635 /* An error occurred. */
1636 obstack_free (ob, result);
1637 return;
1640 if (to_wstr == NULL)
1641 ignore = 1;
1642 else
1643 /* This value is usable. */
1644 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1646 first = 0;
1652 /* The parser for the LC_CTYPE section of the locale definition. */
1653 void
1654 ctype_read (struct linereader *ldfile, struct localedef_t *result,
1655 struct charmap_t *charmap, const char *repertoire_name,
1656 int ignore_content)
1658 struct repertoire_t *repertoire = NULL;
1659 struct locale_ctype_t *ctype;
1660 struct token *now;
1661 enum token_t nowtok;
1662 size_t cnt;
1663 struct charseq *last_seq;
1664 uint32_t last_wch = 0;
1665 enum token_t last_token;
1666 enum token_t ellipsis_token;
1667 char last_charcode[16];
1668 size_t last_charcode_len = 0;
1669 const char *last_str = NULL;
1670 int mapidx;
1672 /* Get the repertoire we have to use. */
1673 if (repertoire_name != NULL)
1674 repertoire = repertoire_read (repertoire_name);
1676 /* The rest of the line containing `LC_CTYPE' must be free. */
1677 lr_ignore_rest (ldfile, 1);
1682 now = lr_token (ldfile, charmap, NULL);
1683 nowtok = now->tok;
1685 while (nowtok == tok_eol);
1687 /* If we see `copy' now we are almost done. */
1688 if (nowtok == tok_copy)
1690 handle_copy (ldfile, charmap, repertoire, result, tok_lc_ctype, LC_CTYPE,
1691 "LC_CTYPE", ignore_content);
1692 return;
1695 /* Prepare the data structures. */
1696 ctype_startup (ldfile, result, charmap, ignore_content);
1697 ctype = result->categories[LC_CTYPE].ctype;
1699 /* Remember the repertoire we use. */
1700 if (!ignore_content)
1701 ctype->repertoire = repertoire;
1703 while (1)
1705 unsigned long int class_bit = 0;
1706 unsigned long int class256_bit = 0;
1707 int handle_digits = 0;
1709 /* Of course we don't proceed beyond the end of file. */
1710 if (nowtok == tok_eof)
1711 break;
1713 /* Ingore empty lines. */
1714 if (nowtok == tok_eol)
1716 now = lr_token (ldfile, charmap, NULL);
1717 nowtok = now->tok;
1718 continue;
1721 switch (nowtok)
1723 case tok_charclass:
1724 now = lr_token (ldfile, charmap, NULL);
1725 while (now->tok == tok_ident || now->tok == tok_string)
1727 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1728 now = lr_token (ldfile, charmap, NULL);
1729 if (now->tok != tok_semicolon)
1730 break;
1731 now = lr_token (ldfile, charmap, NULL);
1733 if (now->tok != tok_eol)
1734 SYNTAX_ERROR (_("\
1735 %s: syntax error in definition of new character class"), "LC_CTYPE");
1736 break;
1738 case tok_charconv:
1739 now = lr_token (ldfile, charmap, NULL);
1740 while (now->tok == tok_ident || now->tok == tok_string)
1742 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
1743 now = lr_token (ldfile, charmap, NULL);
1744 if (now->tok != tok_semicolon)
1745 break;
1746 now = lr_token (ldfile, charmap, NULL);
1748 if (now->tok != tok_eol)
1749 SYNTAX_ERROR (_("\
1750 %s: syntax error in definition of new character map"), "LC_CTYPE");
1751 break;
1753 case tok_class:
1754 /* Ignore the rest of the line if we don't need the input of
1755 this line. */
1756 if (ignore_content)
1758 lr_ignore_rest (ldfile, 0);
1759 break;
1762 /* We simply forget the `class' keyword and use the following
1763 operand to determine the bit. */
1764 now = lr_token (ldfile, charmap, NULL);
1765 if (now->tok == tok_ident || now->tok == tok_string)
1767 /* Must be one of the predefined class names. */
1768 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1769 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
1770 break;
1771 if (cnt >= ctype->nr_charclass)
1773 #ifdef PREDEFINED_CLASSES
1774 if (now->val.str.lenmb == 8
1775 && memcmp ("special1", now->val.str.startmb, 8) == 0)
1776 class_bit = _ISwspecial1;
1777 else if (now->val.str.lenmb == 8
1778 && memcmp ("special2", now->val.str.startmb, 8) == 0)
1779 class_bit = _ISwspecial2;
1780 else if (now->val.str.lenmb == 8
1781 && memcmp ("special3", now->val.str.startmb, 8) == 0)
1782 class_bit = _ISwspecial3;
1783 else
1784 #endif
1786 lr_error (ldfile, _("\
1787 unknown character class `%s' in category `LC_CTYPE'"),
1788 now->val.str.startmb);
1789 free (now->val.str.startmb);
1791 lr_ignore_rest (ldfile, 0);
1792 continue;
1795 else
1796 class_bit = _ISwbit (cnt);
1798 free (now->val.str.startmb);
1800 else if (now->tok == tok_digit)
1801 goto handle_tok_digit;
1802 else if (now->tok < tok_upper || now->tok > tok_blank)
1803 goto err_label;
1804 else
1806 class_bit = BITw (now->tok);
1807 class256_bit = BIT (now->tok);
1810 /* The next character must be a semicolon. */
1811 now = lr_token (ldfile, charmap, NULL);
1812 if (now->tok != tok_semicolon)
1813 goto err_label;
1814 goto read_charclass;
1816 case tok_upper:
1817 case tok_lower:
1818 case tok_alpha:
1819 case tok_alnum:
1820 case tok_space:
1821 case tok_cntrl:
1822 case tok_punct:
1823 case tok_graph:
1824 case tok_print:
1825 case tok_xdigit:
1826 case tok_blank:
1827 /* Ignore the rest of the line if we don't need the input of
1828 this line. */
1829 if (ignore_content)
1831 lr_ignore_rest (ldfile, 0);
1832 break;
1835 class_bit = BITw (now->tok);
1836 class256_bit = BIT (now->tok);
1837 handle_digits = 0;
1838 read_charclass:
1839 ctype->class_done |= class_bit;
1840 last_token = tok_none;
1841 ellipsis_token = tok_none;
1842 now = lr_token (ldfile, charmap, NULL);
1843 while (now->tok != tok_eol && now->tok != tok_eof)
1845 uint32_t wch;
1846 struct charseq *seq;
1848 if (ellipsis_token == tok_none)
1850 if (get_character (now, charmap, repertoire, &seq, &wch))
1851 goto err_label;
1853 if (!ignore_content && seq != NULL && seq->nbytes == 1)
1854 /* Yep, we can store information about this byte
1855 sequence. */
1856 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1858 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
1859 && class_bit != 0)
1860 /* We have the UCS4 position. */
1861 *find_idx (ctype, &ctype->class_collection,
1862 &ctype->class_collection_max,
1863 &ctype->class_collection_act, wch) |= class_bit;
1865 last_token = now->tok;
1866 /* Terminate the string. */
1867 if (last_token == tok_bsymbol)
1869 now->val.str.startmb[now->val.str.lenmb] = '\0';
1870 last_str = now->val.str.startmb;
1872 else
1873 last_str = NULL;
1874 last_seq = seq;
1875 last_wch = wch;
1876 memcpy (last_charcode, now->val.charcode.bytes, 16);
1877 last_charcode_len = now->val.charcode.nbytes;
1879 if (!ignore_content && handle_digits == 1)
1881 /* We must store the digit values. */
1882 if (ctype->mbdigits_act == ctype->mbdigits_max)
1884 ctype->mbdigits_max += 10;
1885 ctype->mbdigits = xrealloc (ctype->mbdigits,
1886 (ctype->mbdigits_max
1887 * sizeof (char *)));
1888 ctype->wcdigits_max += 10;
1889 ctype->wcdigits = xrealloc (ctype->wcdigits,
1890 (ctype->wcdigits_max
1891 * sizeof (uint32_t)));
1894 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1895 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1897 else if (!ignore_content && handle_digits == 2)
1899 /* We must store the digit values. */
1900 if (ctype->outdigits_act >= 10)
1902 lr_error (ldfile, _("\
1903 %s: field `%s' does not contain exactly ten entries"),
1904 "LC_CTYPE", "outdigit");
1905 goto err_label;
1908 ctype->mboutdigits[ctype->outdigits_act] = seq;
1909 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1910 ++ctype->outdigits_act;
1913 else
1915 /* Now it gets complicated. We have to resolve the
1916 ellipsis problem. First we must distinguish between
1917 the different kind of ellipsis and this must match the
1918 tokens we have seen. */
1919 assert (last_token != tok_none);
1921 if (last_token != now->tok)
1923 lr_error (ldfile, _("\
1924 ellipsis range must be marked by two operands of same type"));
1925 lr_ignore_rest (ldfile, 0);
1926 break;
1929 if (last_token == tok_bsymbol)
1931 if (ellipsis_token == tok_ellipsis3)
1932 lr_error (ldfile, _("with symbolic name range values \
1933 the absolute ellipsis `...' must not be used"));
1935 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
1936 repertoire, now, last_str,
1937 class256_bit, class_bit,
1938 (ellipsis_token
1939 == tok_ellipsis4
1940 ? 10 : 16),
1941 ignore_content,
1942 handle_digits);
1944 else if (last_token == tok_ucs4)
1946 if (ellipsis_token != tok_ellipsis2)
1947 lr_error (ldfile, _("\
1948 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1950 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
1951 repertoire, now, last_wch,
1952 class256_bit, class_bit,
1953 ignore_content, handle_digits);
1955 else
1957 assert (last_token == tok_charcode);
1959 if (ellipsis_token != tok_ellipsis3)
1960 lr_error (ldfile, _("\
1961 with character code range values one must use the absolute ellipsis `...'"));
1963 charclass_charcode_ellipsis (ldfile, ctype, charmap,
1964 repertoire, now,
1965 last_charcode,
1966 last_charcode_len,
1967 class256_bit, class_bit,
1968 ignore_content,
1969 handle_digits);
1972 /* Now we have used the last value. */
1973 last_token = tok_none;
1976 /* Next we expect a semicolon or the end of the line. */
1977 now = lr_token (ldfile, charmap, NULL);
1978 if (now->tok == tok_eol || now->tok == tok_eof)
1979 break;
1981 if (last_token != tok_none
1982 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4)
1984 ellipsis_token = now->tok;
1985 now = lr_token (ldfile, charmap, NULL);
1986 continue;
1989 if (now->tok != tok_semicolon)
1990 goto err_label;
1992 /* And get the next character. */
1993 now = lr_token (ldfile, charmap, NULL);
1995 ellipsis_token = tok_none;
1997 break;
1999 case tok_digit:
2000 /* Ignore the rest of the line if we don't need the input of
2001 this line. */
2002 if (ignore_content)
2004 lr_ignore_rest (ldfile, 0);
2005 break;
2008 handle_tok_digit:
2009 class_bit = _ISwdigit;
2010 class256_bit = _ISdigit;
2011 handle_digits = 1;
2012 goto read_charclass;
2014 case tok_outdigit:
2015 /* Ignore the rest of the line if we don't need the input of
2016 this line. */
2017 if (ignore_content)
2019 lr_ignore_rest (ldfile, 0);
2020 break;
2023 if (ctype->outdigits_act != 0)
2024 lr_error (ldfile, _("\
2025 %s: field `%s' declared more than once"),
2026 "LC_CTYPE", "outdigit");
2027 class_bit = 0;
2028 class256_bit = 0;
2029 handle_digits = 2;
2030 goto read_charclass;
2032 case tok_toupper:
2033 /* Ignore the rest of the line if we don't need the input of
2034 this line. */
2035 if (ignore_content)
2037 lr_ignore_rest (ldfile, 0);
2038 break;
2041 mapidx = 0;
2042 goto read_mapping;
2044 case tok_tolower:
2045 /* Ignore the rest of the line if we don't need the input of
2046 this line. */
2047 if (ignore_content)
2049 lr_ignore_rest (ldfile, 0);
2050 break;
2053 mapidx = 1;
2054 goto read_mapping;
2056 case tok_map:
2057 /* Ignore the rest of the line if we don't need the input of
2058 this line. */
2059 if (ignore_content)
2061 lr_ignore_rest (ldfile, 0);
2062 break;
2065 /* We simply forget the `map' keyword and use the following
2066 operand to determine the mapping. */
2067 now = lr_token (ldfile, charmap, NULL);
2068 if (now->tok == tok_ident || now->tok == tok_string)
2070 size_t cnt;
2072 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2073 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2074 break;
2076 if (cnt < ctype->map_collection_nr)
2077 mapidx = cnt;
2078 else
2080 lr_error (ldfile, _("unknown map `%s'"),
2081 now->val.str.startmb);
2082 lr_ignore_rest (ldfile, 0);
2083 break;
2086 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2087 goto err_label;
2088 else
2089 mapidx = now->tok - tok_toupper;
2091 now = lr_token (ldfile, charmap, NULL);
2092 /* This better should be a semicolon. */
2093 if (now->tok != tok_semicolon)
2094 goto err_label;
2096 read_mapping:
2097 /* Test whether this mapping was already defined. */
2098 if (ctype->tomap_done[mapidx])
2100 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2101 ctype->mapnames[mapidx]);
2102 lr_ignore_rest (ldfile, 0);
2103 break;
2105 ctype->tomap_done[mapidx] = 1;
2107 now = lr_token (ldfile, charmap, NULL);
2108 while (now->tok != tok_eol && now->tok != tok_eof)
2110 struct charseq *from_seq;
2111 uint32_t from_wch;
2112 struct charseq *to_seq;
2113 uint32_t to_wch;
2115 /* Every pair starts with an opening brace. */
2116 if (now->tok != tok_open_brace)
2117 goto err_label;
2119 /* Next comes the from-value. */
2120 now = lr_token (ldfile, charmap, NULL);
2121 if (get_character (now, charmap, repertoire, &from_seq,
2122 &from_wch) != 0)
2123 goto err_label;
2125 /* The next is a comma. */
2126 now = lr_token (ldfile, charmap, NULL);
2127 if (now->tok != tok_comma)
2128 goto err_label;
2130 /* And the other value. */
2131 now = lr_token (ldfile, charmap, NULL);
2132 if (get_character (now, charmap, repertoire, &to_seq,
2133 &to_wch) != 0)
2134 goto err_label;
2136 /* And the last thing is the closing brace. */
2137 now = lr_token (ldfile, charmap, NULL);
2138 if (now->tok != tok_close_brace)
2139 goto err_label;
2141 if (!ignore_content)
2143 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2144 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2145 /* We can use this value. */
2146 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2147 = to_seq->bytes[0];
2149 if (from_wch != ILLEGAL_CHAR_VALUE
2150 && to_wch != ILLEGAL_CHAR_VALUE)
2151 /* Both correct values. */
2152 *find_idx (ctype, &ctype->map_collection[mapidx],
2153 &ctype->map_collection_max[mapidx],
2154 &ctype->map_collection_act[mapidx],
2155 from_wch) = to_wch;
2158 /* Now comes a semicolon or the end of the line/file. */
2159 now = lr_token (ldfile, charmap, NULL);
2160 if (now->tok == tok_semicolon)
2161 now = lr_token (ldfile, charmap, NULL);
2163 break;
2165 case tok_translit_start:
2166 /* Ignore the rest of the line if we don't need the input of
2167 this line. */
2168 if (ignore_content)
2170 lr_ignore_rest (ldfile, 0);
2171 break;
2174 /* The rest of the line better should be empty. */
2175 lr_ignore_rest (ldfile, 1);
2177 /* We count here the number of allocated entries in the `translit'
2178 array. */
2179 cnt = 0;
2181 /* We proceed until we see the `translit_end' token. */
2182 while (now = lr_token (ldfile, charmap, repertoire),
2183 now->tok != tok_translit_end && now->tok != tok_eof)
2185 if (now->tok == tok_eol)
2186 /* Ignore empty lines. */
2187 continue;
2189 if (now->tok == tok_translit_end)
2191 lr_ignore_rest (ldfile, 0);
2192 break;
2195 if (now->tok == tok_include)
2197 /* We have to include locale. */
2198 const char *locale_name;
2199 const char *repertoire_name;
2201 now = lr_token (ldfile, charmap, NULL);
2202 /* This should be a string or an identifier. In any
2203 case something to name a locale. */
2204 if (now->tok != tok_string && now->tok != tok_ident)
2206 translit_syntax:
2207 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2208 lr_ignore_rest (ldfile, 0);
2209 continue;
2211 locale_name = now->val.str.startmb;
2213 /* Next should be a semicolon. */
2214 now = lr_token (ldfile, charmap, NULL);
2215 if (now->tok != tok_semicolon)
2216 goto translit_syntax;
2218 /* Now the repertoire name. */
2219 now = lr_token (ldfile, charmap, NULL);
2220 if ((now->tok != tok_string && now->tok != tok_ident)
2221 || now->val.str.startmb == NULL)
2222 goto translit_syntax;
2223 repertoire_name = now->val.str.startmb;
2225 /* We must not have more than one `include'. */
2226 if (ctype->translit_copy_locale != NULL)
2228 lr_error (ldfile, _("\
2229 %s: only one `include' instruction allowed"), "LC_CTYPE");
2230 lr_ignore_rest (ldfile, 0);
2231 continue;
2234 ctype->translit_copy_locale = locale_name;
2235 ctype->translit_copy_repertoire = repertoire_name;
2237 /* The rest of the line must be empty. */
2238 lr_ignore_rest (ldfile, 1);
2239 continue;
2242 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2244 break;
2246 case tok_ident:
2247 /* Ignore the rest of the line if we don't need the input of
2248 this line. */
2249 if (ignore_content)
2251 lr_ignore_rest (ldfile, 0);
2252 break;
2255 /* This could mean one of several things. First test whether
2256 it's a character class name. */
2257 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2258 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2259 break;
2260 if (cnt < ctype->nr_charclass)
2262 class_bit = _ISwbit (cnt);
2263 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2264 free (now->val.str.startmb);
2265 goto read_charclass;
2267 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2268 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2269 break;
2270 if (cnt < ctype->map_collection_nr)
2272 mapidx = cnt;
2273 free (now->val.str.startmb);
2274 goto read_mapping;
2276 #ifdef PREDEFINED_CLASSES
2277 if (strcmp (now->val.str.startmb, "special1") == 0)
2279 class_bit = _ISwspecial1;
2280 free (now->val.str.startmb);
2281 goto read_charclass;
2283 if (strcmp (now->val.str.startmb, "special2") == 0)
2285 class_bit = _ISwspecial2;
2286 free (now->val.str.startmb);
2287 goto read_charclass;
2289 if (strcmp (now->val.str.startmb, "special3") == 0)
2291 class_bit = _ISwspecial3;
2292 free (now->val.str.startmb);
2293 goto read_charclass;
2295 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2297 mapidx = 2;
2298 goto read_mapping;
2300 #endif
2301 break;
2303 case tok_end:
2304 /* Next we assume `LC_CTYPE'. */
2305 now = lr_token (ldfile, charmap, NULL);
2306 if (now->tok == tok_eof)
2307 break;
2308 if (now->tok == tok_eol)
2309 lr_error (ldfile, _("%s: incomplete `END' line"),
2310 "LC_CTYPE");
2311 else if (now->tok != tok_lc_ctype)
2312 lr_error (ldfile, _("\
2313 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2314 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2315 return;
2317 default:
2318 err_label:
2319 if (now->tok != tok_eof)
2320 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2323 /* Prepare for the next round. */
2324 now = lr_token (ldfile, charmap, NULL);
2325 nowtok = now->tok;
2328 /* When we come here we reached the end of the file. */
2329 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2333 static void
2334 set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2335 struct repertoire_t *repertoire)
2337 size_t cnt;
2339 /* These function defines the default values for the classes and conversions
2340 according to POSIX.2 2.5.2.1.
2341 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2342 Don't move them unless you know what you do! */
2344 void set_default (int bitpos, int from, int to)
2346 char tmp[2];
2347 int ch;
2348 int bit = _ISbit (bitpos);
2349 int bitw = _ISwbit (bitpos);
2350 /* Define string. */
2351 strcpy (tmp, "?");
2353 for (ch = from; ch <= to; ++ch)
2355 uint32_t value;
2356 struct charseq *seq;
2357 tmp[0] = ch;
2359 value = repertoire_find_value (repertoire, tmp, 1);
2360 if (value == ILLEGAL_CHAR_VALUE)
2362 if (!be_quiet)
2363 error (0, 0, _("\
2364 %s: character `%s' not defined in repertoire while needed as default value"),
2365 "LC_CTYPE", tmp);
2367 else
2368 ELEM (ctype, class_collection, , value) |= bitw;
2370 seq = charmap_find_value (charmap, tmp, 1);
2371 if (seq == NULL)
2373 if (!be_quiet)
2374 error (0, 0, _("\
2375 %s: character `%s' not defined in charmap while needed as default value"),
2376 "LC_CTYPE", tmp);
2378 else if (seq->nbytes != 1)
2379 error (0, 0, _("\
2380 %s: character `%s' in charmap not representable with one byte"),
2381 "LC_CTYPE", tmp);
2382 else
2383 ctype->class256_collection[seq->bytes[0]] |= bit;
2387 /* Set default values if keyword was not present. */
2388 if ((ctype->class_done & BITw (tok_upper)) == 0)
2389 /* "If this keyword [lower] is not specified, the lowercase letters
2390 `A' through `Z', ..., shall automatically belong to this class,
2391 with implementation defined character values." [P1003.2, 2.5.2.1] */
2392 set_default (BITPOS (tok_upper), 'A', 'Z');
2394 if ((ctype->class_done & BITw (tok_lower)) == 0)
2395 /* "If this keyword [lower] is not specified, the lowercase letters
2396 `a' through `z', ..., shall automatically belong to this class,
2397 with implementation defined character values." [P1003.2, 2.5.2.1] */
2398 set_default (BITPOS (tok_lower), 'a', 'z');
2400 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2402 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2403 class `lower' *must* be in class `alpha'. */
2404 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2405 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2407 for (cnt = 0; cnt < 256; ++cnt)
2408 if ((ctype->class256_collection[cnt] & mask) != 0)
2409 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2411 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2412 if ((ctype->class_collection[cnt] & maskw) != 0)
2413 ctype->class_collection[cnt] |= BITw (tok_alpha);
2416 if ((ctype->class_done & BITw (tok_digit)) == 0)
2417 /* "If this keyword [digit] is not specified, the digits `0' through
2418 `9', ..., shall automatically belong to this class, with
2419 implementation-defined character values." [P1003.2, 2.5.2.1] */
2420 set_default (BITPOS (tok_digit), '0', '9');
2422 /* "Only characters specified for the `alpha' and `digit' keyword
2423 shall be specified. Characters specified for the keyword `alpha'
2424 and `digit' are automatically included in this class. */
2426 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2427 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2429 for (cnt = 0; cnt < 256; ++cnt)
2430 if ((ctype->class256_collection[cnt] & mask) != 0)
2431 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2433 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2434 if ((ctype->class_collection[cnt] & maskw) != 0)
2435 ctype->class_collection[cnt] |= BITw (tok_alnum);
2438 if ((ctype->class_done & BITw (tok_space)) == 0)
2439 /* "If this keyword [space] is not specified, the characters <space>,
2440 <form-feed>, <newline>, <carriage-return>, <tab>, and
2441 <vertical-tab>, ..., shall automatically belong to this class,
2442 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2444 uint32_t value;
2445 struct charseq *seq;
2447 value = repertoire_find_value (repertoire, "space", 5);
2448 if (value == ILLEGAL_CHAR_VALUE)
2450 if (!be_quiet)
2451 error (0, 0, _("\
2452 %s: character `%s' not defined while needed as default value"),
2453 "LC_CTYPE", "<space>");
2455 else
2456 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2458 seq = charmap_find_value (charmap, "space", 5);
2459 if (seq == NULL)
2461 if (!be_quiet)
2462 error (0, 0, _("\
2463 %s: character `%s' not defined while needed as default value"),
2464 "LC_CTYPE", "<space>");
2466 else if (seq->nbytes != 1)
2467 error (0, 0, _("\
2468 %s: character `%s' in charmap not representable with one byte"),
2469 "LC_CTYPE", "<space>");
2470 else
2471 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2474 value = repertoire_find_value (repertoire, "form-feed", 9);
2475 if (value == ILLEGAL_CHAR_VALUE)
2477 if (!be_quiet)
2478 error (0, 0, _("\
2479 %s: character `%s' not defined while needed as default value"),
2480 "LC_CTYPE", "<form-feed>");
2482 else
2483 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2485 seq = charmap_find_value (charmap, "form-feed", 9);
2486 if (seq == NULL)
2488 if (!be_quiet)
2489 error (0, 0, _("\
2490 %s: character `%s' not defined while needed as default value"),
2491 "LC_CTYPE", "<form-feed>");
2493 else if (seq->nbytes != 1)
2494 error (0, 0, _("\
2495 %s: character `%s' in charmap not representable with one byte"),
2496 "LC_CTYPE", "<form-feed>");
2497 else
2498 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2501 value = repertoire_find_value (repertoire, "newline", 7);
2502 if (value == ILLEGAL_CHAR_VALUE)
2504 if (!be_quiet)
2505 error (0, 0, _("\
2506 %s: character `%s' not defined while needed as default value"),
2507 "LC_CTYPE", "<newline>");
2509 else
2510 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2512 seq = charmap_find_value (charmap, "newline", 7);
2513 if (seq == NULL)
2515 if (!be_quiet)
2516 error (0, 0, _("\
2517 character `%s' not defined while needed as default value"),
2518 "<newline>");
2520 else if (seq->nbytes != 1)
2521 error (0, 0, _("\
2522 %s: character `%s' in charmap not representable with one byte"),
2523 "LC_CTYPE", "<newline>");
2524 else
2525 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2528 value = repertoire_find_value (repertoire, "carriage-return", 15);
2529 if (value == ILLEGAL_CHAR_VALUE)
2531 if (!be_quiet)
2532 error (0, 0, _("\
2533 %s: character `%s' not defined while needed as default value"),
2534 "LC_CTYPE", "<carriage-return>");
2536 else
2537 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2539 seq = charmap_find_value (charmap, "carriage-return", 15);
2540 if (seq == NULL)
2542 if (!be_quiet)
2543 error (0, 0, _("\
2544 %s: character `%s' not defined while needed as default value"),
2545 "LC_CTYPE", "<carriage-return>");
2547 else if (seq->nbytes != 1)
2548 error (0, 0, _("\
2549 %s: character `%s' in charmap not representable with one byte"),
2550 "LC_CTYPE", "<carriage-return>");
2551 else
2552 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2555 value = repertoire_find_value (repertoire, "tab", 3);
2556 if (value == ILLEGAL_CHAR_VALUE)
2558 if (!be_quiet)
2559 error (0, 0, _("\
2560 %s: character `%s' not defined while needed as default value"),
2561 "LC_CTYPE", "<tab>");
2563 else
2564 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2566 seq = charmap_find_value (charmap, "tab", 3);
2567 if (seq == NULL)
2569 if (!be_quiet)
2570 error (0, 0, _("\
2571 %s: character `%s' not defined while needed as default value"),
2572 "LC_CTYPE", "<tab>");
2574 else if (seq->nbytes != 1)
2575 error (0, 0, _("\
2576 %s: character `%s' in charmap not representable with one byte"),
2577 "LC_CTYPE", "<tab>");
2578 else
2579 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2582 value = repertoire_find_value (repertoire, "vertical-tab", 12);
2583 if (value == ILLEGAL_CHAR_VALUE)
2585 if (!be_quiet)
2586 error (0, 0, _("\
2587 %s: character `%s' not defined while needed as default value"),
2588 "LC_CTYPE", "<vertical-tab>");
2590 else
2591 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2593 seq = charmap_find_value (charmap, "vertical-tab", 12);
2594 if (seq == NULL)
2596 if (!be_quiet)
2597 error (0, 0, _("\
2598 %s: character `%s' not defined while needed as default value"),
2599 "LC_CTYPE", "<vertical-tab>");
2601 else if (seq->nbytes != 1)
2602 error (0, 0, _("\
2603 %s: character `%s' in charmap not representable with one byte"),
2604 "LC_CTYPE", "<vertical-tab>");
2605 else
2606 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2609 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
2610 /* "If this keyword is not specified, the digits `0' to `9', the
2611 uppercase letters `A' through `F', and the lowercase letters `a'
2612 through `f', ..., shell automatically belong to this class, with
2613 implementation defined character values." [P1003.2, 2.5.2.1] */
2615 set_default (BITPOS (tok_xdigit), '0', '9');
2616 set_default (BITPOS (tok_xdigit), 'A', 'F');
2617 set_default (BITPOS (tok_xdigit), 'a', 'f');
2620 if ((ctype->class_done & BITw (tok_blank)) == 0)
2621 /* "If this keyword [blank] is unspecified, the characters <space> and
2622 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2624 uint32_t value;
2625 struct charseq *seq;
2627 value = repertoire_find_value (repertoire, "space", 5);
2628 if (value == ILLEGAL_CHAR_VALUE)
2630 if (!be_quiet)
2631 error (0, 0, _("\
2632 %s: character `%s' not defined while needed as default value"),
2633 "LC_CTYPE", "<space>");
2635 else
2636 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2638 seq = charmap_find_value (charmap, "space", 5);
2639 if (seq == NULL)
2641 if (!be_quiet)
2642 error (0, 0, _("\
2643 %s: character `%s' not defined while needed as default value"),
2644 "LC_CTYPE", "<space>");
2646 else if (seq->nbytes != 1)
2647 error (0, 0, _("\
2648 %s: character `%s' in charmap not representable with one byte"),
2649 "LC_CTYPE", "<space>");
2650 else
2651 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2654 value = repertoire_find_value (repertoire, "tab", 3);
2655 if (value == ILLEGAL_CHAR_VALUE)
2657 if (!be_quiet)
2658 error (0, 0, _("\
2659 %s: character `%s' not defined while needed as default value"),
2660 "LC_CTYPE", "<tab>");
2662 else
2663 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2665 seq = charmap_find_value (charmap, "tab", 3);
2666 if (seq == NULL)
2668 if (!be_quiet)
2669 error (0, 0, _("\
2670 %s: character `%s' not defined while needed as default value"),
2671 "LC_CTYPE", "<tab>");
2673 else if (seq->nbytes != 1)
2674 error (0, 0, _("\
2675 %s: character `%s' in charmap not representable with one byte"),
2676 "LC_CTYPE", "<tab>");
2677 else
2678 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2681 if ((ctype->class_done & BITw (tok_graph)) == 0)
2682 /* "If this keyword [graph] is not specified, characters specified for
2683 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2684 shall belong to this character class." [P1003.2, 2.5.2.1] */
2686 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2687 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2688 size_t cnt;
2690 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2691 if ((ctype->class_collection[cnt] & mask) != 0)
2692 ctype->class_collection[cnt] |= BIT (tok_graph);
2694 for (cnt = 0; cnt < 256; ++cnt)
2695 if ((ctype->class256_collection[cnt] & mask) != 0)
2696 ctype->class256_collection[cnt] |= BIT (tok_graph);
2699 if ((ctype->class_done & BITw (tok_print)) == 0)
2700 /* "If this keyword [print] is not provided, characters specified for
2701 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2702 and the <space> character shall belong to this character class."
2703 [P1003.2, 2.5.2.1] */
2705 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2706 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2707 size_t cnt;
2708 uint32_t space;
2709 struct charseq *seq;
2711 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2712 if ((ctype->class_collection[cnt] & mask) != 0)
2713 ctype->class_collection[cnt] |= BIT (tok_print);
2715 for (cnt = 0; cnt < 256; ++cnt)
2716 if ((ctype->class256_collection[cnt] & mask) != 0)
2717 ctype->class256_collection[cnt] |= BIT (tok_print);
2720 space = repertoire_find_value (repertoire, "space", 5);
2721 if (space == ILLEGAL_CHAR_VALUE)
2723 if (!be_quiet)
2724 error (0, 0, _("\
2725 %s: character `%s' not defined while needed as default value"),
2726 "LC_CTYPE", "<space>");
2728 else
2729 ELEM (ctype, class_collection, , space) |= BIT (tok_print);
2731 seq = charmap_find_value (charmap, "space", 5);
2732 if (seq == NULL)
2734 if (!be_quiet)
2735 error (0, 0, _("\
2736 %s: character `%s' not defined while needed as default value"),
2737 "LC_CTYPE", "<space>");
2739 else if (seq->nbytes != 1)
2740 error (0, 0, _("\
2741 %s: character `%s' in charmap not representable with one byte"),
2742 "LC_CTYPE", "<space>");
2743 else
2744 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
2747 if (ctype->tomap_done[0] == 0)
2748 /* "If this keyword [toupper] is not specified, the lowercase letters
2749 `a' through `z', and their corresponding uppercase letters `A' to
2750 `Z', ..., shall automatically be included, with implementation-
2751 defined character values." [P1003.2, 2.5.2.1] */
2753 char tmp[4];
2754 int ch;
2756 strcpy (tmp, "<?>");
2758 for (ch = 'a'; ch <= 'z'; ++ch)
2760 uint32_t value_from, value_to;
2761 struct charseq *seq_from, *seq_to;
2763 tmp[1] = (char) ch;
2765 value_from = repertoire_find_value (repertoire, &tmp[1], 1);
2766 if (value_from == ILLEGAL_CHAR_VALUE)
2768 if (!be_quiet)
2769 error (0, 0, _("\
2770 %s: character `%s' not defined while needed as default value"),
2771 "LC_CTYPE", tmp);
2773 else
2775 /* This conversion is implementation defined. */
2776 tmp[1] = (char) (ch + ('A' - 'a'));
2777 value_to = repertoire_find_value (repertoire, &tmp[1], 1);
2778 if (value_to == ILLEGAL_CHAR_VALUE)
2780 if (!be_quiet)
2781 error (0, 0, _("\
2782 %s: character `%s' not defined while needed as default value"),
2783 "LC_CTYPE", tmp);
2785 else
2786 /* The index [0] is determined by the order of the
2787 `ctype_map_newP' calls in `ctype_startup'. */
2788 ELEM (ctype, map_collection, [0], value_from) = value_to;
2791 seq_from = charmap_find_value (charmap, &tmp[1], 1);
2792 if (seq_from == NULL)
2794 if (!be_quiet)
2795 error (0, 0, _("\
2796 %s: character `%s' not defined while needed as default value"),
2797 "LC_CTYPE", tmp);
2799 else if (seq_from->nbytes != 1)
2801 if (!be_quiet)
2802 error (0, 0, _("\
2803 %s: character `%s' needed as default value not representable with one byte"),
2804 "LC_CTYPE", tmp);
2806 else
2808 /* This conversion is implementation defined. */
2809 tmp[1] = (char) (ch + ('A' - 'a'));
2810 seq_to = charmap_find_value (charmap, &tmp[1], 1);
2811 if (seq_to == NULL)
2813 if (!be_quiet)
2814 error (0, 0, _("\
2815 %s: character `%s' not defined while needed as default value"),
2816 "LC_CTYPE", tmp);
2818 else if (seq_to->nbytes != 1)
2820 if (!be_quiet)
2821 error (0, 0, _("\
2822 %s: character `%s' needed as default value not representable with one byte"),
2823 "LC_CTYPE", tmp);
2825 else
2826 /* The index [0] is determined by the order of the
2827 `ctype_map_newP' calls in `ctype_startup'. */
2828 ctype->map256_collection[0][seq_from->bytes[0]]
2829 = seq_to->bytes[0];
2834 if (ctype->tomap_done[1] == 0)
2835 /* "If this keyword [tolower] is not specified, the mapping shall be
2836 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2838 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
2839 if (ctype->map_collection[0][cnt] != 0)
2840 ELEM (ctype, map_collection, [1],
2841 ctype->map_collection[0][cnt])
2842 = ctype->charnames[cnt];
2844 for (cnt = 0; cnt < 256; ++cnt)
2845 if (ctype->map256_collection[0][cnt] != 0)
2846 ctype->map_collection[1][ctype->map_collection[0][cnt]]
2847 = ctype->charnames[cnt];
2850 if (ctype->outdigits_act == 0)
2852 for (cnt = 0; cnt < 10; ++cnt)
2854 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2855 digits + cnt, 1);
2857 if (ctype->mboutdigits[cnt] == NULL)
2859 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2860 longnames[cnt],
2861 strlen (longnames[cnt]));
2863 if (ctype->mboutdigits[cnt] == NULL)
2865 /* Provide a replacement. */
2866 error (0, 0, _("\
2867 no output digits defined and none of the standard names in the charmap"));
2869 ctype->mboutdigits[cnt] = obstack_alloc (&charmap->mem_pool,
2870 sizeof (struct charseq) + 1);
2872 /* This is better than nothing. */
2873 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
2874 ctype->mboutdigits[cnt]->nbytes = 1;
2878 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2879 digits + cnt, 1);
2881 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2883 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2884 longnames[cnt],
2885 strlen (longnames[cnt]));
2887 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2889 /* Provide a replacement. */
2890 error (0, 0, _("\
2891 no output digits defined and none of the standard names in the repertoire"));
2893 /* This is better than nothing. */
2894 ctype->wcoutdigits[cnt] = (uint32_t) digits[cnt];
2899 ctype->outdigits_act = 10;
2904 static void
2905 allocate_arrays (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2906 struct repertoire_t *repertoire)
2908 size_t idx;
2910 /* First we have to decide how we organize the arrays. It is easy
2911 for a one-byte character set. But multi-byte character set
2912 cannot be stored flat because the chars might be sparsely used.
2913 So we determine an optimal hashing function for the used
2914 characters.
2916 We use a very trivial hashing function to store the sparse
2917 table. CH % TABSIZE is used as an index. To solve multiple hits
2918 we have N planes. This guarantees a fixed search time for a
2919 character [N / 2]. In the following code we determine the minimum
2920 value for TABSIZE * N, where TABSIZE >= 256. */
2921 size_t min_total = UINT_MAX;
2922 size_t act_size = 256;
2924 if (!be_quiet)
2925 fputs (_("\
2926 Computing table size for character classes might take a while..."),
2927 stderr);
2929 while (act_size < min_total)
2931 size_t cnt[act_size];
2932 size_t act_planes = 1;
2934 memset (cnt, '\0', sizeof cnt);
2936 for (idx = 0; idx < 256; ++idx)
2937 cnt[idx] = 1;
2939 for (idx = 0; idx < ctype->charnames_act; ++idx)
2940 if (ctype->charnames[idx] >= 256)
2942 size_t nr = ctype->charnames[idx] % act_size;
2944 if (++cnt[nr] > act_planes)
2946 act_planes = cnt[nr];
2947 if (act_size * act_planes >= min_total)
2948 break;
2952 if (act_size * act_planes < min_total)
2954 min_total = act_size * act_planes;
2955 ctype->plane_size = act_size;
2956 ctype->plane_cnt = act_planes;
2959 ++act_size;
2962 if (!be_quiet)
2963 fputs (_(" done\n"), stderr);
2966 ctype->names = (uint32_t *) xcalloc (ctype->plane_size
2967 * ctype->plane_cnt,
2968 sizeof (uint32_t));
2970 for (idx = 1; idx < 256; ++idx)
2971 ctype->names[idx] = idx;
2973 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
2974 ctype->names[0] = 1;
2976 for (idx = 256; idx < ctype->charnames_act; ++idx)
2978 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
2979 size_t depth = 0;
2981 while (ctype->names[nr + depth * ctype->plane_size])
2982 ++depth;
2983 assert (depth < ctype->plane_cnt);
2985 ctype->names[nr + depth * ctype->plane_size] = ctype->charnames[idx];
2987 /* Now for faster access remember the index in the NAMES_B array. */
2988 ctype->charnames[idx] = nr + depth * ctype->plane_size;
2990 ctype->names[0] = 0;
2993 /* You wonder about this amount of memory? This is only because some
2994 users do not manage to address the array with unsigned values or
2995 data types with range >= 256. '\200' would result in the array
2996 index -128. To help these poor people we duplicate the entries for
2997 128 up to 255 below the entry for \0. */
2998 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
2999 sizeof (char_class_t));
3000 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
3001 * ctype->plane_cnt,
3002 sizeof (char_class32_t));
3004 /* This is the array accessed using the multibyte string elements. */
3005 for (idx = 0; idx < 256; ++idx)
3006 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3008 /* Mirror first 127 entries. We must take care that entry -1 is not
3009 mirrored because EOF == -1. */
3010 for (idx = 0; idx < 127; ++idx)
3011 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3013 /* The 32 bit array contains all characters. */
3014 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3015 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3017 /* Room for table of mappings. */
3018 ctype->map = (uint32_t **) xmalloc (ctype->map_collection_nr
3019 * sizeof (uint32_t *));
3021 /* Fill in all mappings. */
3022 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3024 unsigned int idx2;
3026 /* Allocate table. */
3027 ctype->map[idx] = (uint32_t *) xmalloc ((ctype->plane_size
3028 * ctype->plane_cnt + 128)
3029 * sizeof (uint32_t));
3031 /* Copy default value (identity mapping). */
3032 memcpy (&ctype->map[idx][128], ctype->names,
3033 ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t));
3035 /* Copy values from collection. */
3036 for (idx2 = 0; idx2 < 256; ++idx2)
3037 ctype->map[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3039 /* Mirror first 127 entries. We must take care not to map entry
3040 -1 because EOF == -1. */
3041 for (idx2 = 0; idx2 < 127; ++idx2)
3042 ctype->map[idx][idx2] = ctype->map[idx][256 + idx2];
3044 /* EOF must map to EOF. */
3045 ctype->map[idx][127] = EOF;
3047 /* The 32 bit map collection. */
3048 for (idx2 = 0; idx2 < ctype->map_collection_act[idx]; ++idx2)
3049 if (ctype->map_collection[idx][idx2] != 0)
3050 ctype->map[idx][128 + ctype->charnames[idx2]]
3051 = ctype->map_collection[idx][idx2];
3054 /* Extra array for class and map names. */
3055 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3056 * sizeof (uint32_t));
3057 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3058 * sizeof (uint32_t));
3060 /* Array for width information. Because the expected width are very
3061 small we use only one single byte. This save space and we need
3062 not provide the information twice with both endianesses. */
3063 ctype->width = (unsigned char *) xmalloc (ctype->plane_size
3064 * ctype->plane_cnt);
3065 /* Initialize with default width value. */
3066 memset (ctype->width, charmap->width_default,
3067 ctype->plane_size * ctype->plane_cnt);
3068 if (charmap->width_rules != NULL)
3070 size_t cnt;
3072 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3074 unsigned char bytes[charmap->mb_cur_max];
3075 int nbytes = charmap->width_rules[cnt].from->nbytes;
3077 /* We have the range of character for which the width is
3078 specified described using byte sequences of the multibyte
3079 charset. We have to convert this to UCS4 now. And we
3080 cannot simply convert the beginning and the end of the
3081 sequence, we have to iterate over the byte sequence and
3082 convert it for every single character. */
3083 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3085 while (nbytes < charmap->width_rules[cnt].to->nbytes
3086 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3087 nbytes) <= 0)
3089 /* Find the UCS value for `bytes'. */
3090 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
3091 nbytes);
3092 int inner;
3094 if (wch != ILLEGAL_CHAR_VALUE)
3096 /* Store the value. */
3097 size_t nr = idx % ctype->plane_size;
3098 size_t depth = 0;
3100 while (ctype->names[nr + depth * ctype->plane_size] != nr)
3101 ++depth;
3102 assert (depth < ctype->plane_cnt);
3104 ctype->width[nr + depth * ctype->plane_size]
3105 = charmap->width_rules[cnt].width;
3108 /* "Increment" the bytes sequence. */
3109 inner = nbytes - 1;
3110 while (inner >= 0 && bytes[inner] == 0xff)
3111 --inner;
3113 if (inner < 0)
3115 /* We have to extend the byte sequence. */
3116 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3117 break;
3119 bytes[0] = 1;
3120 memset (&bytes[1], 0, nbytes);
3121 ++nbytes;
3123 else
3125 ++bytes[inner];
3126 while (++inner < nbytes)
3127 bytes[inner] = 0;
3133 /* Set MB_CUR_MAX. */
3134 ctype->mb_cur_max = charmap->mb_cur_max;
3136 /* We need the name of the currently used 8-bit character set to
3137 make correct conversion between this 8-bit representation and the
3138 ISO 10646 character set used internally for wide characters. */
3139 ctype->codeset_name = charmap->code_set_name;
3141 /* Now determine the table for the transliteration information.
3143 XXX It is not yet clear to me whether it is worth implementing a
3144 complicated algorithm which uses a hash table to locate the entries.
3145 For now I'll use a simple array which can be searching using binary
3146 search. */
3147 if (ctype->translit_copy_locale != NULL)
3149 /* Fold in the transliteration information from the locale mentioned
3150 in the `include' statement. */
3151 struct locale_ctype_t *here = ctype;
3155 struct localedef_t *other = find_locale (LC_CTYPE,
3156 here->translit_copy_locale,
3157 repertoire->name, charmap);
3159 if (other == NULL)
3161 error (0, 0, _("\
3162 %s: transliteration data from locale `%s' not available"),
3163 "LC_CTYPE", here->translit_copy_locale);
3164 break;
3167 here = other->categories[LC_CTYPE].ctype;
3169 /* Enqueue the information if necessary. */
3170 if (here->translit != NULL)
3172 struct translit_t *endp = here->translit;
3173 while (endp->next != NULL)
3174 endp = endp->next;
3176 endp->next = ctype->translit;
3177 ctype->translit = here->translit;
3180 while (here->translit_copy_locale != NULL);
3183 if (ctype->translit != NULL)
3185 /* First count how many entries we have. This is the upper limit
3186 since some entries from the included files might be overwritten. */
3187 size_t number = 0;
3188 size_t cnt;
3189 struct translit_t *runp = ctype->translit;
3190 struct translit_t **sorted;
3191 size_t from_len, to_len;
3193 while (runp != NULL)
3195 ++number;
3196 runp = runp->next;
3199 /* Next we allocate an array large enough and fill in the values. */
3200 sorted = (struct translit_t **) alloca (number
3201 * sizeof (struct translit_t **));
3202 runp = ctype->translit;
3203 number = 0;
3206 /* Search for the place where to insert this string.
3207 XXX Better use a real sorting algorithm later. */
3208 size_t idx = 0;
3209 int replace = 0;
3211 while (idx < number)
3213 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3214 (const wchar_t *) runp->from);
3215 if (res == 0)
3217 replace = 1;
3218 break;
3220 if (res > 0)
3221 break;
3222 ++idx;
3225 if (replace)
3226 sorted[idx] = runp;
3227 else
3229 memmove (&sorted[idx + 1], &sorted[idx],
3230 (number - idx) * sizeof (struct translit_t *));
3231 sorted[idx] = runp;
3232 ++number;
3235 runp = runp->next;
3237 while (runp != NULL);
3239 /* The next step is putting all the possible transliteration
3240 strings in one memory block so that we can write it out.
3241 We need several different blocks:
3242 - index to the tfromstring array
3243 - from-string array
3244 - index to the to-string array
3245 - to-string array.
3246 And this all must be available for both endianes variants.
3248 from_len = to_len = 0;
3249 for (cnt = 0; cnt < number; ++cnt)
3251 struct translit_to_t *srunp;
3252 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3253 srunp = sorted[cnt]->to;
3254 while (srunp != NULL)
3256 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3257 srunp = srunp->next;
3259 /* Plus one for the extra NUL character marking the end of
3260 the list for the current entry. */
3261 ++to_len;
3264 /* We can allocate the arrays for the results. */
3265 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3266 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3267 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3268 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3270 from_len = 0;
3271 to_len = 0;
3272 for (cnt = 0; cnt < number; ++cnt)
3274 size_t len;
3275 struct translit_to_t *srunp;
3277 ctype->translit_from_idx[cnt] = from_len;
3278 ctype->translit_to_idx[cnt] = to_len;
3280 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3281 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3282 (const wchar_t *) sorted[cnt]->from, len);
3283 from_len += len;
3285 ctype->translit_to_idx[cnt] = to_len;
3286 srunp = sorted[cnt]->to;
3287 while (srunp != NULL)
3289 len = wcslen ((const wchar_t *) srunp->str) + 1;
3290 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3291 (const wchar_t *) srunp->str, len);
3292 to_len += len;
3293 srunp = srunp->next;
3295 ctype->translit_to_tbl[to_len++] = L'\0';
3298 /* Store the information about the length. */
3299 ctype->translit_idx_size = number * sizeof (uint32_t);
3300 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3301 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3303 else
3305 /* Provide some dummy pointers since we have nothing to write out. */
3306 static uint32_t no_str = { 0 };
3308 ctype->translit_from_idx = &no_str;
3309 ctype->translit_from_tbl = &no_str;
3310 ctype->translit_to_tbl = &no_str;
3311 ctype->translit_idx_size = 0;
3312 ctype->translit_from_tbl_size = 0;
3313 ctype->translit_to_tbl_size = 0;