Update.
[glibc.git] / locale / programs / ld-ctype.c
blobd9a560783f782a37bae8b4e677d36279ab2dad86
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <alloca.h>
25 #include <byteswap.h>
26 #include <endian.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <obstack.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <wctype.h>
34 #include <sys/uio.h>
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
42 #include "localedef.h"
44 #include <assert.h>
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  The constants are unsigned: `1 << 31' would shift into the
   sign bit of an `int', which is undefined behaviour.  */
# define _ISwspecial1 (1U << 29)
# define _ISwspecial2 (1U << 30)
# define _ISwspecial3 (1U << 31)
#endif
/* Position of a class token relative to `tok_upper', and the narrow
   (`_ISbit') and wide (`_ISwbit') masks derived from that position.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Lvalue for the entry of character VALUE in the growable table
   CTYPE->COLLECTION.  IDX is an optional subscript (possibly empty) which
   is applied to the table and to its `_max' / `_act' counters.  */
#define ELEM(ctype, collection, idx, value)                                 \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,   \
             &ctype->collection##_act idx, value)


/* To be compatible with former implementations the number of bits for
   character classes is restricted to 16 for now.  Once compatibility is
   no longer necessary the number can be increased to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Types describing a transliteration action.  A rule has a (possibly
   multi-character) from-string and a list of (possibly multi-character)
   to-strings.  Everything is stored as 32-bit values because that is what
   the gconv functions use.  */

/* One replacement string; alternatives are chained through NEXT.  */
struct translit_to_t
{
  uint32_t *str;

  struct translit_to_t *next;
};

/* One transliteration rule; rules are chained through NEXT.  */
struct translit_t
{
  uint32_t *from;

  struct translit_to_t *to;

  struct translit_t *next;
};
94 /* The real definition of the struct for the LC_CTYPE locale. */
95 struct locale_ctype_t
97 uint32_t *charnames;
98 size_t charnames_max;
99 size_t charnames_act;
101 struct repertoire_t *repertoire;
103 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
104 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
105 size_t nr_charclass;
106 const char *classnames[MAX_NR_CHARCLASS];
107 uint32_t last_class_char;
108 uint32_t class256_collection[256];
109 uint32_t *class_collection;
110 size_t class_collection_max;
111 size_t class_collection_act;
112 uint32_t class_done;
114 struct charseq **mbdigits;
115 size_t mbdigits_act;
116 size_t mbdigits_max;
117 uint32_t *wcdigits;
118 size_t wcdigits_act;
119 size_t wcdigits_max;
121 struct charseq *mboutdigits[10];
122 uint32_t wcoutdigits[10];
123 size_t outdigits_act;
125 /* If the following number ever turns out to be too small simply
126 increase it. But I doubt it will. --drepper@gnu */
127 #define MAX_NR_CHARMAP 16
128 const char *mapnames[MAX_NR_CHARMAP];
129 uint32_t *map_collection[MAX_NR_CHARMAP];
130 uint32_t map256_collection[2][256];
131 size_t map_collection_max[MAX_NR_CHARMAP];
132 size_t map_collection_act[MAX_NR_CHARMAP];
133 size_t map_collection_nr;
134 size_t last_map_idx;
135 int tomap_done[MAX_NR_CHARMAP];
137 /* Transliteration information. */
138 const char *translit_copy_locale;
139 const char *translit_copy_repertoire;
140 struct translit_t *translit;
142 /* The arrays for the binary representation. */
143 uint32_t plane_size;
144 uint32_t plane_cnt;
145 char_class_t *ctype_b;
146 char_class32_t *ctype32_b;
147 uint32_t *names;
148 uint32_t **map;
149 uint32_t *class_name_ptr;
150 uint32_t *map_name_ptr;
151 unsigned char *width;
152 uint32_t mb_cur_max;
153 const char *codeset_name;
154 uint32_t translit_hash_size;
155 uint32_t translit_hash_layers;
156 uint32_t *translit_from_idx;
157 uint32_t *translit_from_tbl;
158 uint32_t *translit_to_idx;
159 uint32_t *translit_to_tbl;
160 size_t translit_idx_size;
161 size_t translit_from_tbl_size;
162 size_t translit_to_tbl_size;
164 struct obstack mem_pool;
/* Allocation hooks used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free


/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           struct charmap_t *charmap, int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, struct charmap_t *charmap);
/* IDX is declared as `uint32_t' so that the prototype agrees with the
   definition of find_idx.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Spelled-out digit names; ctype_finish tries them when a digit cannot be
   found in the charmap through its ASCII byte.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};

/* The ASCII decimal digits, indexed by value.  */
static const unsigned char digits[] = "0123456789";
198 static void
199 ctype_startup (struct linereader *lr, struct localedef_t *locale,
200 struct charmap_t *charmap, int ignore_content)
202 unsigned int cnt;
203 struct locale_ctype_t *ctype;
205 if (!ignore_content)
207 /* Allocate the needed room. */
208 locale->categories[LC_CTYPE].ctype = ctype =
209 (struct locale_ctype_t *) xcalloc (1, sizeof (struct locale_ctype_t));
211 /* We have seen no names yet. */
212 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
213 ctype->charnames =
214 (unsigned int *) xmalloc (ctype->charnames_max
215 * sizeof (unsigned int));
216 for (cnt = 0; cnt < 256; ++cnt)
217 ctype->charnames[cnt] = cnt;
218 ctype->charnames_act = 256;
220 /* Fill character class information. */
221 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
222 /* The order of the following instructions determines the bit
223 positions! */
224 ctype_class_new (lr, ctype, "upper");
225 ctype_class_new (lr, ctype, "lower");
226 ctype_class_new (lr, ctype, "alpha");
227 ctype_class_new (lr, ctype, "digit");
228 ctype_class_new (lr, ctype, "xdigit");
229 ctype_class_new (lr, ctype, "space");
230 ctype_class_new (lr, ctype, "print");
231 ctype_class_new (lr, ctype, "graph");
232 ctype_class_new (lr, ctype, "blank");
233 ctype_class_new (lr, ctype, "cntrl");
234 ctype_class_new (lr, ctype, "punct");
235 ctype_class_new (lr, ctype, "alnum");
236 #ifdef PREDEFINED_CLASSES
237 /* The following are extensions from ISO 14652. */
238 ctype_class_new (lr, ctype, "left_to_right");
239 ctype_class_new (lr, ctype, "right_to_left");
240 ctype_class_new (lr, ctype, "num_terminator");
241 ctype_class_new (lr, ctype, "num_separator");
242 ctype_class_new (lr, ctype, "segment_separator");
243 ctype_class_new (lr, ctype, "block_separator");
244 ctype_class_new (lr, ctype, "direction_control");
245 ctype_class_new (lr, ctype, "sym_swap_layout");
246 ctype_class_new (lr, ctype, "char_shape_selector");
247 ctype_class_new (lr, ctype, "num_shape_selector");
248 ctype_class_new (lr, ctype, "non_spacing");
249 ctype_class_new (lr, ctype, "non_spacing_level3");
250 ctype_class_new (lr, ctype, "normal_connect");
251 ctype_class_new (lr, ctype, "r_connect");
252 ctype_class_new (lr, ctype, "no_connect");
253 ctype_class_new (lr, ctype, "no_connect-space");
254 ctype_class_new (lr, ctype, "vowel_connect");
255 #endif
257 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
258 ctype->class_collection
259 = (uint32_t *) xcalloc (sizeof (unsigned long int),
260 ctype->class_collection_max);
261 ctype->class_collection_act = 256;
263 /* Fill character map information. */
264 ctype->map_collection_nr = 0;
265 ctype->last_map_idx = MAX_NR_CHARMAP;
266 ctype_map_new (lr, ctype, "toupper", charmap);
267 ctype_map_new (lr, ctype, "tolower", charmap);
268 #ifdef PREDEFINED_CLASSES
269 ctype_map_new (lr, ctype, "tosymmetric", charmap);
270 #endif
272 /* Fill first 256 entries in `toXXX' arrays. */
273 for (cnt = 0; cnt < 256; ++cnt)
275 ctype->map_collection[0][cnt] = cnt;
276 ctype->map_collection[1][cnt] = cnt;
277 #ifdef PREDEFINED_CLASSES
278 ctype->map_collection[2][cnt] = cnt;
279 #endif
280 ctype->map256_collection[0][cnt] = cnt;
281 ctype->map256_collection[1][cnt] = cnt;
284 obstack_init (&ctype->mem_pool);
289 void
290 ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
292 /* See POSIX.2, table 2-6 for the meaning of the following table. */
293 #define NCLASS 12
294 static const struct
296 const char *name;
297 const char allow[NCLASS];
299 valid_table[NCLASS] =
301 /* The order is important. See token.h for more information.
302 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
303 { "upper", "--MX-XDDXXX-" },
304 { "lower", "--MX-XDDXXX-" },
305 { "alpha", "---X-XDDXXX-" },
306 { "digit", "XXX--XDDXXX-" },
307 { "xdigit", "-----XDDXXX-" },
308 { "space", "XXXXX------X" },
309 { "print", "---------X--" },
310 { "graph", "---------X--" },
311 { "blank", "XXXXXM-----X" },
312 { "cntrl", "XXXXX-XX--XX" },
313 { "punct", "XXXXX-DD-X-X" },
314 { "alnum", "-----XDDXXX-" }
316 size_t cnt;
317 int cls1, cls2;
318 uint32_t space_value;
319 struct charseq *space_seq;
320 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
321 int warned;
323 /* Now resolve copying and also handle completely missing definitions. */
324 if (ctype == NULL)
326 /* First see whether we were supposed to copy. If yes, find the
327 actual definition. */
328 if (locale->copy_name[LC_CTYPE] != NULL)
330 /* Find the copying locale. This has to happen transitively since
331 the locale we are copying from might also copying another one. */
332 struct localedef_t *from = locale;
335 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
336 from->repertoire_name, charmap);
337 while (from->categories[LC_CTYPE].ctype == NULL
338 && from->copy_name[LC_CTYPE] != NULL);
340 ctype = locale->categories[LC_CTYPE].ctype
341 = from->categories[LC_CTYPE].ctype;
344 /* If there is still no definition issue an warning and create an
345 empty one. */
346 if (ctype == NULL)
348 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
349 ctype_startup (NULL, locale, charmap, 0);
350 ctype = locale->categories[LC_CTYPE].ctype;
354 /* Set default value for classes not specified. */
355 set_class_defaults (ctype, charmap, ctype->repertoire);
357 /* Check according to table. */
358 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
360 uint32_t tmp = ctype->class_collection[cnt];
362 if (tmp != 0)
364 for (cls1 = 0; cls1 < NCLASS; ++cls1)
365 if ((tmp & _ISwbit (cls1)) != 0)
366 for (cls2 = 0; cls2 < NCLASS; ++cls2)
367 if (valid_table[cls1].allow[cls2] != '-')
369 int eq = (tmp & _ISwbit (cls2)) != 0;
370 switch (valid_table[cls1].allow[cls2])
372 case 'M':
373 if (!eq)
375 uint32_t value = ctype->charnames[cnt];
377 if (!be_quiet)
378 error (0, 0, _("\
379 character L'\\u%0*x' in class `%s' must be in class `%s'"),
380 value > 0xffff ? 8 : 4, value,
381 valid_table[cls1].name,
382 valid_table[cls2].name);
384 break;
386 case 'X':
387 if (eq)
389 uint32_t value = ctype->charnames[cnt];
391 if (!be_quiet)
392 error (0, 0, _("\
393 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
394 value > 0xffff ? 8 : 4, value,
395 valid_table[cls1].name,
396 valid_table[cls2].name);
398 break;
400 case 'D':
401 ctype->class_collection[cnt] |= _ISwbit (cls2);
402 break;
404 default:
405 error (5, 0, _("internal error in %s, line %u"),
406 __FUNCTION__, __LINE__);
412 for (cnt = 0; cnt < 256; ++cnt)
414 uint32_t tmp = ctype->class256_collection[cnt];
416 if (tmp != 0)
418 for (cls1 = 0; cls1 < NCLASS; ++cls1)
419 if ((tmp & _ISbit (cls1)) != 0)
420 for (cls2 = 0; cls2 < NCLASS; ++cls2)
421 if (valid_table[cls1].allow[cls2] != '-')
423 int eq = (tmp & _ISbit (cls2)) != 0;
424 switch (valid_table[cls1].allow[cls2])
426 case 'M':
427 if (!eq)
429 char buf[17];
431 sprintf (buf, "\\%o", cnt);
433 if (!be_quiet)
434 error (0, 0, _("\
435 character '%s' in class `%s' must be in class `%s'"),
436 buf, valid_table[cls1].name,
437 valid_table[cls2].name);
439 break;
441 case 'X':
442 if (eq)
444 char buf[17];
446 sprintf (buf, "\\%o", cnt);
448 if (!be_quiet)
449 error (0, 0, _("\
450 character '%s' in class `%s' must not be in class `%s'"),
451 buf, valid_table[cls1].name,
452 valid_table[cls2].name);
454 break;
456 case 'D':
457 ctype->class256_collection[cnt] |= _ISbit (cls2);
458 break;
460 default:
461 error (5, 0, _("internal error in %s, line %u"),
462 __FUNCTION__, __LINE__);
468 /* ... and now test <SP> as a special case. */
469 space_value = repertoire_find_value (ctype->repertoire, "SP", 2);
470 if (space_value == ILLEGAL_CHAR_VALUE)
472 if (!be_quiet)
473 error (0, 0, _("character <SP> not defined in character map"));
475 else if (((cnt = BITPOS (tok_space),
476 (ELEM (ctype, class_collection, , space_value)
477 & BITw (tok_space)) == 0)
478 || (cnt = BITPOS (tok_blank),
479 (ELEM (ctype, class_collection, , space_value)
480 & BITw (tok_blank)) == 0)))
482 if (!be_quiet)
483 error (0, 0, _("<SP> character not in class `%s'"),
484 valid_table[cnt].name);
486 else if (((cnt = BITPOS (tok_punct),
487 (ELEM (ctype, class_collection, , space_value)
488 & BITw (tok_punct)) != 0)
489 || (cnt = BITPOS (tok_graph),
490 (ELEM (ctype, class_collection, , space_value)
491 & BITw (tok_graph))
492 != 0)))
494 if (!be_quiet)
495 error (0, 0, _("<SP> character must not be in class `%s'"),
496 valid_table[cnt].name);
498 else
499 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
501 space_seq = charmap_find_value (charmap, "SP", 2);
502 if (space_seq == NULL || space_seq->nbytes != 1)
504 if (!be_quiet)
505 error (0, 0, _("character <SP> not defined in character map"));
507 else if (((cnt = BITPOS (tok_space),
508 (ctype->class256_collection[space_seq->bytes[0]]
509 & BIT (tok_space)) == 0)
510 || (cnt = BITPOS (tok_blank),
511 (ctype->class256_collection[space_seq->bytes[0]]
512 & BIT (tok_blank)) == 0)))
514 if (!be_quiet)
515 error (0, 0, _("<SP> character not in class `%s'"),
516 valid_table[cnt].name);
518 else if (((cnt = BITPOS (tok_punct),
519 (ctype->class256_collection[space_seq->bytes[0]]
520 & BIT (tok_punct)) != 0)
521 || (cnt = BITPOS (tok_graph),
522 (ctype->class256_collection[space_seq->bytes[0]]
523 & BIT (tok_graph)) != 0)))
525 if (!be_quiet)
526 error (0, 0, _("<SP> character must not be in class `%s'"),
527 valid_table[cnt].name);
529 else
530 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
532 /* Now that the tests are done make sure the name array contains all
533 characters which are handled in the WIDTH section of the
534 character set definition file. */
535 if (charmap->width_rules != NULL)
536 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
538 unsigned char bytes[charmap->mb_cur_max];
539 int nbytes = charmap->width_rules[cnt].from->nbytes;
541 /* We have the range of character for which the width is
542 specified described using byte sequences of the multibyte
543 charset. We have to convert this to UCS4 now. And we
544 cannot simply convert the beginning and the end of the
545 sequence, we have to iterate over the byte sequence and
546 convert it for every single character. */
547 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
549 while (nbytes < charmap->width_rules[cnt].to->nbytes
550 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
551 nbytes) <= 0)
553 /* Find the UCS value for `bytes'. */
554 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
555 nbytes);
556 int inner;
558 if (wch != ILLEGAL_CHAR_VALUE)
559 /* We are only interested in the side-effects of the
560 `find_idx' call. It will add appropriate entries in
561 the name array if this is necessary. */
562 (void) find_idx (ctype, NULL, NULL, NULL, wch);
564 /* "Increment" the bytes sequence. */
565 inner = nbytes - 1;
566 while (inner >= 0 && bytes[inner] == 0xff)
567 --inner;
569 if (inner < 0)
571 /* We have to extend the byte sequence. */
572 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
573 break;
575 bytes[0] = 1;
576 memset (&bytes[1], 0, nbytes);
577 ++nbytes;
579 else
581 ++bytes[inner];
582 while (++inner < nbytes)
583 bytes[inner] = 0;
588 /* There must be a multiple of 10 digits. */
589 if (ctype->mbdigits_act % 10 != 0)
591 assert (ctype->mbdigits_act == ctype->wcdigits_act);
592 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
593 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
594 error (0, 0, _("`digit' category has not entries in groups of ten"));
597 /* Check the input digits. There must be a multiple of ten available.
598 In each group it could be that one or the other character is missing.
599 In this case the whole group must be removed. */
600 cnt = 0;
601 while (cnt < ctype->mbdigits_act)
603 size_t inner;
604 for (inner = 0; inner < 10; ++inner)
605 if (ctype->mbdigits[cnt + inner] == NULL)
606 break;
608 if (inner == 10)
609 cnt += 10;
610 else
612 /* Remove the group. */
613 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
614 ((ctype->wcdigits_act - cnt - 10)
615 * sizeof (ctype->mbdigits[0])));
616 ctype->mbdigits_act -= 10;
620 /* If no input digits are given use the default. */
621 if (ctype->mbdigits_act == 0)
623 if (ctype->mbdigits_max == 0)
625 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
626 10 * sizeof (struct charseq *));
627 ctype->mbdigits_max = 10;
630 for (cnt = 0; cnt < 10; ++cnt)
632 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
633 digits + cnt, 1);
634 if (ctype->mbdigits[cnt] == NULL)
636 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
637 longnames[cnt],
638 strlen (longnames[cnt]));
639 if (ctype->mbdigits[cnt] == NULL)
641 /* Hum, this ain't good. */
642 error (0, 0, _("\
643 no input digits defined and none of the standard names in the charmap"));
645 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
646 sizeof (struct charseq) + 1);
648 /* This is better than nothing. */
649 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
650 ctype->mbdigits[cnt]->nbytes = 1;
655 ctype->mbdigits_act = 10;
658 /* Check the wide character input digits. There must be a multiple
659 of ten available. In each group it could be that one or the other
660 character is missing. In this case the whole group must be
661 removed. */
662 cnt = 0;
663 while (cnt < ctype->wcdigits_act)
665 size_t inner;
666 for (inner = 0; inner < 10; ++inner)
667 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
668 break;
670 if (inner == 10)
671 cnt += 10;
672 else
674 /* Remove the group. */
675 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
676 ((ctype->wcdigits_act - cnt - 10)
677 * sizeof (ctype->wcdigits[0])));
678 ctype->wcdigits_act -= 10;
682 /* If no input digits are given use the default. */
683 if (ctype->wcdigits_act == 0)
685 if (ctype->wcdigits_max == 0)
687 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
688 10 * sizeof (uint32_t));
689 ctype->wcdigits_max = 10;
692 for (cnt = 0; cnt < 10; ++cnt)
693 ctype->wcdigits[cnt] = L'0' + cnt;
695 ctype->mbdigits_act = 10;
698 /* Check the outdigits. */
699 warned = 0;
700 for (cnt = 0; cnt < 10; ++cnt)
701 if (ctype->mboutdigits[cnt] == NULL)
703 static struct charseq replace[2];
705 if (!warned)
707 error (0, 0, _("\
708 not all characters used in `outdigit' are available in the charmap"));
709 warned = 1;
712 replace[0].nbytes = 1;
713 replace[0].bytes[0] = '?';
714 replace[0].bytes[1] = '\0';
715 ctype->mboutdigits[cnt] = &replace[0];
718 warned = 0;
719 for (cnt = 0; cnt < 10; ++cnt)
720 if (ctype->wcoutdigits[cnt] == 0)
722 if (!warned)
724 error (0, 0, _("\
725 not all characters used in `outdigit' are available in the repertoire"));
726 warned = 1;
729 ctype->wcoutdigits[cnt] = L'?';
734 void
735 ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
736 const char *output_path)
738 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
739 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
740 + (ctype->map_collection_nr - 2));
741 struct iovec iov[2 + nelems + ctype->nr_charclass
742 + ctype->map_collection_nr];
743 struct locale_file data;
744 uint32_t idx[nelems + 1];
745 size_t elem, cnt, offset, total;
746 char *cp;
748 /* Now prepare the output: Find the sizes of the table we can use. */
749 allocate_arrays (ctype, charmap, ctype->repertoire);
751 data.magic = LIMAGIC (LC_CTYPE);
752 data.n = nelems;
753 iov[0].iov_base = (void *) &data;
754 iov[0].iov_len = sizeof (data);
756 iov[1].iov_base = (void *) idx;
757 iov[1].iov_len = sizeof (idx);
759 idx[0] = iov[0].iov_len + iov[1].iov_len;
760 offset = 0;
762 for (elem = 0; elem < nelems; ++elem)
764 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
765 switch (elem)
767 #define CTYPE_DATA(name, base, len) \
768 case _NL_ITEM_INDEX (name): \
769 iov[2 + elem + offset].iov_base = (base); \
770 iov[2 + elem + offset].iov_len = (len); \
771 if (elem + 1 < nelems) \
772 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
773 break
775 CTYPE_DATA (_NL_CTYPE_CLASS,
776 ctype->ctype_b,
777 (256 + 128) * sizeof (char_class_t));
779 CTYPE_DATA (_NL_CTYPE_TOUPPER,
780 ctype->map[0],
781 (ctype->plane_size * ctype->plane_cnt + 128)
782 * sizeof (uint32_t));
783 CTYPE_DATA (_NL_CTYPE_TOLOWER,
784 ctype->map[1],
785 (ctype->plane_size * ctype->plane_cnt + 128)
786 * sizeof (uint32_t));
788 CTYPE_DATA (_NL_CTYPE_CLASS32,
789 ctype->ctype32_b,
790 (ctype->plane_size * ctype->plane_cnt
791 * sizeof (char_class32_t)));
793 CTYPE_DATA (_NL_CTYPE_NAMES,
794 ctype->names, (ctype->plane_size * ctype->plane_cnt
795 * sizeof (uint32_t)));
797 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE,
798 &ctype->translit_hash_size, sizeof (uint32_t));
799 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS,
800 &ctype->translit_hash_layers, sizeof (uint32_t));
802 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
803 ctype->translit_from_idx,
804 ctype->translit_idx_size);
806 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
807 ctype->translit_from_tbl,
808 ctype->translit_from_tbl_size);
810 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
811 ctype->translit_to_idx,
812 ctype->translit_idx_size);
814 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
815 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
817 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
818 &ctype->plane_size, sizeof (uint32_t));
819 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
820 &ctype->plane_cnt, sizeof (uint32_t));
822 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
823 /* The class name array. */
824 total = 0;
825 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
827 iov[2 + elem + offset].iov_base
828 = (void *) ctype->classnames[cnt];
829 iov[2 + elem + offset].iov_len
830 = strlen (ctype->classnames[cnt]) + 1;
831 total += iov[2 + elem + offset].iov_len;
833 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
834 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
835 total += 1 + (4 - ((total + 1) % 4));
837 idx[elem + 1] = idx[elem] + total;
838 break;
840 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
841 /* The class name array. */
842 total = 0;
843 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
845 iov[2 + elem + offset].iov_base
846 = (void *) ctype->mapnames[cnt];
847 iov[2 + elem + offset].iov_len
848 = strlen (ctype->mapnames[cnt]) + 1;
849 total += iov[2 + elem + offset].iov_len;
851 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
852 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
853 total += 1 + (4 - ((total + 1) % 4));
855 idx[elem + 1] = idx[elem] + total;
856 break;
858 CTYPE_DATA (_NL_CTYPE_WIDTH,
859 ctype->width, ctype->plane_size * ctype->plane_cnt);
861 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
862 &ctype->mb_cur_max, sizeof (uint32_t));
864 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
865 total = strlen (ctype->codeset_name) + 1;
866 if (total % 4 == 0)
867 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
868 else
870 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
871 memset (mempcpy (iov[2 + elem + offset].iov_base,
872 ctype->codeset_name, total),
873 '\0', 4 - (total & 3));
874 total = (total + 3) & ~3;
876 iov[2 + elem + offset].iov_len = total;
877 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
878 break;
880 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
881 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
882 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
883 *(uint32_t *) iov[2 + elem + offset].iov_base =
884 ctype->mbdigits_act / 10;
885 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
886 break;
888 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
889 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
890 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
891 *(uint32_t *) iov[2 + elem + offset].iov_base =
892 ctype->wcdigits_act / 10;
893 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
894 break;
896 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
897 /* Compute the length of all possible characters. For INDIGITS
898 there might be more than one. We simply concatenate all of
899 them with a NUL byte following. The NUL byte wouldn't be
900 necessary but it makes it easier for the user. */
901 total = 0;
902 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
903 cnt < ctype->mbdigits_act; cnt += 10)
904 total += ctype->mbdigits[cnt]->nbytes + 1;
905 iov[2 + elem + offset].iov_base = (char *) alloca (total);
906 iov[2 + elem + offset].iov_len = total;
908 cp = iov[2 + elem + offset].iov_base;
909 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
910 cnt < ctype->mbdigits_act; cnt += 10)
912 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
913 ctype->mbdigits[cnt]->nbytes);
914 *cp++ = '\0';
916 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
917 break;
919 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
920 /* Compute the length of all possible characters. For INDIGITS
921 there might be more than one. We simply concatenate all of
922 them with a NUL byte following. The NUL byte wouldn't be
923 necessary but it makes it easier for the user. */
924 cnt = elem - _NL_CTYPE_OUTDIGIT0_MB;
925 total = ctype->mboutdigits[cnt]->nbytes + 1;
926 iov[2 + elem + offset].iov_base = (char *) alloca (total);
927 iov[2 + elem + offset].iov_len = total;
929 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
930 ctype->mbdigits[cnt]->bytes,
931 ctype->mbdigits[cnt]->nbytes) = '\0';
932 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
933 break;
935 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
936 total = ctype->wcdigits_act / 10;
938 iov[2 + elem + offset].iov_base =
939 (uint32_t *) alloca (total * sizeof (uint32_t));
940 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
942 for (cnt = elem - _NL_CTYPE_INDIGITS0_WC;
943 cnt < ctype->wcdigits_act; cnt += 10)
944 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
945 = ctype->wcdigits[cnt];
946 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
947 break;
949 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
950 cnt = elem - _NL_CTYPE_OUTDIGIT0_WC;
951 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
952 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
953 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
954 break;
956 default:
957 assert (! "unknown CTYPE element");
959 else
961 /* Handle extra maps. */
962 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) + 2;
964 iov[2 + elem + offset].iov_base = ctype->map[nr];
965 iov[2 + elem + offset].iov_len = ((ctype->plane_size
966 * ctype->plane_cnt + 128)
967 * sizeof (uint32_t));
969 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
973 assert (2 + elem + offset == (nelems + ctype->nr_charclass
974 + ctype->map_collection_nr + 2));
976 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
980 /* Local functions. */
981 static void
982 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
983 const char *name)
985 size_t cnt;
987 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
988 if (strcmp (ctype->classnames[cnt], name) == 0)
989 break;
991 if (cnt < ctype->nr_charclass)
993 lr_error (lr, _("character class `%s' already defined"), name);
994 return;
997 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
998 /* Exit code 2 is prescribed in P1003.2b. */
999 error (2, 0, _("\
1000 implementation limit: no more than %d character classes allowed"),
1001 MAX_NR_CHARCLASS);
1003 ctype->classnames[ctype->nr_charclass++] = name;
1007 static void
1008 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1009 const char *name, struct charmap_t *charmap)
1011 size_t max_chars = 0;
1012 size_t cnt;
1014 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1016 if (strcmp (ctype->mapnames[cnt], name) == 0)
1017 break;
1019 if (max_chars < ctype->map_collection_max[cnt])
1020 max_chars = ctype->map_collection_max[cnt];
1023 if (cnt < ctype->map_collection_nr)
1025 lr_error (lr, _("character map `%s' already defined"), name);
1026 return;
1029 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1030 /* Exit code 2 is prescribed in P1003.2b. */
1031 error (2, 0, _("\
1032 implementation limit: no more than %d character maps allowed"),
1033 MAX_NR_CHARMAP);
1035 ctype->mapnames[cnt] = name;
1037 if (max_chars == 0)
1038 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1039 else
1040 ctype->map_collection_max[cnt] = max_chars;
1042 ctype->map_collection[cnt] = (uint32_t *)
1043 xmalloc (sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1044 memset (ctype->map_collection[cnt], '\0',
1045 sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1046 ctype->map_collection_act[cnt] = 256;
1048 ++ctype->map_collection_nr;
1052 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1053 is possible if we only want to extend the name array. */
1054 static uint32_t *
1055 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1056 size_t *act, uint32_t idx)
1058 size_t cnt;
1060 if (idx < 256)
1061 return table == NULL ? NULL : &(*table)[idx];
1063 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1064 if (ctype->charnames[cnt] == idx)
1065 break;
1067 /* We have to distinguish two cases: the name is found or not. */
1068 if (cnt == ctype->charnames_act)
1070 /* Extend the name array. */
1071 if (ctype->charnames_act == ctype->charnames_max)
1073 ctype->charnames_max *= 2;
1074 ctype->charnames = (unsigned int *)
1075 xrealloc (ctype->charnames,
1076 sizeof (unsigned int) * ctype->charnames_max);
1078 ctype->charnames[ctype->charnames_act++] = idx;
1081 if (table == NULL)
1082 /* We have done everything we are asked to do. */
1083 return NULL;
1085 if (cnt >= *act)
1087 if (cnt >= *max)
1089 size_t old_max = *max;
1091 *max *= 2;
1092 while (*max <= cnt);
1094 *table =
1095 (uint32_t *) xrealloc (*table, *max * sizeof (unsigned long int));
1096 memset (&(*table)[old_max], '\0',
1097 (*max - old_max) * sizeof (uint32_t));
1100 *act = cnt;
1103 return &(*table)[cnt];
1107 static int
1108 get_character (struct token *now, struct charmap_t *charmap,
1109 struct repertoire_t *repertoire,
1110 struct charseq **seqp, uint32_t *wchp)
1112 if (now->tok == tok_bsymbol)
1114 /* This will hopefully be the normal case. */
1115 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1116 now->val.str.lenmb);
1117 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1118 now->val.str.lenmb);
1120 else if (now->tok == tok_ucs4)
1122 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1124 if (*seqp == NULL)
1126 /* Compute the value in the charmap from the UCS value. */
1127 const char *symbol = repertoire_find_symbol (repertoire,
1128 now->val.ucs4);
1130 if (symbol == NULL)
1131 *seqp = NULL;
1132 else
1133 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1135 if (*seqp == NULL)
1137 /* Insert a negative entry. */
1138 static const struct charseq negative
1139 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1140 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1141 *newp = now->val.ucs4;
1143 insert_entry (&repertoire->seq_table, newp, 4,
1144 (void *) &negative);
1146 else
1147 (*seqp)->ucs4 = now->val.ucs4;
1149 else if ((*seqp)->ucs4 != now->val.ucs4)
1150 *seqp = NULL;
1152 *wchp = now->val.ucs4;
1154 else if (now->tok == tok_charcode)
1156 /* We must map from the byte code to UCS4. */
1157 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1158 now->val.str.lenmb);
1160 if (*seqp == NULL)
1161 *wchp = ILLEGAL_CHAR_VALUE;
1162 else
1164 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1165 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1166 strlen ((*seqp)->name));
1167 *wchp = (*seqp)->ucs4;
1170 else
1171 return 1;
1173 return 0;
1177 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1178 static void
1179 charclass_symbolic_ellipsis (struct linereader *ldfile,
1180 struct locale_ctype_t *ctype,
1181 struct charmap_t *charmap,
1182 struct repertoire_t *repertoire,
1183 struct token *now,
1184 const char *last_str,
1185 unsigned long int class256_bit,
1186 unsigned long int class_bit, int base,
1187 int ignore_content, int handle_digits)
1189 const char *nowstr = now->val.str.startmb;
1190 char tmp[now->val.str.lenmb + 1];
1191 const char *cp;
1192 char *endp;
1193 unsigned long int from;
1194 unsigned long int to;
1196 /* We have to compute the ellipsis values using the symbolic names. */
1197 assert (last_str != NULL);
1199 if (strlen (last_str) != now->val.str.lenmb)
1201 invalid_range:
1202 lr_error (ldfile,
1203 _("`%s' and `%.*s' are no valid names for symbolic range"),
1204 last_str, now->val.str.lenmb, nowstr);
1205 return;
1208 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1209 /* Nothing to do, the names are the same. */
1210 return;
1212 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1215 errno = 0;
1216 from = strtoul (cp, &endp, base);
1217 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1218 goto invalid_range;
1220 to = strtoul (nowstr + (cp - last_str), &endp, base);
1221 if ((to == UINT_MAX && errno == ERANGE)
1222 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1223 goto invalid_range;
1225 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1226 if (!ignore_content)
1228 now->val.str.startmb = tmp;
1229 while (++from <= to)
1231 struct charseq *seq;
1232 uint32_t wch;
1234 sprintf (tmp, (base == 10 ? "%.*s%0*d" : "%.*s%0*X"), cp - last_str,
1235 last_str, now->val.str.lenmb - (cp - last_str), from);
1237 get_character (now, charmap, repertoire, &seq, &wch);
1239 if (seq != NULL && seq->nbytes == 1)
1240 /* Yep, we can store information about this byte sequence. */
1241 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1243 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1244 /* We have the UCS4 position. */
1245 *find_idx (ctype, &ctype->class_collection,
1246 &ctype->class_collection_max,
1247 &ctype->class_collection_act, wch) |= class_bit;
1249 if (handle_digits == 1)
1251 /* We must store the digit values. */
1252 if (ctype->mbdigits_act == ctype->mbdigits_max)
1254 ctype->mbdigits_max *= 2;
1255 ctype->mbdigits = xrealloc (ctype->mbdigits,
1256 (ctype->mbdigits_max
1257 * sizeof (char *)));
1258 ctype->wcdigits_max *= 2;
1259 ctype->wcdigits = xrealloc (ctype->wcdigits,
1260 (ctype->wcdigits_max
1261 * sizeof (uint32_t)));
1264 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1265 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1267 else if (handle_digits == 2)
1269 /* We must store the digit values. */
1270 if (ctype->outdigits_act >= 10)
1272 lr_error (ldfile, _("\
1273 %s: field `%s' does not contain exactly ten entries"),
1274 "LC_CTYPE", "outdigit");
1275 return;
1278 ctype->mboutdigits[ctype->outdigits_act] = seq;
1279 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1280 ++ctype->outdigits_act;
1287 /* Ellipsis like in `<U1234>..<U2345>'. */
1288 static void
1289 charclass_ucs4_ellipsis (struct linereader *ldfile,
1290 struct locale_ctype_t *ctype,
1291 struct charmap_t *charmap,
1292 struct repertoire_t *repertoire,
1293 struct token *now, uint32_t last_wch,
1294 unsigned long int class256_bit,
1295 unsigned long int class_bit, int ignore_content,
1296 int handle_digits)
1298 if (last_wch > now->val.ucs4)
1300 lr_error (ldfile, _("\
1301 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1302 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1303 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1304 return;
1307 if (!ignore_content)
1308 while (++last_wch <= now->val.ucs4)
1310 /* We have to find out whether there is a byte sequence corresponding
1311 to this UCS4 value. */
1312 struct charseq *seq = repertoire_find_seq (repertoire, last_wch);
1314 /* If this is the first time we look for this sequence create a new
1315 entry. */
1316 if (seq == NULL)
1318 /* Find the symbolic name for this UCS4 value. */
1319 const char *symbol = repertoire_find_symbol (repertoire, last_wch);
1320 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1321 *newp = last_wch;
1323 if (symbol != NULL)
1324 /* We have a name, now search the multibyte value. */
1325 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1327 if (seq == NULL)
1329 /* We have to create a fake entry. */
1330 static const struct charseq negative
1331 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1332 seq = (struct charseq *) &negative;
1334 else
1335 seq->ucs4 = last_wch;
1337 insert_entry (&repertoire->seq_table, newp, 4, seq);
1340 /* We have a name, now search the multibyte value. */
1341 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1342 /* Yep, we can store information about this byte sequence. */
1343 ctype->class256_collection[(size_t) seq->bytes[0]]
1344 |= class256_bit;
1346 /* And of course we have the UCS4 position. */
1347 if (class_bit != 0 && class_bit != 0)
1348 *find_idx (ctype, &ctype->class_collection,
1349 &ctype->class_collection_max,
1350 &ctype->class_collection_act, last_wch) |= class_bit;
1352 if (handle_digits == 1)
1354 /* We must store the digit values. */
1355 if (ctype->mbdigits_act == ctype->mbdigits_max)
1357 ctype->mbdigits_max *= 2;
1358 ctype->mbdigits = xrealloc (ctype->mbdigits,
1359 (ctype->mbdigits_max
1360 * sizeof (char *)));
1361 ctype->wcdigits_max *= 2;
1362 ctype->wcdigits = xrealloc (ctype->wcdigits,
1363 (ctype->wcdigits_max
1364 * sizeof (uint32_t)));
1367 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1368 ? seq : NULL);
1369 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1371 else if (handle_digits == 2)
1373 /* We must store the digit values. */
1374 if (ctype->outdigits_act >= 10)
1376 lr_error (ldfile, _("\
1377 %s: field `%s' does not contain exactly ten entries"),
1378 "LC_CTYPE", "outdigit");
1379 return;
1382 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1383 ? seq : NULL);
1384 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1385 ++ctype->outdigits_act;
1391 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1392 static void
1393 charclass_charcode_ellipsis (struct linereader *ldfile,
1394 struct locale_ctype_t *ctype,
1395 struct charmap_t *charmap,
1396 struct repertoire_t *repertoire,
1397 struct token *now, char *last_charcode,
1398 uint32_t last_charcode_len,
1399 unsigned long int class256_bit,
1400 unsigned long int class_bit, int ignore_content,
1401 int handle_digits)
1403 /* First check whether the to-value is larger. */
1404 if (now->val.charcode.nbytes != last_charcode_len)
1406 lr_error (ldfile, _("\
1407 start end end character sequence of range must have the same length"));
1408 return;
1411 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1413 lr_error (ldfile, _("\
1414 to-value character sequence is smaller than from-value sequence"));
1415 return;
1418 if (!ignore_content)
1422 /* Increment the byte sequence value. */
1423 struct charseq *seq;
1424 uint32_t wch;
1425 int i;
1427 for (i = last_charcode_len - 1; i >= 0; --i)
1428 if (++last_charcode[i] != 0)
1429 break;
1431 if (last_charcode_len == 1)
1432 /* Of course we have the charcode value. */
1433 ctype->class256_collection[(size_t) last_charcode[0]]
1434 |= class256_bit;
1436 /* Find the symbolic name. */
1437 seq = charmap_find_symbol (charmap, last_charcode,
1438 last_charcode_len);
1439 if (seq != NULL)
1441 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1442 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1443 strlen (seq->name));
1444 wch = seq->ucs4;
1446 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1447 *find_idx (ctype, &ctype->class_collection,
1448 &ctype->class_collection_max,
1449 &ctype->class_collection_act, wch) |= class_bit;
1451 else
1452 wch = ILLEGAL_CHAR_VALUE;
1454 if (handle_digits == 1)
1456 /* We must store the digit values. */
1457 if (ctype->mbdigits_act == ctype->mbdigits_max)
1459 ctype->mbdigits_max *= 2;
1460 ctype->mbdigits = xrealloc (ctype->mbdigits,
1461 (ctype->mbdigits_max
1462 * sizeof (char *)));
1463 ctype->wcdigits_max *= 2;
1464 ctype->wcdigits = xrealloc (ctype->wcdigits,
1465 (ctype->wcdigits_max
1466 * sizeof (uint32_t)));
1469 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1470 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1471 seq->nbytes = last_charcode_len;
1473 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1474 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1476 else if (handle_digits == 2)
1478 struct charseq *seq;
1479 /* We must store the digit values. */
1480 if (ctype->outdigits_act >= 10)
1482 lr_error (ldfile, _("\
1483 %s: field `%s' does not contain exactly ten entries"),
1484 "LC_CTYPE", "outdigit");
1485 return;
1488 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1489 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1490 seq->nbytes = last_charcode_len;
1492 ctype->mboutdigits[ctype->outdigits_act] = seq;
1493 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1494 ++ctype->outdigits_act;
1497 while (memcmp (last_charcode, now->val.charcode.bytes,
1498 last_charcode_len) != 0);
1503 /* Read one transliteration entry. */
1504 static uint32_t *
1505 read_widestring (struct linereader *ldfile, struct token *now,
1506 struct charmap_t *charmap, struct repertoire_t *repertoire)
1508 uint32_t *wstr;
1510 if (now->tok == tok_default_missing)
1511 /* The special name "" will denote this case. */
1512 wstr = (uint32_t *) L"";
1513 else if (now->tok == tok_bsymbol)
1515 /* Get the value from the repertoire. */
1516 wstr = xmalloc (2 * sizeof (uint32_t));
1517 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1518 now->val.str.lenmb);
1519 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1520 /* We cannot proceed, we don't know the UCS4 value. */
1521 return NULL;
1523 wstr[1] = 0;
1525 else if (now->tok == tok_ucs4)
1527 wstr = xmalloc (2 * sizeof (uint32_t));
1528 wstr[0] = now->val.ucs4;
1529 wstr[1] = 0;
1531 else if (now->tok == tok_charcode)
1533 /* Argh, we have to convert to the symbol name first and then to the
1534 UCS4 value. */
1535 struct charseq *seq = charmap_find_symbol (charmap,
1536 now->val.str.startmb,
1537 now->val.str.lenmb);
1538 if (seq == NULL)
1539 /* Cannot find the UCS4 value. */
1540 return NULL;
1542 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1543 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1544 strlen (seq->name));
1545 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1546 /* We cannot proceed, we don't know the UCS4 value. */
1547 return NULL;
1549 wstr = xmalloc (2 * sizeof (uint32_t));
1550 wstr[0] = seq->ucs4;
1551 wstr[1] = 0;
1553 else if (now->tok == tok_string)
1555 wstr = now->val.str.startwc;
1556 if (wstr[0] == 0)
1557 return NULL;
1559 else
1561 if (now->tok != tok_eol && now->tok != tok_eof)
1562 lr_ignore_rest (ldfile, 0);
1563 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1564 return (uint32_t *) -1l;
1567 return wstr;
1571 static void
1572 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1573 struct token *now, struct charmap_t *charmap,
1574 struct repertoire_t *repertoire)
1576 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1577 struct translit_t *result;
1578 struct translit_to_t **top;
1579 struct obstack *ob = &ctype->mem_pool;
1580 int first;
1581 int ignore;
1583 if (from_wstr == NULL)
1584 /* There is no valid from string. */
1585 return;
1587 result = (struct translit_t *) obstack_alloc (ob,
1588 sizeof (struct translit_t));
1589 result->from = from_wstr;
1590 result->next = NULL;
1591 result->to = NULL;
1592 top = &result->to;
1593 first = 1;
1594 ignore = 0;
1596 while (1)
1598 uint32_t *to_wstr;
1600 /* Next we have one or more transliterations. They are
1601 separated by semicolons. */
1602 now = lr_token (ldfile, charmap, repertoire);
1604 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1606 /* One string read. */
1607 const uint32_t zero = 0;
1609 if (!ignore)
1611 obstack_grow (ob, &zero, 4);
1612 to_wstr = obstack_finish (ob);
1614 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1615 (*top)->str = to_wstr;
1616 (*top)->next = NULL;
1619 if (now->tok == tok_eol)
1621 result->next = ctype->translit;
1622 ctype->translit = result;
1623 return;
1626 if (!ignore)
1627 top = &(*top)->next;
1628 ignore = 0;
1630 else
1632 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1633 if (to_wstr == (uint32_t *) -1l)
1635 /* An error occurred. */
1636 obstack_free (ob, result);
1637 return;
1640 if (to_wstr == NULL)
1641 ignore = 1;
1642 else
1643 /* This value is usable. */
1644 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1646 first = 0;
1652 /* The parser for the LC_CTYPE section of the locale definition. */
1653 void
1654 ctype_read (struct linereader *ldfile, struct localedef_t *result,
1655 struct charmap_t *charmap, const char *repertoire_name,
1656 int ignore_content)
1658 struct repertoire_t *repertoire = NULL;
1659 struct locale_ctype_t *ctype;
1660 struct token *now;
1661 enum token_t nowtok;
1662 size_t cnt;
1663 struct charseq *last_seq;
1664 uint32_t last_wch = 0;
1665 enum token_t last_token;
1666 enum token_t ellipsis_token;
1667 char last_charcode[16];
1668 size_t last_charcode_len = 0;
1669 const char *last_str = NULL;
1670 int mapidx;
1672 /* Get the repertoire we have to use. */
1673 if (repertoire_name != NULL)
1674 repertoire = repertoire_read (repertoire_name);
1676 /* The rest of the line containing `LC_CTYPE' must be free. */
1677 lr_ignore_rest (ldfile, 1);
1682 now = lr_token (ldfile, charmap, NULL);
1683 nowtok = now->tok;
1685 while (nowtok == tok_eol);
1687 /* If we see `copy' now we are almost done. */
1688 if (nowtok == tok_copy)
1690 handle_copy (ldfile, charmap, repertoire, result, tok_lc_ctype, LC_CTYPE,
1691 "LC_CTYPE", ignore_content);
1692 return;
1695 /* Prepare the data structures. */
1696 ctype_startup (ldfile, result, charmap, ignore_content);
1697 ctype = result->categories[LC_CTYPE].ctype;
1699 /* Remember the repertoire we use. */
1700 if (!ignore_content)
1701 ctype->repertoire = repertoire;
1703 while (1)
1705 unsigned long int class_bit = 0;
1706 unsigned long int class256_bit = 0;
1707 int handle_digits = 0;
1709 /* Of course we don't proceed beyond the end of file. */
1710 if (nowtok == tok_eof)
1711 break;
1713 /* Ingore empty lines. */
1714 if (nowtok == tok_eol)
1716 now = lr_token (ldfile, charmap, NULL);
1717 nowtok = now->tok;
1718 continue;
1721 switch (nowtok)
1723 case tok_charclass:
1724 now = lr_token (ldfile, charmap, NULL);
1725 while (now->tok == tok_ident || now->tok == tok_string)
1727 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1728 now = lr_token (ldfile, charmap, NULL);
1729 if (now->tok != tok_semicolon)
1730 break;
1731 now = lr_token (ldfile, charmap, NULL);
1733 if (now->tok != tok_eol)
1734 SYNTAX_ERROR (_("\
1735 %s: syntax error in definition of new character class"), "LC_CTYPE");
1736 break;
1738 case tok_charconv:
1739 now = lr_token (ldfile, charmap, NULL);
1740 while (now->tok == tok_ident || now->tok == tok_string)
1742 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
1743 now = lr_token (ldfile, charmap, NULL);
1744 if (now->tok != tok_semicolon)
1745 break;
1746 now = lr_token (ldfile, charmap, NULL);
1748 if (now->tok != tok_eol)
1749 SYNTAX_ERROR (_("\
1750 %s: syntax error in definition of new character map"), "LC_CTYPE");
1751 break;
1753 case tok_class:
1754 /* Ignore the rest of the line if we don't need the input of
1755 this line. */
1756 if (ignore_content)
1758 lr_ignore_rest (ldfile, 0);
1759 break;
1762 /* We simply forget the `class' keyword and use the following
1763 operand to determine the bit. */
1764 now = lr_token (ldfile, charmap, NULL);
1765 if (now->tok == tok_ident || now->tok == tok_string)
1767 /* Must can be one of the predefined class names. */
1768 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1769 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
1770 break;
1771 if (cnt >= ctype->nr_charclass)
1773 #ifdef PREDEFINED_CLASSES
1774 if (now->val.str.lenmb == 8
1775 && memcmp ("special1", now->val.str.startmb, 8) == 0)
1776 class_bit = _ISwspecial1;
1777 else if (now->val.str.lenmb == 8
1778 && memcmp ("special2", now->val.str.startmb, 8) == 0)
1779 class_bit = _ISwspecial2;
1780 else if (now->val.str.lenmb == 8
1781 && memcmp ("special3", now->val.str.startmb, 8) == 0)
1782 class_bit = _ISwspecial3;
1783 else
1784 #endif
1786 /* OK, it's a new class. */
1787 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1789 class_bit = _ISwbit (ctype->nr_charclass - 1);
1792 else
1793 class_bit = _ISwbit (cnt);
1795 free (now->val.str.startmb);
1797 else if (now->tok == tok_digit)
1798 goto handle_tok_digit;
1799 else if (now->tok < tok_upper || now->tok > tok_blank)
1800 goto err_label;
1801 else
1803 class_bit = BITw (now->tok);
1804 class256_bit = BIT (now->tok);
1807 /* The next character must be a semicolon. */
1808 now = lr_token (ldfile, charmap, NULL);
1809 if (now->tok != tok_semicolon)
1810 goto err_label;
1811 goto read_charclass;
1813 case tok_upper:
1814 case tok_lower:
1815 case tok_alpha:
1816 case tok_alnum:
1817 case tok_space:
1818 case tok_cntrl:
1819 case tok_punct:
1820 case tok_graph:
1821 case tok_print:
1822 case tok_xdigit:
1823 case tok_blank:
1824 /* Ignore the rest of the line if we don't need the input of
1825 this line. */
1826 if (ignore_content)
1828 lr_ignore_rest (ldfile, 0);
1829 break;
1832 class_bit = BITw (now->tok);
1833 class256_bit = BIT (now->tok);
1834 handle_digits = 0;
1835 read_charclass:
1836 ctype->class_done |= class_bit;
1837 last_token = tok_none;
1838 ellipsis_token = tok_none;
1839 now = lr_token (ldfile, charmap, NULL);
1840 while (now->tok != tok_eol && now->tok != tok_eof)
1842 uint32_t wch;
1843 struct charseq *seq;
1845 if (ellipsis_token == tok_none)
1847 if (get_character (now, charmap, repertoire, &seq, &wch))
1848 goto err_label;
1850 if (!ignore_content && seq != NULL && seq->nbytes == 1)
1851 /* Yep, we can store information about this byte
1852 sequence. */
1853 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1855 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
1856 && class_bit != 0)
1857 /* We have the UCS4 position. */
1858 *find_idx (ctype, &ctype->class_collection,
1859 &ctype->class_collection_max,
1860 &ctype->class_collection_act, wch) |= class_bit;
1862 last_token = now->tok;
1863 /* Terminate the string. */
1864 if (last_token == tok_bsymbol)
1866 now->val.str.startmb[now->val.str.lenmb] = '\0';
1867 last_str = now->val.str.startmb;
1869 else
1870 last_str = NULL;
1871 last_seq = seq;
1872 last_wch = wch;
1873 memcpy (last_charcode, now->val.charcode.bytes, 16);
1874 last_charcode_len = now->val.charcode.nbytes;
1876 if (!ignore_content && handle_digits == 1)
1878 /* We must store the digit values. */
1879 if (ctype->mbdigits_act == ctype->mbdigits_max)
1881 ctype->mbdigits_max += 10;
1882 ctype->mbdigits = xrealloc (ctype->mbdigits,
1883 (ctype->mbdigits_max
1884 * sizeof (char *)));
1885 ctype->wcdigits_max += 10;
1886 ctype->wcdigits = xrealloc (ctype->wcdigits,
1887 (ctype->wcdigits_max
1888 * sizeof (uint32_t)));
1891 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1892 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1894 else if (!ignore_content && handle_digits == 2)
1896 /* We must store the digit values. */
1897 if (ctype->outdigits_act >= 10)
1899 lr_error (ldfile, _("\
1900 %s: field `%s' does not contain exactly ten entries"),
1901 "LC_CTYPE", "outdigit");
1902 goto err_label;
1905 ctype->mboutdigits[ctype->outdigits_act] = seq;
1906 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1907 ++ctype->outdigits_act;
1910 else
1912 /* Now it gets complicated. We have to resolve the
1913 ellipsis problem. First we must distinguish between
1914 the different kind of ellipsis and this must match the
1915 tokens we have seen. */
1916 assert (last_token != tok_none);
1918 if (last_token != now->tok)
1920 lr_error (ldfile, _("\
1921 ellipsis range must be marked by two operands of same type"));
1922 lr_ignore_rest (ldfile, 0);
1923 break;
1926 if (last_token == tok_bsymbol)
1928 if (ellipsis_token == tok_ellipsis3)
1929 lr_error (ldfile, _("with symbolic name range values \
1930 the absolute ellipsis `...' must not be used"));
1932 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
1933 repertoire, now, last_str,
1934 class256_bit, class_bit,
1935 (ellipsis_token
1936 == tok_ellipsis4
1937 ? 10 : 16),
1938 ignore_content,
1939 handle_digits);
1941 else if (last_token == tok_ucs4)
1943 if (ellipsis_token != tok_ellipsis2)
1944 lr_error (ldfile, _("\
1945 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1947 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
1948 repertoire, now, last_wch,
1949 class256_bit, class_bit,
1950 ignore_content, handle_digits);
1952 else
1954 assert (last_token == tok_charcode);
1956 if (ellipsis_token != tok_ellipsis3)
1957 lr_error (ldfile, _("\
1958 with character code range values one must use the absolute ellipsis `...'"));
1960 charclass_charcode_ellipsis (ldfile, ctype, charmap,
1961 repertoire, now,
1962 last_charcode,
1963 last_charcode_len,
1964 class256_bit, class_bit,
1965 ignore_content,
1966 handle_digits);
1969 /* Now we have used the last value. */
1970 last_token = tok_none;
1973 /* Next we expect a semicolon or the end of the line. */
1974 now = lr_token (ldfile, charmap, NULL);
1975 if (now->tok == tok_eol || now->tok == tok_eof)
1976 break;
1978 if (last_token != tok_none
1979 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4)
1981 ellipsis_token = now->tok;
1982 now = lr_token (ldfile, charmap, NULL);
1983 continue;
1986 if (now->tok != tok_semicolon)
1987 goto err_label;
1989 /* And get the next character. */
1990 now = lr_token (ldfile, charmap, NULL);
1992 ellipsis_token = tok_none;
1994 break;
1996 case tok_digit:
1997 /* Ignore the rest of the line if we don't need the input of
1998 this line. */
1999 if (ignore_content)
2001 lr_ignore_rest (ldfile, 0);
2002 break;
2005 handle_tok_digit:
2006 class_bit = _ISwdigit;
2007 class256_bit = _ISdigit;
2008 handle_digits = 1;
2009 goto read_charclass;
2011 case tok_outdigit:
2012 /* Ignore the rest of the line if we don't need the input of
2013 this line. */
2014 if (ignore_content)
2016 lr_ignore_rest (ldfile, 0);
2017 break;
2020 if (ctype->outdigits_act != 0)
2021 lr_error (ldfile, _("\
2022 %s: field `%s' declared more than once"),
2023 "LC_CTYPE", "outdigit");
2024 class_bit = 0;
2025 class256_bit = 0;
2026 handle_digits = 2;
2027 goto read_charclass;
2029 case tok_toupper:
2030 /* Ignore the rest of the line if we don't need the input of
2031 this line. */
2032 if (ignore_content)
2034 lr_ignore_rest (ldfile, 0);
2035 break;
2038 mapidx = 0;
2039 goto read_mapping;
2041 case tok_tolower:
2042 /* Ignore the rest of the line if we don't need the input of
2043 this line. */
2044 if (ignore_content)
2046 lr_ignore_rest (ldfile, 0);
2047 break;
2050 mapidx = 1;
2051 goto read_mapping;
2053 case tok_map:
2054 /* Ignore the rest of the line if we don't need the input of
2055 this line. */
2056 if (ignore_content)
2058 lr_ignore_rest (ldfile, 0);
2059 break;
2062 /* We simply forget the `map' keyword and use the following
2063 operand to determine the mapping. */
2064 now = lr_token (ldfile, charmap, NULL);
2065 if (now->tok == tok_ident || now->tok == tok_string)
2067 size_t cnt;
2069 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2070 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2071 break;
2073 if (cnt >= ctype->map_collection_nr)
2074 /* OK, it's a new map. */
2075 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2077 mapidx = cnt;
2079 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2080 goto err_label;
2081 else
2082 mapidx = now->tok - tok_toupper;
2084 now = lr_token (ldfile, charmap, NULL);
2085 /* This better should be a semicolon. */
2086 if (now->tok != tok_semicolon)
2087 goto err_label;
2089 read_mapping:
2090 /* Test whether this mapping was already defined. */
2091 if (ctype->tomap_done[mapidx])
2093 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2094 ctype->mapnames[mapidx]);
2095 lr_ignore_rest (ldfile, 0);
2096 break;
2098 ctype->tomap_done[mapidx] = 1;
2100 now = lr_token (ldfile, charmap, NULL);
2101 while (now->tok != tok_eol && now->tok != tok_eof)
2103 struct charseq *from_seq;
2104 uint32_t from_wch;
2105 struct charseq *to_seq;
2106 uint32_t to_wch;
2108 /* Every pair starts with an opening brace. */
2109 if (now->tok != tok_open_brace)
2110 goto err_label;
2112 /* Next comes the from-value. */
2113 now = lr_token (ldfile, charmap, NULL);
2114 if (get_character (now, charmap, repertoire, &from_seq,
2115 &from_wch) != 0)
2116 goto err_label;
2118 /* The next is a comma. */
2119 now = lr_token (ldfile, charmap, NULL);
2120 if (now->tok != tok_comma)
2121 goto err_label;
2123 /* And the other value. */
2124 now = lr_token (ldfile, charmap, NULL);
2125 if (get_character (now, charmap, repertoire, &to_seq,
2126 &to_wch) != 0)
2127 goto err_label;
2129 /* And the last thing is the closing brace. */
2130 now = lr_token (ldfile, charmap, NULL);
2131 if (now->tok != tok_close_brace)
2132 goto err_label;
2134 if (!ignore_content)
2136 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2137 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2138 /* We can use this value. */
2139 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2140 = to_seq->bytes[0];
2142 if (from_wch != ILLEGAL_CHAR_VALUE
2143 && to_wch != ILLEGAL_CHAR_VALUE)
2144 /* Both correct values. */
2145 *find_idx (ctype, &ctype->map_collection[mapidx],
2146 &ctype->map_collection_max[mapidx],
2147 &ctype->map_collection_act[mapidx],
2148 from_wch) = to_wch;
2151 /* Now comes a semicolon or the end of the line/file. */
2152 now = lr_token (ldfile, charmap, NULL);
2153 if (now->tok == tok_semicolon)
2154 now = lr_token (ldfile, charmap, NULL);
2156 break;
2158 case tok_translit_start:
2159 /* Ignore the rest of the line if we don't need the input of
2160 this line. */
2161 if (ignore_content)
2163 lr_ignore_rest (ldfile, 0);
2164 break;
2167 /* The rest of the line better should be empty. */
2168 lr_ignore_rest (ldfile, 1);
2170 /* We count here the number of allocated entries in the `translit'
2171 array. */
2172 cnt = 0;
2174 /* We proceed until we see the `translit_end' token. */
2175 while (now = lr_token (ldfile, charmap, repertoire),
2176 now->tok != tok_translit_end && now->tok != tok_eof)
2178 if (now->tok == tok_eol)
2179 /* Ignore empty lines. */
2180 continue;
2182 if (now->tok == tok_translit_end)
2184 lr_ignore_rest (ldfile, 0);
2185 break;
2188 if (now->tok == tok_include)
2190 /* We have to include locale. */
2191 const char *locale_name;
2192 const char *repertoire_name;
2194 now = lr_token (ldfile, charmap, NULL);
2195 /* This should be a string or an identifier. In any
2196 case something to name a locale. */
2197 if (now->tok != tok_string && now->tok != tok_ident)
2199 translit_syntax:
2200 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2201 lr_ignore_rest (ldfile, 0);
2202 continue;
2204 locale_name = now->val.str.startmb;
2206 /* Next should be a semicolon. */
2207 now = lr_token (ldfile, charmap, NULL);
2208 if (now->tok != tok_semicolon)
2209 goto translit_syntax;
2211 /* Now the repertoire name. */
2212 now = lr_token (ldfile, charmap, NULL);
2213 if ((now->tok != tok_string && now->tok != tok_ident)
2214 || now->val.str.startmb == NULL)
2215 goto translit_syntax;
2216 repertoire_name = now->val.str.startmb;
2218 /* We must not have more than one `include'. */
2219 if (ctype->translit_copy_locale != NULL)
2221 lr_error (ldfile, _("\
2222 %s: only one `include' instruction allowed"), "LC_CTYPE");
2223 lr_ignore_rest (ldfile, 0);
2224 continue;
2227 ctype->translit_copy_locale = locale_name;
2228 ctype->translit_copy_repertoire = repertoire_name;
2230 /* The rest of the line must be empty. */
2231 lr_ignore_rest (ldfile, 1);
2232 continue;
2235 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2237 break;
2239 case tok_ident:
2240 /* Ignore the rest of the line if we don't need the input of
2241 this line. */
2242 if (ignore_content)
2244 lr_ignore_rest (ldfile, 0);
2245 break;
2248 /* This could mean one of several things. First test whether
2249 it's a character class name. */
2250 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2251 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2252 break;
2253 if (cnt < ctype->nr_charclass)
2255 class_bit = _ISwbit (cnt);
2256 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2257 free (now->val.str.startmb);
2258 goto read_charclass;
2260 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2261 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2262 break;
2263 if (cnt < ctype->map_collection_nr)
2265 mapidx = cnt;
2266 free (now->val.str.startmb);
2267 goto read_mapping;
2269 #ifdef PREDEFINED_CLASSES
2270 if (strcmp (now->val.str.startmb, "special1") == 0)
2272 class_bit = _ISwspecial1;
2273 free (now->val.str.startmb);
2274 goto read_charclass;
2276 if (strcmp (now->val.str.startmb, "special2") == 0)
2278 class_bit = _ISwspecial2;
2279 free (now->val.str.startmb);
2280 goto read_charclass;
2282 if (strcmp (now->val.str.startmb, "special3") == 0)
2284 class_bit = _ISwspecial3;
2285 free (now->val.str.startmb);
2286 goto read_charclass;
2288 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2290 mapidx = 2;
2291 goto read_mapping;
2293 #endif
2294 break;
2296 case tok_end:
2297 /* Next we assume `LC_CTYPE'. */
2298 now = lr_token (ldfile, charmap, NULL);
2299 if (now->tok == tok_eof)
2300 break;
2301 if (now->tok == tok_eol)
2302 lr_error (ldfile, _("%s: incomplete `END' line"),
2303 "LC_CTYPE");
2304 else if (now->tok != tok_lc_ctype)
2305 lr_error (ldfile, _("\
2306 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2307 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2308 return;
2310 default:
2311 err_label:
2312 if (now->tok != tok_eof)
2313 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2316 /* Prepare for the next round. */
2317 now = lr_token (ldfile, charmap, NULL);
2318 nowtok = now->tok;
2321 /* When we come here we reached the end of the file. */
2322 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2326 static void
2327 set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2328 struct repertoire_t *repertoire)
2330 size_t cnt;
2332 /* These function defines the default values for the classes and conversions
2333 according to POSIX.2 2.5.2.1.
2334 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2335 Don't move them unless you know what you do! */
2337 void set_default (int bitpos, int from, int to)
2339 char tmp[2];
2340 int ch;
2341 int bit = _ISbit (bitpos);
2342 int bitw = _ISwbit (bitpos);
2343 /* Define string. */
2344 strcpy (tmp, "?");
2346 for (ch = from; ch <= to; ++ch)
2348 uint32_t value;
2349 struct charseq *seq;
2350 tmp[0] = ch;
2352 value = repertoire_find_value (repertoire, tmp, 1);
2353 if (value == ILLEGAL_CHAR_VALUE)
2355 if (!be_quiet)
2356 error (0, 0, _("\
2357 %s: character `%s' not defined in repertoire while needed as default value"),
2358 "LC_CTYPE", tmp);
2360 else
2361 ELEM (ctype, class_collection, , value) |= bitw;
2363 seq = charmap_find_value (charmap, tmp, 1);
2364 if (seq == NULL)
2366 if (!be_quiet)
2367 error (0, 0, _("\
2368 %s: character `%s' not defined in charmap while needed as default value"),
2369 "LC_CTYPE", tmp);
2371 else if (seq->nbytes != 1)
2372 error (0, 0, _("\
2373 %s: character `%s' in charmap not representable with one byte"),
2374 "LC_CTYPE", tmp);
2375 else
2376 ctype->class256_collection[seq->bytes[0]] |= bit;
2380 /* Set default values if keyword was not present. */
2381 if ((ctype->class_done & BITw (tok_upper)) == 0)
2382 /* "If this keyword [lower] is not specified, the lowercase letters
2383 `A' through `Z', ..., shall automatically belong to this class,
2384 with implementation defined character values." [P1003.2, 2.5.2.1] */
2385 set_default (BITPOS (tok_upper), 'A', 'Z');
2387 if ((ctype->class_done & BITw (tok_lower)) == 0)
2388 /* "If this keyword [lower] is not specified, the lowercase letters
2389 `a' through `z', ..., shall automatically belong to this class,
2390 with implementation defined character values." [P1003.2, 2.5.2.1] */
2391 set_default (BITPOS (tok_lower), 'a', 'z');
2393 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2395 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2396 class `lower' *must* be in class `alpha'. */
2397 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2398 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2400 for (cnt = 0; cnt < 256; ++cnt)
2401 if ((ctype->class256_collection[cnt] & mask) != 0)
2402 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2404 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2405 if ((ctype->class_collection[cnt] & maskw) != 0)
2406 ctype->class_collection[cnt] |= BITw (tok_alpha);
2409 if ((ctype->class_done & BITw (tok_digit)) == 0)
2410 /* "If this keyword [digit] is not specified, the digits `0' through
2411 `9', ..., shall automatically belong to this class, with
2412 implementation-defined character values." [P1003.2, 2.5.2.1] */
2413 set_default (BITPOS (tok_digit), '0', '9');
2415 /* "Only characters specified for the `alpha' and `digit' keyword
2416 shall be specified. Characters specified for the keyword `alpha'
2417 and `digit' are automatically included in this class. */
2419 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2420 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2422 for (cnt = 0; cnt < 256; ++cnt)
2423 if ((ctype->class256_collection[cnt] & mask) != 0)
2424 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2426 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2427 if ((ctype->class_collection[cnt] & maskw) != 0)
2428 ctype->class_collection[cnt] |= BITw (tok_alnum);
2431 if ((ctype->class_done & BITw (tok_space)) == 0)
2432 /* "If this keyword [space] is not specified, the characters <space>,
2433 <form-feed>, <newline>, <carriage-return>, <tab>, and
2434 <vertical-tab>, ..., shall automatically belong to this class,
2435 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2437 uint32_t value;
2438 struct charseq *seq;
2440 value = repertoire_find_value (repertoire, "space", 5);
2441 if (value == ILLEGAL_CHAR_VALUE)
2443 if (!be_quiet)
2444 error (0, 0, _("\
2445 %s: character `%s' not defined while needed as default value"),
2446 "LC_CTYPE", "<space>");
2448 else
2449 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2451 seq = charmap_find_value (charmap, "space", 5);
2452 if (seq == NULL)
2454 if (!be_quiet)
2455 error (0, 0, _("\
2456 %s: character `%s' not defined while needed as default value"),
2457 "LC_CTYPE", "<space>");
2459 else if (seq->nbytes != 1)
2460 error (0, 0, _("\
2461 %s: character `%s' in charmap not representable with one byte"),
2462 "LC_CTYPE", "<space>");
2463 else
2464 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2467 value = repertoire_find_value (repertoire, "form-feed", 9);
2468 if (value == ILLEGAL_CHAR_VALUE)
2470 if (!be_quiet)
2471 error (0, 0, _("\
2472 %s: character `%s' not defined while needed as default value"),
2473 "LC_CTYPE", "<form-feed>");
2475 else
2476 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2478 seq = charmap_find_value (charmap, "form-feed", 9);
2479 if (seq == NULL)
2481 if (!be_quiet)
2482 error (0, 0, _("\
2483 %s: character `%s' not defined while needed as default value"),
2484 "LC_CTYPE", "<form-feed>");
2486 else if (seq->nbytes != 1)
2487 error (0, 0, _("\
2488 %s: character `%s' in charmap not representable with one byte"),
2489 "LC_CTYPE", "<form-feed>");
2490 else
2491 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2494 value = repertoire_find_value (repertoire, "newline", 7);
2495 if (value == ILLEGAL_CHAR_VALUE)
2497 if (!be_quiet)
2498 error (0, 0, _("\
2499 %s: character `%s' not defined while needed as default value"),
2500 "LC_CTYPE", "<newline>");
2502 else
2503 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2505 seq = charmap_find_value (charmap, "newline", 7);
2506 if (seq == NULL)
2508 if (!be_quiet)
2509 error (0, 0, _("\
2510 character `%s' not defined while needed as default value"),
2511 "<newline>");
2513 else if (seq->nbytes != 1)
2514 error (0, 0, _("\
2515 %s: character `%s' in charmap not representable with one byte"),
2516 "LC_CTYPE", "<newline>");
2517 else
2518 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2521 value = repertoire_find_value (repertoire, "carriage-return", 15);
2522 if (value == ILLEGAL_CHAR_VALUE)
2524 if (!be_quiet)
2525 error (0, 0, _("\
2526 %s: character `%s' not defined while needed as default value"),
2527 "LC_CTYPE", "<carriage-return>");
2529 else
2530 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2532 seq = charmap_find_value (charmap, "carriage-return", 15);
2533 if (seq == NULL)
2535 if (!be_quiet)
2536 error (0, 0, _("\
2537 %s: character `%s' not defined while needed as default value"),
2538 "LC_CTYPE", "<carriage-return>");
2540 else if (seq->nbytes != 1)
2541 error (0, 0, _("\
2542 %s: character `%s' in charmap not representable with one byte"),
2543 "LC_CTYPE", "<carriage-return>");
2544 else
2545 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2548 value = repertoire_find_value (repertoire, "tab", 3);
2549 if (value == ILLEGAL_CHAR_VALUE)
2551 if (!be_quiet)
2552 error (0, 0, _("\
2553 %s: character `%s' not defined while needed as default value"),
2554 "LC_CTYPE", "<tab>");
2556 else
2557 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2559 seq = charmap_find_value (charmap, "tab", 3);
2560 if (seq == NULL)
2562 if (!be_quiet)
2563 error (0, 0, _("\
2564 %s: character `%s' not defined while needed as default value"),
2565 "LC_CTYPE", "<tab>");
2567 else if (seq->nbytes != 1)
2568 error (0, 0, _("\
2569 %s: character `%s' in charmap not representable with one byte"),
2570 "LC_CTYPE", "<tab>");
2571 else
2572 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2575 value = repertoire_find_value (repertoire, "vertical-tab", 12);
2576 if (value == ILLEGAL_CHAR_VALUE)
2578 if (!be_quiet)
2579 error (0, 0, _("\
2580 %s: character `%s' not defined while needed as default value"),
2581 "LC_CTYPE", "<vertical-tab>");
2583 else
2584 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2586 seq = charmap_find_value (charmap, "vertical-tab", 12);
2587 if (seq == NULL)
2589 if (!be_quiet)
2590 error (0, 0, _("\
2591 %s: character `%s' not defined while needed as default value"),
2592 "LC_CTYPE", "<vertical-tab>");
2594 else if (seq->nbytes != 1)
2595 error (0, 0, _("\
2596 %s: character `%s' in charmap not representable with one byte"),
2597 "LC_CTYPE", "<vertical-tab>");
2598 else
2599 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2602 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
2603 /* "If this keyword is not specified, the digits `0' to `9', the
2604 uppercase letters `A' through `F', and the lowercase letters `a'
2605 through `f', ..., shell automatically belong to this class, with
2606 implementation defined character values." [P1003.2, 2.5.2.1] */
2608 set_default (BITPOS (tok_xdigit), '0', '9');
2609 set_default (BITPOS (tok_xdigit), 'A', 'F');
2610 set_default (BITPOS (tok_xdigit), 'a', 'f');
2613 if ((ctype->class_done & BITw (tok_blank)) == 0)
2614 /* "If this keyword [blank] is unspecified, the characters <space> and
2615 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2617 uint32_t value;
2618 struct charseq *seq;
2620 value = repertoire_find_value (repertoire, "space", 5);
2621 if (value == ILLEGAL_CHAR_VALUE)
2623 if (!be_quiet)
2624 error (0, 0, _("\
2625 %s: character `%s' not defined while needed as default value"),
2626 "LC_CTYPE", "<space>");
2628 else
2629 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2631 seq = charmap_find_value (charmap, "space", 5);
2632 if (seq == NULL)
2634 if (!be_quiet)
2635 error (0, 0, _("\
2636 %s: character `%s' not defined while needed as default value"),
2637 "LC_CTYPE", "<space>");
2639 else if (seq->nbytes != 1)
2640 error (0, 0, _("\
2641 %s: character `%s' in charmap not representable with one byte"),
2642 "LC_CTYPE", "<space>");
2643 else
2644 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2647 value = repertoire_find_value (repertoire, "tab", 3);
2648 if (value == ILLEGAL_CHAR_VALUE)
2650 if (!be_quiet)
2651 error (0, 0, _("\
2652 %s: character `%s' not defined while needed as default value"),
2653 "LC_CTYPE", "<tab>");
2655 else
2656 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2658 seq = charmap_find_value (charmap, "tab", 3);
2659 if (seq == NULL)
2661 if (!be_quiet)
2662 error (0, 0, _("\
2663 %s: character `%s' not defined while needed as default value"),
2664 "LC_CTYPE", "<tab>");
2666 else if (seq->nbytes != 1)
2667 error (0, 0, _("\
2668 %s: character `%s' in charmap not representable with one byte"),
2669 "LC_CTYPE", "<tab>");
2670 else
2671 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2674 if ((ctype->class_done & BITw (tok_graph)) == 0)
2675 /* "If this keyword [graph] is not specified, characters specified for
2676 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2677 shall belong to this character class." [P1003.2, 2.5.2.1] */
2679 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2680 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2681 size_t cnt;
2683 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2684 if ((ctype->class_collection[cnt] & mask) != 0)
2685 ctype->class_collection[cnt] |= BIT (tok_graph);
2687 for (cnt = 0; cnt < 256; ++cnt)
2688 if ((ctype->class256_collection[cnt] & mask) != 0)
2689 ctype->class256_collection[cnt] |= BIT (tok_graph);
2692 if ((ctype->class_done & BITw (tok_print)) == 0)
2693 /* "If this keyword [print] is not provided, characters specified for
2694 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2695 and the <space> character shall belong to this character class."
2696 [P1003.2, 2.5.2.1] */
2698 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2699 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2700 size_t cnt;
2701 uint32_t space;
2702 struct charseq *seq;
2704 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2705 if ((ctype->class_collection[cnt] & mask) != 0)
2706 ctype->class_collection[cnt] |= BIT (tok_print);
2708 for (cnt = 0; cnt < 256; ++cnt)
2709 if ((ctype->class256_collection[cnt] & mask) != 0)
2710 ctype->class256_collection[cnt] |= BIT (tok_print);
2713 space = repertoire_find_value (repertoire, "space", 5);
2714 if (space == ILLEGAL_CHAR_VALUE)
2716 if (!be_quiet)
2717 error (0, 0, _("\
2718 %s: character `%s' not defined while needed as default value"),
2719 "LC_CTYPE", "<space>");
2721 else
2722 ELEM (ctype, class_collection, , space) |= BIT (tok_print);
2724 seq = charmap_find_value (charmap, "space", 5);
2725 if (seq == NULL)
2727 if (!be_quiet)
2728 error (0, 0, _("\
2729 %s: character `%s' not defined while needed as default value"),
2730 "LC_CTYPE", "<space>");
2732 else if (seq->nbytes != 1)
2733 error (0, 0, _("\
2734 %s: character `%s' in charmap not representable with one byte"),
2735 "LC_CTYPE", "<space>");
2736 else
2737 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
2740 if (ctype->tomap_done[0] == 0)
2741 /* "If this keyword [toupper] is not specified, the lowercase letters
2742 `a' through `z', and their corresponding uppercase letters `A' to
2743 `Z', ..., shall automatically be included, with implementation-
2744 defined character values." [P1003.2, 2.5.2.1] */
2746 char tmp[4];
2747 int ch;
2749 strcpy (tmp, "<?>");
2751 for (ch = 'a'; ch <= 'z'; ++ch)
2753 uint32_t value_from, value_to;
2754 struct charseq *seq_from, *seq_to;
2756 tmp[1] = (char) ch;
2758 value_from = repertoire_find_value (repertoire, &tmp[1], 1);
2759 if (value_from == ILLEGAL_CHAR_VALUE)
2761 if (!be_quiet)
2762 error (0, 0, _("\
2763 %s: character `%s' not defined while needed as default value"),
2764 "LC_CTYPE", tmp);
2766 else
2768 /* This conversion is implementation defined. */
2769 tmp[1] = (char) (ch + ('A' - 'a'));
2770 value_to = repertoire_find_value (repertoire, &tmp[1], 1);
2771 if (value_to == ILLEGAL_CHAR_VALUE)
2773 if (!be_quiet)
2774 error (0, 0, _("\
2775 %s: character `%s' not defined while needed as default value"),
2776 "LC_CTYPE", tmp);
2778 else
2779 /* The index [0] is determined by the order of the
2780 `ctype_map_newP' calls in `ctype_startup'. */
2781 ELEM (ctype, map_collection, [0], value_from) = value_to;
2784 seq_from = charmap_find_value (charmap, &tmp[1], 1);
2785 if (seq_from == NULL)
2787 if (!be_quiet)
2788 error (0, 0, _("\
2789 %s: character `%s' not defined while needed as default value"),
2790 "LC_CTYPE", tmp);
2792 else if (seq_from->nbytes != 1)
2794 if (!be_quiet)
2795 error (0, 0, _("\
2796 %s: character `%s' needed as default value not representable with one byte"),
2797 "LC_CTYPE", tmp);
2799 else
2801 /* This conversion is implementation defined. */
2802 tmp[1] = (char) (ch + ('A' - 'a'));
2803 seq_to = charmap_find_value (charmap, &tmp[1], 1);
2804 if (seq_to == NULL)
2806 if (!be_quiet)
2807 error (0, 0, _("\
2808 %s: character `%s' not defined while needed as default value"),
2809 "LC_CTYPE", tmp);
2811 else if (seq_to->nbytes != 1)
2813 if (!be_quiet)
2814 error (0, 0, _("\
2815 %s: character `%s' needed as default value not representable with one byte"),
2816 "LC_CTYPE", tmp);
2818 else
2819 /* The index [0] is determined by the order of the
2820 `ctype_map_newP' calls in `ctype_startup'. */
2821 ctype->map256_collection[0][seq_from->bytes[0]]
2822 = seq_to->bytes[0];
2827 if (ctype->tomap_done[1] == 0)
2828 /* "If this keyword [tolower] is not specified, the mapping shall be
2829 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2831 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
2832 if (ctype->map_collection[0][cnt] != 0)
2833 ELEM (ctype, map_collection, [1],
2834 ctype->map_collection[0][cnt])
2835 = ctype->charnames[cnt];
2837 for (cnt = 0; cnt < 256; ++cnt)
2838 if (ctype->map256_collection[0][cnt] != 0)
2839 ctype->map_collection[1][ctype->map_collection[0][cnt]]
2840 = ctype->charnames[cnt];
2843 if (ctype->outdigits_act == 0)
2845 for (cnt = 0; cnt < 10; ++cnt)
2847 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2848 digits + cnt, 1);
2850 if (ctype->mboutdigits[cnt] == NULL)
2852 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2853 longnames[cnt],
2854 strlen (longnames[cnt]));
2856 if (ctype->mboutdigits[cnt] == NULL)
2858 /* Provide a replacement. */
2859 error (0, 0, _("\
2860 no output digits defined and none of the standard names in the charmap"));
2862 ctype->mboutdigits[cnt] = obstack_alloc (&charmap->mem_pool,
2863 sizeof (struct charseq) + 1);
2865 /* This is better than nothing. */
2866 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
2867 ctype->mboutdigits[cnt]->nbytes = 1;
2871 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2872 digits + cnt, 1);
2874 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2876 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2877 longnames[cnt],
2878 strlen (longnames[cnt]));
2880 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2882 /* Provide a replacement. */
2883 error (0, 0, _("\
2884 no output digits defined and none of the standard names in the repertoire"));
2886 /* This is better than nothing. */
2887 ctype->wcoutdigits[cnt] = (uint32_t) digits[cnt];
2892 ctype->outdigits_act = 10;
2897 static void
2898 allocate_arrays (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2899 struct repertoire_t *repertoire)
2901 size_t idx;
2903 /* First we have to decide how we organize the arrays. It is easy
2904 for a one-byte character set. But multi-byte character set
2905 cannot be stored flat because the chars might be sparsely used.
2906 So we determine an optimal hashing function for the used
2907 characters.
2909 We use a very trivial hashing function to store the sparse
2910 table. CH % TABSIZE is used as an index. To solve multiple hits
2911 we have N planes. This guarantees a fixed search time for a
2912 character [N / 2]. In the following code we determine the minimum
2913 value for TABSIZE * N, where TABSIZE >= 256. */
2914 size_t min_total = UINT_MAX;
2915 size_t act_size = 256;
2917 if (!be_quiet)
2918 fputs (_("\
2919 Computing table size for character classes might take a while..."),
2920 stderr);
2922 while (act_size < min_total)
2924 size_t cnt[act_size];
2925 size_t act_planes = 1;
2927 memset (cnt, '\0', sizeof cnt);
2929 for (idx = 0; idx < 256; ++idx)
2930 cnt[idx] = 1;
2932 for (idx = 0; idx < ctype->charnames_act; ++idx)
2933 if (ctype->charnames[idx] >= 256)
2935 size_t nr = ctype->charnames[idx] % act_size;
2937 if (++cnt[nr] > act_planes)
2939 act_planes = cnt[nr];
2940 if (act_size * act_planes >= min_total)
2941 break;
2945 if (act_size * act_planes < min_total)
2947 min_total = act_size * act_planes;
2948 ctype->plane_size = act_size;
2949 ctype->plane_cnt = act_planes;
2952 ++act_size;
2955 if (!be_quiet)
2956 fputs (_(" done\n"), stderr);
2959 ctype->names = (uint32_t *) xcalloc (ctype->plane_size
2960 * ctype->plane_cnt,
2961 sizeof (uint32_t));
2963 for (idx = 1; idx < 256; ++idx)
2964 ctype->names[idx] = idx;
2966 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
2967 ctype->names[0] = 1;
2969 for (idx = 256; idx < ctype->charnames_act; ++idx)
2971 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
2972 size_t depth = 0;
2974 while (ctype->names[nr + depth * ctype->plane_size])
2975 ++depth;
2976 assert (depth < ctype->plane_cnt);
2978 ctype->names[nr + depth * ctype->plane_size] = ctype->charnames[idx];
2980 /* Now for faster access remember the index in the NAMES_B array. */
2981 ctype->charnames[idx] = nr + depth * ctype->plane_size;
2983 ctype->names[0] = 0;
2986 /* You wonder about this amount of memory? This is only because some
2987 users do not manage to address the array with unsigned values or
2988 data types with range >= 256. '\200' would result in the array
2989 index -128. To help these poor people we duplicate the entries for
2990 128 up to 255 below the entry for \0. */
2991 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
2992 sizeof (char_class_t));
2993 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
2994 * ctype->plane_cnt,
2995 sizeof (char_class32_t));
2997 /* This is the array accessed using the multibyte string elements. */
2998 for (idx = 0; idx < 256; ++idx)
2999 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3001 /* Mirror first 127 entries. We must take care that entry -1 is not
3002 mirrored because EOF == -1. */
3003 for (idx = 0; idx < 127; ++idx)
3004 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3006 /* The 32 bit array contains all characters. */
3007 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3008 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3010 /* Room for table of mappings. */
3011 ctype->map = (uint32_t **) xmalloc (ctype->map_collection_nr
3012 * sizeof (uint32_t *));
3014 /* Fill in all mappings. */
3015 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3017 unsigned int idx2;
3019 /* Allocate table. */
3020 ctype->map[idx] = (uint32_t *) xmalloc ((ctype->plane_size
3021 * ctype->plane_cnt + 128)
3022 * sizeof (uint32_t));
3024 /* Copy default value (identity mapping). */
3025 memcpy (&ctype->map[idx][128], ctype->names,
3026 ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t));
3028 /* Copy values from collection. */
3029 for (idx2 = 0; idx2 < 256; ++idx2)
3030 ctype->map[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3032 /* Mirror first 127 entries. We must take care not to map entry
3033 -1 because EOF == -1. */
3034 for (idx2 = 0; idx2 < 127; ++idx2)
3035 ctype->map[idx][idx2] = ctype->map[idx][256 + idx2];
3037 /* EOF must map to EOF. */
3038 ctype->map[idx][127] = EOF;
3040 /* The 32 bit map collection. */
3041 for (idx2 = 0; idx2 < ctype->map_collection_act[idx]; ++idx2)
3042 if (ctype->map_collection[idx][idx2] != 0)
3043 ctype->map[idx][128 + ctype->charnames[idx2]]
3044 = ctype->map_collection[idx][idx2];
3047 /* Extra array for class and map names. */
3048 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3049 * sizeof (uint32_t));
3050 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3051 * sizeof (uint32_t));
3053 /* Array for width information. Because the expected width are very
3054 small we use only one single byte. This save space and we need
3055 not provide the information twice with both endianesses. */
3056 ctype->width = (unsigned char *) xmalloc (ctype->plane_size
3057 * ctype->plane_cnt);
3058 /* Initialize with default width value. */
3059 memset (ctype->width, charmap->width_default,
3060 ctype->plane_size * ctype->plane_cnt);
3061 if (charmap->width_rules != NULL)
3063 size_t cnt;
3065 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3067 unsigned char bytes[charmap->mb_cur_max];
3068 int nbytes = charmap->width_rules[cnt].from->nbytes;
3070 /* We have the range of character for which the width is
3071 specified described using byte sequences of the multibyte
3072 charset. We have to convert this to UCS4 now. And we
3073 cannot simply convert the beginning and the end of the
3074 sequence, we have to iterate over the byte sequence and
3075 convert it for every single character. */
3076 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3078 while (nbytes < charmap->width_rules[cnt].to->nbytes
3079 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3080 nbytes) <= 0)
3082 /* Find the UCS value for `bytes'. */
3083 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
3084 nbytes);
3085 int inner;
3087 if (wch != ILLEGAL_CHAR_VALUE)
3089 /* Store the value. */
3090 size_t nr = idx % ctype->plane_size;
3091 size_t depth = 0;
3093 while (ctype->names[nr + depth * ctype->plane_size] != nr)
3094 ++depth;
3095 assert (depth < ctype->plane_cnt);
3097 ctype->width[nr + depth * ctype->plane_size]
3098 = charmap->width_rules[cnt].width;
3101 /* "Increment" the bytes sequence. */
3102 inner = nbytes - 1;
3103 while (inner >= 0 && bytes[inner] == 0xff)
3104 --inner;
3106 if (inner < 0)
3108 /* We have to extend the byte sequence. */
3109 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3110 break;
3112 bytes[0] = 1;
3113 memset (&bytes[1], 0, nbytes);
3114 ++nbytes;
3116 else
3118 ++bytes[inner];
3119 while (++inner < nbytes)
3120 bytes[inner] = 0;
3126 /* Set MB_CUR_MAX. */
3127 ctype->mb_cur_max = charmap->mb_cur_max;
3129 /* We need the name of the currently used 8-bit character set to
3130 make correct conversion between this 8-bit representation and the
3131 ISO 10646 character set used internally for wide characters. */
3132 ctype->codeset_name = charmap->code_set_name;
3134 /* Now determine the table for the transliteration information.
3136 XXX It is not yet clear to me whether it is worth implementing a
3137 complicated algorithm which uses a hash table to locate the entries.
3138 For now I'll use a simple array which can be searching using binary
3139 search. */
3140 if (ctype->translit_copy_locale != NULL)
3142 /* Fold in the transliteration information from the locale mentioned
3143 in the `include' statement. */
3144 struct locale_ctype_t *here = ctype;
3148 struct localedef_t *other = find_locale (LC_CTYPE,
3149 here->translit_copy_locale,
3150 repertoire->name, charmap);
3152 if (other == NULL)
3154 error (0, 0, _("\
3155 %s: transliteration data from locale `%s' not available"),
3156 "LC_CTYPE", here->translit_copy_locale);
3157 break;
3160 here = other->categories[LC_CTYPE].ctype;
3162 /* Enqueue the information if necessary. */
3163 if (here->translit != NULL)
3165 struct translit_t *endp = here->translit;
3166 while (endp->next != NULL)
3167 endp = endp->next;
3169 endp->next = ctype->translit;
3170 ctype->translit = here->translit;
3173 while (here->translit_copy_locale != NULL);
3176 if (ctype->translit != NULL)
3178 /* First count how many entries we have. This is the upper limit
3179 since some entries from the included files might be overwritten. */
3180 size_t number = 0;
3181 size_t cnt;
3182 struct translit_t *runp = ctype->translit;
3183 struct translit_t **sorted;
3184 size_t from_len, to_len;
3186 while (runp != NULL)
3188 ++number;
3189 runp = runp->next;
3192 /* Next we allocate an array large enough and fill in the values. */
3193 sorted = (struct translit_t **) alloca (number
3194 * sizeof (struct translit_t **));
3195 runp = ctype->translit;
3196 number = 0;
3199 /* Search for the place where to insert this string.
3200 XXX Better use a real sorting algorithm later. */
3201 size_t idx = 0;
3202 int replace = 0;
3204 while (idx < number)
3206 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3207 (const wchar_t *) runp->from);
3208 if (res == 0)
3210 replace = 1;
3211 break;
3213 if (res > 0)
3214 break;
3215 ++idx;
3218 if (replace)
3219 sorted[idx] = runp;
3220 else
3222 memmove (&sorted[idx + 1], &sorted[idx],
3223 (number - idx) * sizeof (struct translit_t *));
3224 sorted[idx] = runp;
3225 ++number;
3228 runp = runp->next;
3230 while (runp != NULL);
3232 /* The next step is putting all the possible transliteration
3233 strings in one memory block so that we can write it out.
3234 We need several different blocks:
3235 - index to the tfromstring array
3236 - from-string array
3237 - index to the to-string array
3238 - to-string array.
3239 And this all must be available for both endianes variants.
3241 from_len = to_len = 0;
3242 for (cnt = 0; cnt < number; ++cnt)
3244 struct translit_to_t *srunp;
3245 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3246 srunp = sorted[cnt]->to;
3247 while (srunp != NULL)
3249 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3250 srunp = srunp->next;
3252 /* Plus one for the extra NUL character marking the end of
3253 the list for the current entry. */
3254 ++to_len;
3257 /* We can allocate the arrays for the results. */
3258 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3259 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3260 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3261 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3263 from_len = 0;
3264 to_len = 0;
3265 for (cnt = 0; cnt < number; ++cnt)
3267 size_t len;
3268 struct translit_to_t *srunp;
3270 ctype->translit_from_idx[cnt] = from_len;
3271 ctype->translit_to_idx[cnt] = to_len;
3273 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3274 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3275 (const wchar_t *) sorted[cnt]->from, len);
3276 from_len += len;
3278 ctype->translit_to_idx[cnt] = to_len;
3279 srunp = sorted[cnt]->to;
3280 while (srunp != NULL)
3282 len = wcslen ((const wchar_t *) srunp->str) + 1;
3283 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3284 (const wchar_t *) srunp->str, len);
3285 to_len += len;
3286 srunp = srunp->next;
3288 ctype->translit_to_tbl[to_len++] = L'\0';
3291 /* Store the information about the length. */
3292 ctype->translit_idx_size = number * sizeof (uint32_t);
3293 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3294 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3296 else
3298 /* Provide some dummy pointers since we have nothing to write out. */
3299 static uint32_t no_str = { 0 };
3301 ctype->translit_from_idx = &no_str;
3302 ctype->translit_from_tbl = &no_str;
3303 ctype->translit_to_tbl = &no_str;
3304 ctype->translit_idx_size = 0;
3305 ctype->translit_from_tbl_size = 0;
3306 ctype->translit_to_tbl_size = 0;