Update.
[glibc.git] / locale / programs / ld-ctype.c
blobc545ba4164c0dc5554a9cfdd4ade480d7ab3a2d6
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <alloca.h>
25 #include <byteswap.h>
26 #include <endian.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <obstack.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <wctype.h>
34 #include <sys/uio.h>
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
42 #include "localedef.h"
44 #include <assert.h>
47 /* These are the extra bits not in wctype.h since these are not preallocated
48 classes. */
/* Bit masks for the three extra classes.  The constants are unsigned
   because shifting a signed 1 into bit 31 overflows `int', which is
   undefined behaviour; the values are only ever combined with 32-bit
   unsigned class words.  */
#define _ISwspecial1 (1U << 29)
#define _ISwspecial2 (1U << 30)
#define _ISwspecial3 (1U << 31)
54 /* The bit used for representing a special class. */
/* BITPOS maps a class token to its zero-based position by subtracting
   tok_upper, the token of the first class.  */
55 #define BITPOS(class) ((class) - tok_upper)
/* BIT yields the mask for that position via _ISbit; it is applied to
   the class256_collection words in this file.  */
56 #define BIT(class) (_ISbit (BITPOS (class)))
/* BITw is the same but via _ISwbit; it is applied to the
   class_collection words in this file.  */
57 #define BITw(class) (_ISwbit (BITPOS (class)))
/* ELEM expands to an lvalue: the slot for character VALUE in the growable
   table CTYPE->COLLECTION.  It dereferences the pointer returned by
   find_idx, passing the table together with its COLLECTION_max and
   COLLECTION_act counters.  IDX is pasted after each member name, so it
   is either empty or an array subscript such as `[n]'.  */
59 #define ELEM(ctype, collection, idx, value) \
60 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
61 &ctype->collection##_act idx, value)
64 /* To be compatible with former implementations we for now restrict
65 the number of bits for character classes to 16. When compatibility
66 is not necessary anymore increase the number to 32. */
67 #define char_class_t uint16_t
68 #define char_class32_t uint32_t
71 /* Type to describe a transliteration action. We have a possibly
72 multiple character from-string and a set of multiple character
73 to-strings. All are 32bit values since this is what is used in
74 the gconv functions. */
/* One replacement ("to") string of a transliteration rule.  */
75 struct translit_to_t
/* The replacement string, stored as 32-bit values.  */
77 uint32_t *str;
/* Link to another entry of the same type; presumably the next
   alternative replacement for the same from-string.  */
79 struct translit_to_t *next;
/* One transliteration rule: a from-string and its replacement list.  */
82 struct translit_t
/* The string to be replaced, stored as 32-bit values.  */
84 uint32_t *from;
/* Head of the list of replacement strings.  */
86 struct translit_to_t *to;
/* Link to another rule of the same type.  */
88 struct translit_t *next;
92 /* The real definition of the struct for the LC_CTYPE locale. */
/* Working state for one LC_CTYPE definition.  It is allocated zeroed by
   ctype_startup, checked and completed by ctype_finish, and turned into
   the output file by ctype_output.  */
93 struct locale_ctype_t
/* Character values known so far.  ctype_startup stores 0..255 in the
   first 256 slots; find_idx appends any value >= 256 it has not seen
   and uses the slot number as the index into the other growable tables.
   charnames_max is the allocated size, charnames_act the number used.  */
95 uint32_t *charnames;
96 size_t charnames_max;
97 size_t charnames_act;
/* Repertoire passed to the lookup helpers (repertoire_find_value etc.).  */
99 struct repertoire_t *repertoire;
101 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
102 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
/* Registered class names; a class's position here is its bit number.  */
103 size_t nr_charclass;
104 const char *classnames[MAX_NR_CHARCLASS];
105 uint32_t last_class_char;
/* Class bits per byte value (indexed by a single byte).  */
106 uint32_t class256_collection[256];
/* Class bits per slot of `charnames'; grown on demand by find_idx.  */
107 uint32_t *class_collection;
108 size_t class_collection_max;
109 size_t class_collection_act;
110 uint32_t class_done;
/* Input digits as multibyte sequences and as wide characters.  Both
   lists are handled in groups of ten by ctype_finish.  */
112 struct charseq **mbdigits;
113 size_t mbdigits_act;
114 size_t mbdigits_max;
115 uint32_t *wcdigits;
116 size_t wcdigits_act;
117 size_t wcdigits_max;
/* The ten output digits, multibyte and wide.  Slots still NULL / 0 when
   ctype_finish runs are replaced by '?'.  */
119 struct charseq *mboutdigits[10];
120 uint32_t wcoutdigits[10];
121 size_t outdigits_act;
123 /* If the following number ever turns out to be too small simply
124 increase it. But I doubt it will. --drepper@gnu */
125 #define MAX_NR_CHARMAP 16
/* Character maps ("toupper", "tolower", ...).  map_collection[n] is the
   growable table of map n, indexed like class_collection;
   map256_collection holds byte-indexed tables for the first two maps.  */
126 const char *mapnames[MAX_NR_CHARMAP];
127 uint32_t *map_collection[MAX_NR_CHARMAP];
128 uint32_t map256_collection[2][256];
129 size_t map_collection_max[MAX_NR_CHARMAP];
130 size_t map_collection_act[MAX_NR_CHARMAP];
131 size_t map_collection_nr;
132 size_t last_map_idx;
133 int tomap_done[MAX_NR_CHARMAP];
135 /* Transliteration information. */
136 const char *translit_copy_locale;
137 const char *translit_copy_repertoire;
138 struct translit_t *translit;
140 /* The arrays for the binary representation. */
/* These are the buffers and sizes ctype_output hands to the writer;
   NOTE(review): presumably filled in by allocate_arrays -- confirm.  */
141 uint32_t plane_size;
142 uint32_t plane_cnt;
143 char_class_t *ctype_b;
144 char_class32_t *ctype32_b;
145 uint32_t *names;
146 uint32_t **map;
147 uint32_t *class_name_ptr;
148 uint32_t *map_name_ptr;
149 unsigned char *width;
150 uint32_t mb_cur_max;
151 const char *codeset_name;
152 uint32_t translit_hash_size;
153 uint32_t translit_hash_layers;
154 uint32_t *translit_from_idx;
155 uint32_t *translit_from_tbl;
156 uint32_t *translit_to_idx;
157 uint32_t *translit_to_tbl;
158 size_t translit_idx_size;
159 size_t translit_from_tbl_size;
160 size_t translit_to_tbl_size;
/* Obstack initialised by ctype_startup.  */
162 struct obstack mem_pool;
166 #define obstack_chunk_alloc xmalloc
167 #define obstack_chunk_free free
170 /* Prototypes for local functions. */
/* Create and initialise the LC_CTYPE data of LOCALE.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           struct charmap_t *charmap, int ignore_content);
/* Register a new character class NAME.  */
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
/* Register a new character map NAME and allocate its table.  */
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, struct charmap_t *charmap);
/* Return the slot for character IDX in *TABLE, growing it if needed.
   The IDX parameter is spelled `uint32_t' to match the definition.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Symbolic names of the ten decimal digits, used as a fallback when the
   charmap does not define the digits under their one-character names.
   Both the strings and the pointer table are read-only.  */
static const char *const longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The one-character digit names, indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
196 static void
197 ctype_startup (struct linereader *lr, struct localedef_t *locale,
198 struct charmap_t *charmap, int ignore_content)
200 unsigned int cnt;
201 struct locale_ctype_t *ctype;
203 if (!ignore_content)
205 /* Allocate the needed room. */
206 locale->categories[LC_CTYPE].ctype = ctype =
207 (struct locale_ctype_t *) xcalloc (1, sizeof (struct locale_ctype_t));
209 /* We have seen no names yet. */
210 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
211 ctype->charnames =
212 (unsigned int *) xmalloc (ctype->charnames_max
213 * sizeof (unsigned int));
214 for (cnt = 0; cnt < 256; ++cnt)
215 ctype->charnames[cnt] = cnt;
216 ctype->charnames_act = 256;
218 /* Fill character class information. */
219 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
220 /* The order of the following instructions determines the bit
221 positions! */
222 ctype_class_new (lr, ctype, "upper");
223 ctype_class_new (lr, ctype, "lower");
224 ctype_class_new (lr, ctype, "alpha");
225 ctype_class_new (lr, ctype, "digit");
226 ctype_class_new (lr, ctype, "xdigit");
227 ctype_class_new (lr, ctype, "space");
228 ctype_class_new (lr, ctype, "print");
229 ctype_class_new (lr, ctype, "graph");
230 ctype_class_new (lr, ctype, "blank");
231 ctype_class_new (lr, ctype, "cntrl");
232 ctype_class_new (lr, ctype, "punct");
233 ctype_class_new (lr, ctype, "alnum");
234 /* The following are extensions from ISO 14652. */
235 ctype_class_new (lr, ctype, "left_to_right");
236 ctype_class_new (lr, ctype, "right_to_left");
237 ctype_class_new (lr, ctype, "num_terminator");
238 ctype_class_new (lr, ctype, "num_separator");
239 ctype_class_new (lr, ctype, "segment_separator");
240 ctype_class_new (lr, ctype, "block_separator");
241 ctype_class_new (lr, ctype, "direction_control");
242 ctype_class_new (lr, ctype, "sym_swap_layout");
243 ctype_class_new (lr, ctype, "char_shape_selector");
244 ctype_class_new (lr, ctype, "num_shape_selector");
245 ctype_class_new (lr, ctype, "non_spacing");
246 ctype_class_new (lr, ctype, "non_spacing_level3");
247 ctype_class_new (lr, ctype, "normal_connect");
248 ctype_class_new (lr, ctype, "r_connect");
249 ctype_class_new (lr, ctype, "no_connect");
250 ctype_class_new (lr, ctype, "no_connect-space");
251 ctype_class_new (lr, ctype, "vowel_connect");
253 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
254 ctype->class_collection
255 = (uint32_t *) xcalloc (sizeof (unsigned long int),
256 ctype->class_collection_max);
257 ctype->class_collection_act = 256;
259 /* Fill character map information. */
260 ctype->map_collection_nr = 0;
261 ctype->last_map_idx = MAX_NR_CHARMAP;
262 ctype_map_new (lr, ctype, "toupper", charmap);
263 ctype_map_new (lr, ctype, "tolower", charmap);
264 ctype_map_new (lr, ctype, "tosymmetric", charmap);
266 /* Fill first 256 entries in `toXXX' arrays. */
267 for (cnt = 0; cnt < 256; ++cnt)
269 ctype->map_collection[0][cnt] = cnt;
270 ctype->map_collection[1][cnt] = cnt;
271 ctype->map_collection[2][cnt] = cnt;
272 ctype->map256_collection[0][cnt] = cnt;
273 ctype->map256_collection[1][cnt] = cnt;
276 obstack_init (&ctype->mem_pool);
281 void
282 ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
284 /* See POSIX.2, table 2-6 for the meaning of the following table. */
285 #define NCLASS 12
286 static const struct
288 const char *name;
289 const char allow[NCLASS];
291 valid_table[NCLASS] =
293 /* The order is important. See token.h for more information.
294 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
295 { "upper", "--MX-XDDXXX-" },
296 { "lower", "--MX-XDDXXX-" },
297 { "alpha", "---X-XDDXXX-" },
298 { "digit", "XXX--XDDXXX-" },
299 { "xdigit", "-----XDDXXX-" },
300 { "space", "XXXXX------X" },
301 { "print", "---------X--" },
302 { "graph", "---------X--" },
303 { "blank", "XXXXXM-----X" },
304 { "cntrl", "XXXXX-XX--XX" },
305 { "punct", "XXXXX-DD-X-X" },
306 { "alnum", "-----XDDXXX-" }
308 size_t cnt;
309 int cls1, cls2;
310 uint32_t space_value;
311 struct charseq *space_seq;
312 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
313 int warned;
315 /* Now resolve copying and also handle completely missing definitions. */
316 if (ctype == NULL)
318 /* First see whether we were supposed to copy. If yes, find the
319 actual definition. */
320 if (locale->copy_name[LC_CTYPE] != NULL)
322 /* Find the copying locale. This has to happen transitively since
323 the locale we are copying from might also copying another one. */
324 struct localedef_t *from = locale;
327 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
328 from->repertoire_name, charmap);
329 while (from->categories[LC_CTYPE].ctype == NULL
330 && from->copy_name[LC_CTYPE] != NULL);
332 ctype = locale->categories[LC_CTYPE].ctype
333 = from->categories[LC_CTYPE].ctype;
336 /* If there is still no definition issue an warning and create an
337 empty one. */
338 if (ctype == NULL)
340 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
341 ctype_startup (NULL, locale, charmap, 0);
342 ctype = locale->categories[LC_CTYPE].ctype;
346 /* Set default value for classes not specified. */
347 set_class_defaults (ctype, charmap, ctype->repertoire);
349 /* Check according to table. */
350 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
352 uint32_t tmp = ctype->class_collection[cnt];
354 if (tmp != 0)
356 for (cls1 = 0; cls1 < NCLASS; ++cls1)
357 if ((tmp & _ISwbit (cls1)) != 0)
358 for (cls2 = 0; cls2 < NCLASS; ++cls2)
359 if (valid_table[cls1].allow[cls2] != '-')
361 int eq = (tmp & _ISwbit (cls2)) != 0;
362 switch (valid_table[cls1].allow[cls2])
364 case 'M':
365 if (!eq)
367 uint32_t value = ctype->charnames[cnt];
369 if (!be_quiet)
370 error (0, 0, _("\
371 character L'\\u%0*x' in class `%s' must be in class `%s'"),
372 value > 0xffff ? 8 : 4, value,
373 valid_table[cls1].name,
374 valid_table[cls2].name);
376 break;
378 case 'X':
379 if (eq)
381 uint32_t value = ctype->charnames[cnt];
383 if (!be_quiet)
384 error (0, 0, _("\
385 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
386 value > 0xffff ? 8 : 4, value,
387 valid_table[cls1].name,
388 valid_table[cls2].name);
390 break;
392 case 'D':
393 ctype->class_collection[cnt] |= _ISwbit (cls2);
394 break;
396 default:
397 error (5, 0, _("internal error in %s, line %u"),
398 __FUNCTION__, __LINE__);
404 for (cnt = 0; cnt < 256; ++cnt)
406 uint32_t tmp = ctype->class256_collection[cnt];
408 if (tmp != 0)
410 for (cls1 = 0; cls1 < NCLASS; ++cls1)
411 if ((tmp & _ISbit (cls1)) != 0)
412 for (cls2 = 0; cls2 < NCLASS; ++cls2)
413 if (valid_table[cls1].allow[cls2] != '-')
415 int eq = (tmp & _ISbit (cls2)) != 0;
416 switch (valid_table[cls1].allow[cls2])
418 case 'M':
419 if (!eq)
421 char buf[17];
423 sprintf (buf, "\\%o", cnt);
425 if (!be_quiet)
426 error (0, 0, _("\
427 character '%s' in class `%s' must be in class `%s'"),
428 buf, valid_table[cls1].name,
429 valid_table[cls2].name);
431 break;
433 case 'X':
434 if (eq)
436 char buf[17];
438 sprintf (buf, "\\%o", cnt);
440 if (!be_quiet)
441 error (0, 0, _("\
442 character '%s' in class `%s' must not be in class `%s'"),
443 buf, valid_table[cls1].name,
444 valid_table[cls2].name);
446 break;
448 case 'D':
449 ctype->class256_collection[cnt] |= _ISbit (cls2);
450 break;
452 default:
453 error (5, 0, _("internal error in %s, line %u"),
454 __FUNCTION__, __LINE__);
460 /* ... and now test <SP> as a special case. */
461 space_value = repertoire_find_value (ctype->repertoire, "SP", 2);
462 if (space_value == ILLEGAL_CHAR_VALUE)
464 if (!be_quiet)
465 error (0, 0, _("character <SP> not defined in character map"));
467 else if (((cnt = BITPOS (tok_space),
468 (ELEM (ctype, class_collection, , space_value)
469 & BITw (tok_space)) == 0)
470 || (cnt = BITPOS (tok_blank),
471 (ELEM (ctype, class_collection, , space_value)
472 & BITw (tok_blank)) == 0)))
474 if (!be_quiet)
475 error (0, 0, _("<SP> character not in class `%s'"),
476 valid_table[cnt].name);
478 else if (((cnt = BITPOS (tok_punct),
479 (ELEM (ctype, class_collection, , space_value)
480 & BITw (tok_punct)) != 0)
481 || (cnt = BITPOS (tok_graph),
482 (ELEM (ctype, class_collection, , space_value)
483 & BITw (tok_graph))
484 != 0)))
486 if (!be_quiet)
487 error (0, 0, _("<SP> character must not be in class `%s'"),
488 valid_table[cnt].name);
490 else
491 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
493 space_seq = charmap_find_value (charmap, "SP", 2);
494 if (space_seq == NULL || space_seq->nbytes != 1)
496 if (!be_quiet)
497 error (0, 0, _("character <SP> not defined in character map"));
499 else if (((cnt = BITPOS (tok_space),
500 (ctype->class256_collection[space_seq->bytes[0]]
501 & BIT (tok_space)) == 0)
502 || (cnt = BITPOS (tok_blank),
503 (ctype->class256_collection[space_seq->bytes[0]]
504 & BIT (tok_blank)) == 0)))
506 if (!be_quiet)
507 error (0, 0, _("<SP> character not in class `%s'"),
508 valid_table[cnt].name);
510 else if (((cnt = BITPOS (tok_punct),
511 (ctype->class256_collection[space_seq->bytes[0]]
512 & BIT (tok_punct)) != 0)
513 || (cnt = BITPOS (tok_graph),
514 (ctype->class256_collection[space_seq->bytes[0]]
515 & BIT (tok_graph)) != 0)))
517 if (!be_quiet)
518 error (0, 0, _("<SP> character must not be in class `%s'"),
519 valid_table[cnt].name);
521 else
522 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
524 /* Now that the tests are done make sure the name array contains all
525 characters which are handled in the WIDTH section of the
526 character set definition file. */
527 if (charmap->width_rules != NULL)
528 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
530 unsigned char bytes[charmap->mb_cur_max];
531 int nbytes = charmap->width_rules[cnt].from->nbytes;
533 /* We have the range of character for which the width is
534 specified described using byte sequences of the multibyte
535 charset. We have to convert this to UCS4 now. And we
536 cannot simply convert the beginning and the end of the
537 sequence, we have to iterate over the byte sequence and
538 convert it for every single character. */
539 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
541 while (nbytes < charmap->width_rules[cnt].to->nbytes
542 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
543 nbytes) <= 0)
545 /* Find the UCS value for `bytes'. */
546 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
547 nbytes);
548 int inner;
550 if (wch != ILLEGAL_CHAR_VALUE)
551 /* We are only interested in the side-effects of the
552 `find_idx' call. It will add appropriate entries in
553 the name array if this is necessary. */
554 (void) find_idx (ctype, NULL, NULL, NULL, wch);
556 /* "Increment" the bytes sequence. */
557 inner = nbytes - 1;
558 while (inner >= 0 && bytes[inner] == 0xff)
559 --inner;
561 if (inner < 0)
563 /* We have to extend the byte sequence. */
564 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
565 break;
567 bytes[0] = 1;
568 memset (&bytes[1], 0, nbytes);
569 ++nbytes;
571 else
573 ++bytes[inner];
574 while (++inner < nbytes)
575 bytes[inner] = 0;
580 /* There must be a multiple of 10 digits. */
581 if (ctype->mbdigits_act % 10 != 0)
583 assert (ctype->mbdigits_act == ctype->wcdigits_act);
584 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
585 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
586 error (0, 0, _("`digit' category has not entries in groups of ten"));
589 /* Check the input digits. There must be a multiple of ten available.
590 In each group it could be that one or the other character is missing.
591 In this case the whole group must be removed. */
592 cnt = 0;
593 while (cnt < ctype->mbdigits_act)
595 size_t inner;
596 for (inner = 0; inner < 10; ++inner)
597 if (ctype->mbdigits[cnt + inner] == NULL)
598 break;
600 if (inner == 10)
601 cnt += 10;
602 else
604 /* Remove the group. */
605 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
606 ((ctype->wcdigits_act - cnt - 10)
607 * sizeof (ctype->mbdigits[0])));
608 ctype->mbdigits_act -= 10;
612 /* If no input digits are given use the default. */
613 if (ctype->mbdigits_act == 0)
615 if (ctype->mbdigits_max == 0)
617 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
618 10 * sizeof (struct charseq *));
619 ctype->mbdigits_max = 10;
622 for (cnt = 0; cnt < 10; ++cnt)
624 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
625 digits + cnt, 1);
626 if (ctype->mbdigits[cnt] == NULL)
628 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
629 longnames[cnt],
630 strlen (longnames[cnt]));
631 if (ctype->mbdigits[cnt] == NULL)
633 /* Hum, this ain't good. */
634 error (0, 0, _("\
635 no input digits defined and none of the standard names in the charmap"));
637 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
638 sizeof (struct charseq) + 1);
640 /* This is better than nothing. */
641 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
642 ctype->mbdigits[cnt]->nbytes = 1;
647 ctype->mbdigits_act = 10;
650 /* Check the wide character input digits. There must be a multiple
651 of ten available. In each group it could be that one or the other
652 character is missing. In this case the whole group must be
653 removed. */
654 cnt = 0;
655 while (cnt < ctype->wcdigits_act)
657 size_t inner;
658 for (inner = 0; inner < 10; ++inner)
659 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
660 break;
662 if (inner == 10)
663 cnt += 10;
664 else
666 /* Remove the group. */
667 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
668 ((ctype->wcdigits_act - cnt - 10)
669 * sizeof (ctype->wcdigits[0])));
670 ctype->wcdigits_act -= 10;
674 /* If no input digits are given use the default. */
675 if (ctype->wcdigits_act == 0)
677 if (ctype->wcdigits_max == 0)
679 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
680 10 * sizeof (uint32_t));
681 ctype->wcdigits_max = 10;
684 for (cnt = 0; cnt < 10; ++cnt)
685 ctype->wcdigits[cnt] = L'0' + cnt;
687 ctype->mbdigits_act = 10;
690 /* Check the outdigits. */
691 warned = 0;
692 for (cnt = 0; cnt < 10; ++cnt)
693 if (ctype->mboutdigits[cnt] == NULL)
695 static struct charseq replace[2];
697 if (!warned)
699 error (0, 0, _("\
700 not all characters used in `outdigit' are available in the charmap"));
701 warned = 1;
704 replace[0].nbytes = 1;
705 replace[0].bytes[0] = '?';
706 replace[0].bytes[1] = '\0';
707 ctype->mboutdigits[cnt] = &replace[0];
710 warned = 0;
711 for (cnt = 0; cnt < 10; ++cnt)
712 if (ctype->wcoutdigits[cnt] == 0)
714 if (!warned)
716 error (0, 0, _("\
717 not all characters used in `outdigit' are available in the repertoire"));
718 warned = 1;
721 ctype->wcoutdigits[cnt] = L'?';
726 void
727 ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
728 const char *output_path)
730 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
731 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
732 + (ctype->map_collection_nr - 2));
733 struct iovec iov[2 + nelems + ctype->nr_charclass
734 + ctype->map_collection_nr];
735 struct locale_file data;
736 uint32_t idx[nelems + 1];
737 size_t elem, cnt, offset, total;
738 char *cp;
740 /* Now prepare the output: Find the sizes of the table we can use. */
741 allocate_arrays (ctype, charmap, ctype->repertoire);
743 data.magic = LIMAGIC (LC_CTYPE);
744 data.n = nelems;
745 iov[0].iov_base = (void *) &data;
746 iov[0].iov_len = sizeof (data);
748 iov[1].iov_base = (void *) idx;
749 iov[1].iov_len = sizeof (idx);
751 idx[0] = iov[0].iov_len + iov[1].iov_len;
752 offset = 0;
754 for (elem = 0; elem < nelems; ++elem)
756 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
757 switch (elem)
759 #define CTYPE_DATA(name, base, len) \
760 case _NL_ITEM_INDEX (name): \
761 iov[2 + elem + offset].iov_base = (base); \
762 iov[2 + elem + offset].iov_len = (len); \
763 if (elem + 1 < nelems) \
764 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
765 break
767 CTYPE_DATA (_NL_CTYPE_CLASS,
768 ctype->ctype_b,
769 (256 + 128) * sizeof (char_class_t));
771 CTYPE_DATA (_NL_CTYPE_TOUPPER,
772 ctype->map[0],
773 (ctype->plane_size * ctype->plane_cnt + 128)
774 * sizeof (uint32_t));
775 CTYPE_DATA (_NL_CTYPE_TOLOWER,
776 ctype->map[1],
777 (ctype->plane_size * ctype->plane_cnt + 128)
778 * sizeof (uint32_t));
780 CTYPE_DATA (_NL_CTYPE_CLASS32,
781 ctype->ctype32_b,
782 (ctype->plane_size * ctype->plane_cnt
783 * sizeof (char_class32_t)));
785 CTYPE_DATA (_NL_CTYPE_NAMES,
786 ctype->names, (ctype->plane_size * ctype->plane_cnt
787 * sizeof (uint32_t)));
789 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE,
790 &ctype->translit_hash_size, sizeof (uint32_t));
791 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS,
792 &ctype->translit_hash_layers, sizeof (uint32_t));
794 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
795 ctype->translit_from_idx,
796 ctype->translit_idx_size);
798 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
799 ctype->translit_from_tbl,
800 ctype->translit_from_tbl_size);
802 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
803 ctype->translit_to_idx,
804 ctype->translit_idx_size);
806 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
807 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
809 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
810 &ctype->plane_size, sizeof (uint32_t));
811 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
812 &ctype->plane_cnt, sizeof (uint32_t));
814 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
815 /* The class name array. */
816 total = 0;
817 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
819 iov[2 + elem + offset].iov_base
820 = (void *) ctype->classnames[cnt];
821 iov[2 + elem + offset].iov_len
822 = strlen (ctype->classnames[cnt]) + 1;
823 total += iov[2 + elem + offset].iov_len;
825 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
826 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
827 total += 1 + (4 - ((total + 1) % 4));
829 idx[elem + 1] = idx[elem] + total;
830 break;
832 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
833 /* The class name array. */
834 total = 0;
835 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
837 iov[2 + elem + offset].iov_base
838 = (void *) ctype->mapnames[cnt];
839 iov[2 + elem + offset].iov_len
840 = strlen (ctype->mapnames[cnt]) + 1;
841 total += iov[2 + elem + offset].iov_len;
843 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
844 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
845 total += 1 + (4 - ((total + 1) % 4));
847 idx[elem + 1] = idx[elem] + total;
848 break;
850 CTYPE_DATA (_NL_CTYPE_WIDTH,
851 ctype->width, ctype->plane_size * ctype->plane_cnt);
853 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
854 &ctype->mb_cur_max, sizeof (uint32_t));
856 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
857 total = strlen (ctype->codeset_name) + 1;
858 if (total % 4 == 0)
859 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
860 else
862 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
863 memset (mempcpy (iov[2 + elem + offset].iov_base,
864 ctype->codeset_name, total),
865 '\0', 4 - (total & 3));
866 total = (total + 3) & ~3;
868 iov[2 + elem + offset].iov_len = total;
869 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
870 break;
872 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
873 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
874 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
875 *(uint32_t *) iov[2 + elem + offset].iov_base =
876 ctype->mbdigits_act / 10;
877 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
878 break;
880 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
881 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
882 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
883 *(uint32_t *) iov[2 + elem + offset].iov_base =
884 ctype->wcdigits_act / 10;
885 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
886 break;
888 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
889 /* Compute the length of all possible characters. For INDIGITS
890 there might be more than one. We simply concatenate all of
891 them with a NUL byte following. The NUL byte wouldn't be
892 necessary but it makes it easier for the user. */
893 total = 0;
894 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
895 cnt < ctype->mbdigits_act; cnt += 10)
896 total += ctype->mbdigits[cnt]->nbytes + 1;
897 iov[2 + elem + offset].iov_base = (char *) alloca (total);
898 iov[2 + elem + offset].iov_len = total;
900 cp = iov[2 + elem + offset].iov_base;
901 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
902 cnt < ctype->mbdigits_act; cnt += 10)
904 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
905 ctype->mbdigits[cnt]->nbytes);
906 *cp++ = '\0';
908 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
909 break;
911 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
912 /* Compute the length of all possible characters. For INDIGITS
913 there might be more than one. We simply concatenate all of
914 them with a NUL byte following. The NUL byte wouldn't be
915 necessary but it makes it easier for the user. */
916 cnt = elem - _NL_CTYPE_OUTDIGIT0_MB;
917 total = ctype->mboutdigits[cnt]->nbytes + 1;
918 iov[2 + elem + offset].iov_base = (char *) alloca (total);
919 iov[2 + elem + offset].iov_len = total;
921 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
922 ctype->mbdigits[cnt]->bytes,
923 ctype->mbdigits[cnt]->nbytes) = '\0';
924 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
925 break;
927 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
928 total = ctype->wcdigits_act / 10;
930 iov[2 + elem + offset].iov_base =
931 (uint32_t *) alloca (total * sizeof (uint32_t));
932 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
934 for (cnt = elem - _NL_CTYPE_INDIGITS0_WC;
935 cnt < ctype->wcdigits_act; cnt += 10)
936 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
937 = ctype->wcdigits[cnt];
938 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
939 break;
941 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
942 cnt = elem - _NL_CTYPE_OUTDIGIT0_WC;
943 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
944 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
945 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
946 break;
948 default:
949 assert (! "unknown CTYPE element");
951 else
953 /* Handle extra maps. */
954 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) + 2;
956 iov[2 + elem + offset].iov_base = ctype->map[nr];
957 iov[2 + elem + offset].iov_len = ((ctype->plane_size
958 * ctype->plane_cnt + 128)
959 * sizeof (uint32_t));
961 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
965 assert (2 + elem + offset == (nelems + ctype->nr_charclass
966 + ctype->map_collection_nr + 2));
968 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
972 /* Local functions. */
973 static void
974 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
975 const char *name)
977 size_t cnt;
979 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
980 if (strcmp (ctype->classnames[cnt], name) == 0)
981 break;
983 if (cnt < ctype->nr_charclass)
985 lr_error (lr, _("character class `%s' already defined"), name);
986 return;
989 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
990 /* Exit code 2 is prescribed in P1003.2b. */
991 error (2, 0, _("\
992 implementation limit: no more than %d character classes allowed"),
993 MAX_NR_CHARCLASS);
995 ctype->classnames[ctype->nr_charclass++] = name;
999 static void
1000 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1001 const char *name, struct charmap_t *charmap)
1003 size_t max_chars = 0;
1004 size_t cnt;
1006 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1008 if (strcmp (ctype->mapnames[cnt], name) == 0)
1009 break;
1011 if (max_chars < ctype->map_collection_max[cnt])
1012 max_chars = ctype->map_collection_max[cnt];
1015 if (cnt < ctype->map_collection_nr)
1017 lr_error (lr, _("character map `%s' already defined"), name);
1018 return;
1021 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1022 /* Exit code 2 is prescribed in P1003.2b. */
1023 error (2, 0, _("\
1024 implementation limit: no more than %d character maps allowed"),
1025 MAX_NR_CHARMAP);
1027 ctype->mapnames[cnt] = name;
1029 if (max_chars == 0)
1030 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1031 else
1032 ctype->map_collection_max[cnt] = max_chars;
1034 ctype->map_collection[cnt] = (uint32_t *)
1035 xmalloc (sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1036 memset (ctype->map_collection[cnt], '\0',
1037 sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1038 ctype->map_collection_act[cnt] = 256;
1040 ++ctype->map_collection_nr;
1044 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1045 is possible if we only want to extend the name array. */
1046 static uint32_t *
1047 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1048 size_t *act, uint32_t idx)
1050 size_t cnt;
1052 if (idx < 256)
1053 return table == NULL ? NULL : &(*table)[idx];
1055 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1056 if (ctype->charnames[cnt] == idx)
1057 break;
1059 /* We have to distinguish two cases: the name is found or not. */
1060 if (cnt == ctype->charnames_act)
1062 /* Extend the name array. */
1063 if (ctype->charnames_act == ctype->charnames_max)
1065 ctype->charnames_max *= 2;
1066 ctype->charnames = (unsigned int *)
1067 xrealloc (ctype->charnames,
1068 sizeof (unsigned int) * ctype->charnames_max);
1070 ctype->charnames[ctype->charnames_act++] = idx;
1073 if (table == NULL)
1074 /* We have done everything we are asked to do. */
1075 return NULL;
1077 if (cnt >= *act)
1079 if (cnt >= *max)
1081 size_t old_max = *max;
1083 *max *= 2;
1084 while (*max <= cnt);
1086 *table =
1087 (uint32_t *) xrealloc (*table, *max * sizeof (unsigned long int));
1088 memset (&(*table)[old_max], '\0',
1089 (*max - old_max) * sizeof (uint32_t));
1092 *act = cnt;
1095 return &(*table)[cnt];
1099 static int
1100 get_character (struct token *now, struct charmap_t *charmap,
1101 struct repertoire_t *repertoire,
1102 struct charseq **seqp, uint32_t *wchp)
1104 if (now->tok == tok_bsymbol)
1106 /* This will hopefully be the normal case. */
1107 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1108 now->val.str.lenmb);
1109 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1110 now->val.str.lenmb);
1112 else if (now->tok == tok_ucs4)
1114 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1116 if (*seqp == NULL)
1118 /* Compute the value in the charmap from the UCS value. */
1119 const char *symbol = repertoire_find_symbol (repertoire,
1120 now->val.ucs4);
1122 if (symbol == NULL)
1123 *seqp = NULL;
1124 else
1125 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1127 if (*seqp == NULL)
1129 /* Insert a negative entry. */
1130 static const struct charseq negative
1131 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1132 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1133 *newp = now->val.ucs4;
1135 insert_entry (&repertoire->seq_table, newp, 4,
1136 (void *) &negative);
1138 else
1139 (*seqp)->ucs4 = now->val.ucs4;
1141 else if ((*seqp)->ucs4 != now->val.ucs4)
1142 *seqp = NULL;
1144 *wchp = now->val.ucs4;
1146 else if (now->tok == tok_charcode)
1148 /* We must map from the byte code to UCS4. */
1149 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1150 now->val.str.lenmb);
1152 if (*seqp == NULL)
1153 *wchp = ILLEGAL_CHAR_VALUE;
1154 else
1156 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1157 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1158 strlen ((*seqp)->name));
1159 *wchp = (*seqp)->ucs4;
1162 else
1163 return 1;
1165 return 0;
1169 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1170 static void
1171 charclass_symbolic_ellipsis (struct linereader *ldfile,
1172 struct locale_ctype_t *ctype,
1173 struct charmap_t *charmap,
1174 struct repertoire_t *repertoire,
1175 struct token *now,
1176 const char *last_str,
1177 unsigned long int class256_bit,
1178 unsigned long int class_bit, int base,
1179 int ignore_content, int handle_digits)
1181 const char *nowstr = now->val.str.startmb;
1182 char tmp[now->val.str.lenmb + 1];
1183 const char *cp;
1184 char *endp;
1185 unsigned long int from;
1186 unsigned long int to;
1188 /* We have to compute the ellipsis values using the symbolic names. */
1189 assert (last_str != NULL);
1191 if (strlen (last_str) != now->val.str.lenmb)
1193 invalid_range:
1194 lr_error (ldfile,
1195 _("`%s' and `%.*s' are no valid names for symbolic range"),
1196 last_str, now->val.str.lenmb, nowstr);
1197 return;
1200 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1201 /* Nothing to do, the names are the same. */
1202 return;
1204 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1207 errno = 0;
1208 from = strtoul (cp, &endp, base);
1209 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1210 goto invalid_range;
1212 to = strtoul (nowstr + (cp - last_str), &endp, base);
1213 if ((to == UINT_MAX && errno == ERANGE)
1214 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1215 goto invalid_range;
1217 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1218 if (!ignore_content)
1220 now->val.str.startmb = tmp;
1221 while (++from <= to)
1223 struct charseq *seq;
1224 uint32_t wch;
1226 sprintf (tmp, (base == 10 ? "%.*s%0*d" : "%.*s%0*X"), cp - last_str,
1227 last_str, now->val.str.lenmb - (cp - last_str), from);
1229 get_character (now, charmap, repertoire, &seq, &wch);
1231 if (seq != NULL && seq->nbytes == 1)
1232 /* Yep, we can store information about this byte sequence. */
1233 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1235 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1236 /* We have the UCS4 position. */
1237 *find_idx (ctype, &ctype->class_collection,
1238 &ctype->class_collection_max,
1239 &ctype->class_collection_act, wch) |= class_bit;
1241 if (handle_digits == 1)
1243 /* We must store the digit values. */
1244 if (ctype->mbdigits_act == ctype->mbdigits_max)
1246 ctype->mbdigits_max *= 2;
1247 ctype->mbdigits = xrealloc (ctype->mbdigits,
1248 (ctype->mbdigits_max
1249 * sizeof (char *)));
1250 ctype->wcdigits_max *= 2;
1251 ctype->wcdigits = xrealloc (ctype->wcdigits,
1252 (ctype->wcdigits_max
1253 * sizeof (uint32_t)));
1256 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1257 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1259 else if (handle_digits == 2)
1261 /* We must store the digit values. */
1262 if (ctype->outdigits_act >= 10)
1264 lr_error (ldfile, _("\
1265 %s: field `%s' does not contain exactly ten entries"),
1266 "LC_CTYPE", "outdigit");
1267 return;
1270 ctype->mboutdigits[ctype->outdigits_act] = seq;
1271 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1272 ++ctype->outdigits_act;
1279 /* Ellipsis like in `<U1234>..<U2345>'. */
1280 static void
1281 charclass_ucs4_ellipsis (struct linereader *ldfile,
1282 struct locale_ctype_t *ctype,
1283 struct charmap_t *charmap,
1284 struct repertoire_t *repertoire,
1285 struct token *now, uint32_t last_wch,
1286 unsigned long int class256_bit,
1287 unsigned long int class_bit, int ignore_content,
1288 int handle_digits)
1290 if (last_wch > now->val.ucs4)
1292 lr_error (ldfile, _("\
1293 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1294 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1295 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1296 return;
1299 if (!ignore_content)
1300 while (++last_wch <= now->val.ucs4)
1302 /* We have to find out whether there is a byte sequence corresponding
1303 to this UCS4 value. */
1304 struct charseq *seq = repertoire_find_seq (repertoire, last_wch);
1306 /* If this is the first time we look for this sequence create a new
1307 entry. */
1308 if (seq == NULL)
1310 /* Find the symbolic name for this UCS4 value. */
1311 const char *symbol = repertoire_find_symbol (repertoire, last_wch);
1312 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1313 *newp = last_wch;
1315 if (symbol != NULL)
1316 /* We have a name, now search the multibyte value. */
1317 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1319 if (seq == NULL)
1321 /* We have to create a fake entry. */
1322 static const struct charseq negative
1323 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1324 seq = (struct charseq *) &negative;
1326 else
1327 seq->ucs4 = last_wch;
1329 insert_entry (&repertoire->seq_table, newp, 4, seq);
1332 /* We have a name, now search the multibyte value. */
1333 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1334 /* Yep, we can store information about this byte sequence. */
1335 ctype->class256_collection[(size_t) seq->bytes[0]]
1336 |= class256_bit;
1338 /* And of course we have the UCS4 position. */
1339 if (class_bit != 0 && class_bit != 0)
1340 *find_idx (ctype, &ctype->class_collection,
1341 &ctype->class_collection_max,
1342 &ctype->class_collection_act, last_wch) |= class_bit;
1344 if (handle_digits == 1)
1346 /* We must store the digit values. */
1347 if (ctype->mbdigits_act == ctype->mbdigits_max)
1349 ctype->mbdigits_max *= 2;
1350 ctype->mbdigits = xrealloc (ctype->mbdigits,
1351 (ctype->mbdigits_max
1352 * sizeof (char *)));
1353 ctype->wcdigits_max *= 2;
1354 ctype->wcdigits = xrealloc (ctype->wcdigits,
1355 (ctype->wcdigits_max
1356 * sizeof (uint32_t)));
1359 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1360 ? seq : NULL);
1361 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1363 else if (handle_digits == 2)
1365 /* We must store the digit values. */
1366 if (ctype->outdigits_act >= 10)
1368 lr_error (ldfile, _("\
1369 %s: field `%s' does not contain exactly ten entries"),
1370 "LC_CTYPE", "outdigit");
1371 return;
1374 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1375 ? seq : NULL);
1376 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1377 ++ctype->outdigits_act;
1383 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1384 static void
1385 charclass_charcode_ellipsis (struct linereader *ldfile,
1386 struct locale_ctype_t *ctype,
1387 struct charmap_t *charmap,
1388 struct repertoire_t *repertoire,
1389 struct token *now, char *last_charcode,
1390 uint32_t last_charcode_len,
1391 unsigned long int class256_bit,
1392 unsigned long int class_bit, int ignore_content,
1393 int handle_digits)
1395 /* First check whether the to-value is larger. */
1396 if (now->val.charcode.nbytes != last_charcode_len)
1398 lr_error (ldfile, _("\
1399 start end end character sequence of range must have the same length"));
1400 return;
1403 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1405 lr_error (ldfile, _("\
1406 to-value character sequence is smaller than from-value sequence"));
1407 return;
1410 if (!ignore_content)
1414 /* Increment the byte sequence value. */
1415 struct charseq *seq;
1416 uint32_t wch;
1417 int i;
1419 for (i = last_charcode_len - 1; i >= 0; --i)
1420 if (++last_charcode[i] != 0)
1421 break;
1423 if (last_charcode_len == 1)
1424 /* Of course we have the charcode value. */
1425 ctype->class256_collection[(size_t) last_charcode[0]]
1426 |= class256_bit;
1428 /* Find the symbolic name. */
1429 seq = charmap_find_symbol (charmap, last_charcode,
1430 last_charcode_len);
1431 if (seq != NULL)
1433 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1434 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1435 strlen (seq->name));
1436 wch = seq->ucs4;
1438 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1439 *find_idx (ctype, &ctype->class_collection,
1440 &ctype->class_collection_max,
1441 &ctype->class_collection_act, wch) |= class_bit;
1443 else
1444 wch = ILLEGAL_CHAR_VALUE;
1446 if (handle_digits == 1)
1448 /* We must store the digit values. */
1449 if (ctype->mbdigits_act == ctype->mbdigits_max)
1451 ctype->mbdigits_max *= 2;
1452 ctype->mbdigits = xrealloc (ctype->mbdigits,
1453 (ctype->mbdigits_max
1454 * sizeof (char *)));
1455 ctype->wcdigits_max *= 2;
1456 ctype->wcdigits = xrealloc (ctype->wcdigits,
1457 (ctype->wcdigits_max
1458 * sizeof (uint32_t)));
1461 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1462 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1463 seq->nbytes = last_charcode_len;
1465 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1466 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1468 else if (handle_digits == 2)
1470 struct charseq *seq;
1471 /* We must store the digit values. */
1472 if (ctype->outdigits_act >= 10)
1474 lr_error (ldfile, _("\
1475 %s: field `%s' does not contain exactly ten entries"),
1476 "LC_CTYPE", "outdigit");
1477 return;
1480 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1481 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1482 seq->nbytes = last_charcode_len;
1484 ctype->mboutdigits[ctype->outdigits_act] = seq;
1485 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1486 ++ctype->outdigits_act;
1489 while (memcmp (last_charcode, now->val.charcode.bytes,
1490 last_charcode_len) != 0);
1495 /* Read one transliteration entry. */
1496 static uint32_t *
1497 read_widestring (struct linereader *ldfile, struct token *now,
1498 struct charmap_t *charmap, struct repertoire_t *repertoire)
1500 uint32_t *wstr;
1502 if (now->tok == tok_default_missing)
1503 /* The special name "" will denote this case. */
1504 wstr = (uint32_t *) L"";
1505 else if (now->tok == tok_bsymbol)
1507 /* Get the value from the repertoire. */
1508 wstr = xmalloc (2 * sizeof (uint32_t));
1509 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1510 now->val.str.lenmb);
1511 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1512 /* We cannot proceed, we don't know the UCS4 value. */
1513 return NULL;
1515 wstr[1] = 0;
1517 else if (now->tok == tok_ucs4)
1519 wstr = xmalloc (2 * sizeof (uint32_t));
1520 wstr[0] = now->val.ucs4;
1521 wstr[1] = 0;
1523 else if (now->tok == tok_charcode)
1525 /* Argh, we have to convert to the symbol name first and then to the
1526 UCS4 value. */
1527 struct charseq *seq = charmap_find_symbol (charmap,
1528 now->val.str.startmb,
1529 now->val.str.lenmb);
1530 if (seq == NULL)
1531 /* Cannot find the UCS4 value. */
1532 return NULL;
1534 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1535 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1536 strlen (seq->name));
1537 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1538 /* We cannot proceed, we don't know the UCS4 value. */
1539 return NULL;
1541 wstr = xmalloc (2 * sizeof (uint32_t));
1542 wstr[0] = seq->ucs4;
1543 wstr[1] = 0;
1545 else if (now->tok == tok_string)
1547 wstr = now->val.str.startwc;
1548 if (wstr[0] == 0)
1549 return NULL;
1551 else
1553 if (now->tok != tok_eol && now->tok != tok_eof)
1554 lr_ignore_rest (ldfile, 0);
1555 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1556 return (uint32_t *) -1l;
1559 return wstr;
1563 static void
1564 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1565 struct token *now, struct charmap_t *charmap,
1566 struct repertoire_t *repertoire)
1568 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1569 struct translit_t *result;
1570 struct translit_to_t **top;
1571 struct obstack *ob = &ctype->mem_pool;
1572 int first;
1573 int ignore;
1575 if (from_wstr == NULL)
1576 /* There is no valid from string. */
1577 return;
1579 result = (struct translit_t *) obstack_alloc (ob,
1580 sizeof (struct translit_t));
1581 result->from = from_wstr;
1582 result->next = NULL;
1583 result->to = NULL;
1584 top = &result->to;
1585 first = 1;
1586 ignore = 0;
1588 while (1)
1590 uint32_t *to_wstr;
1592 /* Next we have one or more transliterations. They are
1593 separated by semicolons. */
1594 now = lr_token (ldfile, charmap, repertoire);
1596 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1598 /* One string read. */
1599 const uint32_t zero = 0;
1601 if (!ignore)
1603 obstack_grow (ob, &zero, 4);
1604 to_wstr = obstack_finish (ob);
1606 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1607 (*top)->str = to_wstr;
1608 (*top)->next = NULL;
1611 if (now->tok == tok_eol)
1613 result->next = ctype->translit;
1614 ctype->translit = result;
1615 return;
1618 if (!ignore)
1619 top = &(*top)->next;
1620 ignore = 0;
1622 else
1624 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1625 if (to_wstr == (uint32_t *) -1l)
1627 /* An error occurred. */
1628 obstack_free (ob, result);
1629 return;
1632 if (to_wstr == NULL)
1633 ignore = 1;
1634 else
1635 /* This value is usable. */
1636 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1638 first = 0;
1644 /* The parser for the LC_CTYPE section of the locale definition. */
/* Overview:
   - When REPERTOIRE_NAME is not NULL the repertoire is loaded with
     repertoire_read.
   - The rest of the `LC_CTYPE' line and following empty lines are skipped.
   - A leading `copy' statement is handed to handle_copy, which ends the
     parse.
   - Otherwise ctype_startup prepares RESULT->categories[LC_CTYPE].ctype
     and every following line is dispatched on its first token until the
     `END' line is seen.
   - For most keywords a nonzero IGNORE_CONTENT makes the parser skip the
     line with lr_ignore_rest instead of recording anything.
   - Leaving the main loop at end of file is reported as
     "premature end of file". */
1645 void
1646 ctype_read (struct linereader *ldfile, struct localedef_t *result,
1647 struct charmap_t *charmap, const char *repertoire_name,
1648 int ignore_content)
1650 struct repertoire_t *repertoire = NULL;
1651 struct locale_ctype_t *ctype;
1652 struct token *now;
1653 enum token_t nowtok;
1654 size_t cnt;
1655 struct charseq *last_seq;
1656 uint32_t last_wch = 0;
1657 enum token_t last_token;
1658 enum token_t ellipsis_token;
1659 char last_charcode[16];
1660 size_t last_charcode_len = 0;
1661 const char *last_str = NULL;
1662 int mapidx;
1664 /* Get the repertoire we have to use. */
1665 if (repertoire_name != NULL)
1666 repertoire = repertoire_read (repertoire_name);
1668 /* The rest of the line containing `LC_CTYPE' must be free. */
1669 lr_ignore_rest (ldfile, 1);
/* Fetch tokens until one is not an end-of-line token. */
1674 now = lr_token (ldfile, charmap, NULL);
1675 nowtok = now->tok;
1677 while (nowtok == tok_eol);
1679 /* If we see `copy' now we are almost done. */
1680 if (nowtok == tok_copy)
1682 handle_copy (ldfile, charmap, repertoire, result, tok_lc_ctype, LC_CTYPE,
1683 "LC_CTYPE", ignore_content);
1684 return;
1687 /* Prepare the data structures. */
1688 ctype_startup (ldfile, result, charmap, ignore_content);
1689 ctype = result->categories[LC_CTYPE].ctype;
1691 /* Remember the repertoire we use. */
1692 if (!ignore_content)
1693 ctype->repertoire = repertoire;
/* Main loop: one iteration per statement; NOWTOK is its first token. */
1695 while (1)
/* Per-statement state, reset at the start of every iteration. */
1697 unsigned long int class_bit = 0;
1698 unsigned long int class256_bit = 0;
1699 int handle_digits = 0;
1701 /* Of course we don't proceed beyond the end of file. */
1702 if (nowtok == tok_eof)
1703 break;
1705 /* Ignore empty lines. */
1706 if (nowtok == tok_eol)
1708 now = lr_token (ldfile, charmap, NULL);
1709 nowtok = now->tok;
1710 continue;
1713 switch (nowtok)
/* `charclass': register each listed name with ctype_class_new; names are
   separated by semicolons and the list must end at the end of the line. */
1715 case tok_charclass:
1716 now = lr_token (ldfile, charmap, NULL);
1717 while (now->tok == tok_ident || now->tok == tok_string)
1719 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1720 now = lr_token (ldfile, charmap, NULL);
1721 if (now->tok != tok_semicolon)
1722 break;
1723 now = lr_token (ldfile, charmap, NULL);
1725 if (now->tok != tok_eol)
1726 SYNTAX_ERROR (_("\
1727 %s: syntax error in definition of new character class"), "LC_CTYPE");
1728 break;
/* `charconv': same shape as `charclass', but the names are registered
   with ctype_map_new. */
1730 case tok_charconv:
1731 now = lr_token (ldfile, charmap, NULL);
1732 while (now->tok == tok_ident || now->tok == tok_string)
1734 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
1735 now = lr_token (ldfile, charmap, NULL);
1736 if (now->tok != tok_semicolon)
1737 break;
1738 now = lr_token (ldfile, charmap, NULL);
1740 if (now->tok != tok_eol)
1741 SYNTAX_ERROR (_("\
1742 %s: syntax error in definition of new character map"), "LC_CTYPE");
1743 break;
1745 case tok_class:
1746 /* Ignore the rest of the line if we don't need the input of
1747 this line. */
1748 if (ignore_content)
1750 lr_ignore_rest (ldfile, 0);
1751 break;
1754 /* We simply forget the `class' keyword and use the following
1755 operand to determine the bit. */
1756 now = lr_token (ldfile, charmap, NULL);
1757 if (now->tok == tok_ident || now->tok == tok_string)
1759 /* Must be one of the predefined class names. */
1760 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1761 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
1762 break;
1763 if (cnt >= ctype->nr_charclass)
1765 if (now->val.str.lenmb == 8
1766 && memcmp ("special1", now->val.str.startmb, 8) == 0)
1767 class_bit = _ISwspecial1;
1768 else if (now->val.str.lenmb == 8
1769 && memcmp ("special2", now->val.str.startmb, 8) == 0)
1770 class_bit = _ISwspecial2;
1771 else if (now->val.str.lenmb == 8
1772 && memcmp ("special3", now->val.str.startmb, 8) == 0)
1773 class_bit = _ISwspecial3;
1774 else
1776 lr_error (ldfile, _("\
1777 unknown character class `%s' in category `LC_CTYPE'"),
1778 now->val.str.startmb);
1779 free (now->val.str.startmb);
1781 lr_ignore_rest (ldfile, 0);
/* NOTE(review): this `continue' restarts the main loop without fetching a
   new token, so NOWTOK is still tok_class and the first token of the next
   line is then read as a class name -- confirm whether `break' (which
   reaches the "Prepare for the next round" code) was intended. */
1782 continue;
1785 else
1786 class_bit = _ISwbit (cnt);
1788 free (now->val.str.startmb);
1790 else if (now->tok == tok_digit)
1791 goto handle_tok_digit;
1792 else if (now->tok < tok_upper || now->tok > tok_blank)
1793 goto err_label;
1794 else
1796 class_bit = BITw (now->tok);
1797 class256_bit = BIT (now->tok);
1800 /* The next character must be a semicolon. */
1801 now = lr_token (ldfile, charmap, NULL);
1802 if (now->tok != tok_semicolon)
1803 goto err_label;
1804 goto read_charclass;
1806 case tok_upper:
1807 case tok_lower:
1808 case tok_alpha:
1809 case tok_alnum:
1810 case tok_space:
1811 case tok_cntrl:
1812 case tok_punct:
1813 case tok_graph:
1814 case tok_print:
1815 case tok_xdigit:
1816 case tok_blank:
1817 /* Ignore the rest of the line if we don't need the input of
1818 this line. */
1819 if (ignore_content)
1821 lr_ignore_rest (ldfile, 0);
1822 break;
1825 class_bit = BITw (now->tok);
1826 class256_bit = BIT (now->tok);
1827 handle_digits = 0;
/* Shared tail of all class-like keywords (also entered by goto from the
   `class', `digit', `outdigit' and identifier cases): read the character
   list of the line, with CLASS_BIT, CLASS256_BIT and HANDLE_DIGITS already
   set by the entry point. */
1828 read_charclass:
1829 ctype->class_done |= class_bit;
1830 last_token = tok_none;
1831 ellipsis_token = tok_none;
1832 now = lr_token (ldfile, charmap, NULL);
1833 while (now->tok != tok_eol && now->tok != tok_eof)
1835 uint32_t wch;
1836 struct charseq *seq;
1838 if (now->tok != tok_bsymbol)
1839 /* XXX Cannot be handled yet. We will have support
1840 for tok_ucs4 soon. */
1841 goto err_label;
/* No ellipsis pending: NOW is an ordinary list element. */
1843 if (ellipsis_token == tok_none)
1845 if (get_character (now, charmap, repertoire, &seq, &wch))
1846 goto err_label;
1848 if (!ignore_content && seq != NULL && seq->nbytes == 1)
1849 /* Yep, we can store information about this byte
1850 sequence. */
1851 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1853 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
1854 && class_bit != 0)
1855 /* We have the UCS4 position. */
1856 *find_idx (ctype, &ctype->class_collection,
1857 &ctype->class_collection_max,
1858 &ctype->class_collection_act, wch) |= class_bit;
/* Remember this element; it may turn out to be the start of a range. */
1860 last_token = now->tok;
1861 /* Terminate the string. */
1862 now->val.str.startmb[now->val.str.lenmb] = '\0';
1863 last_str = now->val.str.startmb;
1864 last_seq = seq;
1865 last_wch = wch;
/* NOTE(review): only tok_bsymbol tokens get here (see the test above), yet
   the val.charcode fields are copied -- confirm this is meaningful for
   that token kind. */
1866 memcpy (last_charcode, now->val.charcode.bytes, 16);
1867 last_charcode_len = now->val.charcode.nbytes;
1869 if (!ignore_content && handle_digits == 1)
1871 /* We must store the digit values. */
1872 if (ctype->mbdigits_act == ctype->mbdigits_max)
1874 ctype->mbdigits_max += 10;
1875 ctype->mbdigits = xrealloc (ctype->mbdigits,
1876 (ctype->mbdigits_max
1877 * sizeof (char *)));
1878 ctype->wcdigits_max += 10;
1879 ctype->wcdigits = xrealloc (ctype->wcdigits,
1880 (ctype->wcdigits_max
1881 * sizeof (uint32_t)));
1884 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1885 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1887 else if (!ignore_content && handle_digits == 2)
1889 /* We must store the digit values. */
1890 if (ctype->outdigits_act >= 10)
1892 lr_error (ldfile, _("\
1893 %s: field `%s' does not contain exactly ten entries"),
1894 "LC_CTYPE", "outdigit");
1895 goto err_label;
1898 ctype->mboutdigits[ctype->outdigits_act] = seq;
1899 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1900 ++ctype->outdigits_act;
1903 else
1905 /* Now it gets complicated. We have to resolve the
1906 ellipsis problem. First we must distinguish between
1907 the different kind of ellipsis and this must match the
1908 tokens we have seen. */
1909 assert (last_token != tok_none);
1911 if (last_token != now->tok)
1913 lr_error (ldfile, _("\
1914 ellipsis range must be marked by two operands of same type"));
1915 lr_ignore_rest (ldfile, 0);
1916 break;
1919 if (last_token == tok_bsymbol)
1921 if (ellipsis_token == tok_ellipsis3)
1922 lr_error (ldfile, _("with symbolic name range values \
1923 the absolute ellipsis `...' must not be used"));
/* The numeric base passed on is 10 for tok_ellipsis4 and 16 otherwise. */
1925 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
1926 repertoire, now, last_str,
1927 class256_bit, class_bit,
1928 (ellipsis_token
1929 == tok_ellipsis4
1930 ? 10 : 16),
1931 ignore_content,
1932 handle_digits);
1934 else if (last_token == tok_ucs4)
1936 if (ellipsis_token != tok_ellipsis2)
1937 lr_error (ldfile, _("\
1938 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1940 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
1941 repertoire, now, last_wch,
1942 class256_bit, class_bit,
1943 ignore_content, handle_digits);
1945 else
1947 assert (last_token == tok_charcode);
1949 if (ellipsis_token != tok_ellipsis3)
1950 lr_error (ldfile, _("\
1951 with character code range values one must use the absolute ellipsis `...'"));
1953 charclass_charcode_ellipsis (ldfile, ctype, charmap,
1954 repertoire, now,
1955 last_charcode,
1956 last_charcode_len,
1957 class256_bit, class_bit,
1958 ignore_content,
1959 handle_digits);
1962 /* Now we have used the last value. */
1963 last_token = tok_none;
1966 /* Next we expect a semicolon or the end of the line. */
1967 now = lr_token (ldfile, charmap, NULL);
1968 if (now->tok == tok_eol || now->tok == tok_eof)
1969 break;
/* An ellipsis token is only accepted directly after a plain element. */
1971 if (last_token != tok_none
1972 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4)
1974 ellipsis_token = now->tok;
1975 now = lr_token (ldfile, charmap, NULL);
1976 continue;
1979 if (now->tok != tok_semicolon)
1980 goto err_label;
1982 /* And get the next character. */
1983 now = lr_token (ldfile, charmap, NULL);
1985 ellipsis_token = tok_none;
1987 break;
1989 case tok_digit:
1990 /* Ignore the rest of the line if we don't need the input of
1991 this line. */
1992 if (ignore_content)
1994 lr_ignore_rest (ldfile, 0);
1995 break;
/* Also reached by goto from the `class' case when its operand is `digit'. */
1998 handle_tok_digit:
1999 class_bit = _ISwdigit;
2000 class256_bit = _ISdigit;
2001 handle_digits = 1;
2002 goto read_charclass;
2004 case tok_outdigit:
2005 /* Ignore the rest of the line if we don't need the input of
2006 this line. */
2007 if (ignore_content)
2009 lr_ignore_rest (ldfile, 0);
2010 break;
2013 if (ctype->outdigits_act != 0)
2014 lr_error (ldfile, _("\
2015 %s: field `%s' declared more than once"),
2016 "LC_CTYPE", "outdigit");
/* `outdigit' sets no class bits; only the outdigit arrays are filled. */
2017 class_bit = 0;
2018 class256_bit = 0;
2019 handle_digits = 2;
2020 goto read_charclass;
2022 case tok_toupper:
2023 /* Ignore the rest of the line if we don't need the input of
2024 this line. */
2025 if (ignore_content)
2027 lr_ignore_rest (ldfile, 0);
2028 break;
2031 mapidx = 0;
2032 goto read_mapping;
2034 case tok_tolower:
2035 /* Ignore the rest of the line if we don't need the input of
2036 this line. */
2037 if (ignore_content)
2039 lr_ignore_rest (ldfile, 0);
2040 break;
2043 mapidx = 1;
2044 goto read_mapping;
2046 case tok_map:
2047 /* Ignore the rest of the line if we don't need the input of
2048 this line. */
2049 if (ignore_content)
2051 lr_ignore_rest (ldfile, 0);
2052 break;
2055 /* We simply forget the `map' keyword and use the following
2056 operand to determine the mapping. */
2057 now = lr_token (ldfile, charmap, NULL);
2058 if (now->tok == tok_ident || now->tok == tok_string)
2060 size_t cnt;
/* The search starts at index 2; indices 0 and 1 are the ones used for
   toupper and tolower above. */
2062 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2063 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2064 break;
2066 if (cnt < ctype->map_collection_nr)
2067 mapidx = cnt;
2068 else
2070 lr_error (ldfile, _("unknown map `%s'"),
2071 now->val.str.startmb);
2072 lr_ignore_rest (ldfile, 0);
2073 break;
2076 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2077 goto err_label;
2078 else
2079 mapidx = now->tok - tok_toupper;
2081 now = lr_token (ldfile, charmap, NULL);
2082 /* This better should be a semicolon. */
2083 if (now->tok != tok_semicolon)
2084 goto err_label;
/* Shared tail of all mapping keywords: MAPIDX selects the map and the
   rest of the line is a list of from/to pairs. */
2086 read_mapping:
2087 /* Test whether this mapping was already defined. */
2088 if (ctype->tomap_done[mapidx])
2090 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2091 ctype->mapnames[mapidx]);
2092 lr_ignore_rest (ldfile, 0);
2093 break;
2095 ctype->tomap_done[mapidx] = 1;
2097 now = lr_token (ldfile, charmap, NULL);
2098 while (now->tok != tok_eol && now->tok != tok_eof)
2100 struct charseq *from_seq;
2101 uint32_t from_wch;
2102 struct charseq *to_seq;
2103 uint32_t to_wch;
2105 /* Every pair starts with an opening brace. */
2106 if (now->tok != tok_open_brace)
2107 goto err_label;
2109 /* Next comes the from-value. */
2110 now = lr_token (ldfile, charmap, NULL);
2111 if (get_character (now, charmap, repertoire, &from_seq,
2112 &from_wch) != 0)
2113 goto err_label;
2115 /* The next is a comma. */
2116 now = lr_token (ldfile, charmap, NULL);
2117 if (now->tok != tok_comma)
2118 goto err_label;
2120 /* And the other value. */
2121 now = lr_token (ldfile, charmap, NULL);
2122 if (get_character (now, charmap, repertoire, &to_seq,
2123 &to_wch) != 0)
2124 goto err_label;
2126 /* And the last thing is the closing brace. */
2127 now = lr_token (ldfile, charmap, NULL);
2128 if (now->tok != tok_close_brace)
2129 goto err_label;
2131 if (!ignore_content)
/* The 8-bit table is written only for map indices 0 and 1 and only when
   both characters are single bytes. */
2133 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2134 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2135 /* We can use this value. */
2136 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2137 = to_seq->bytes[0];
2139 if (from_wch != ILLEGAL_CHAR_VALUE
2140 && to_wch != ILLEGAL_CHAR_VALUE)
2141 /* Both correct values. */
2142 *find_idx (ctype, &ctype->map_collection[mapidx],
2143 &ctype->map_collection_max[mapidx],
2144 &ctype->map_collection_act[mapidx],
2145 from_wch) = to_wch;
2148 /* Now comes a semicolon or the end of the line/file. */
2149 now = lr_token (ldfile, charmap, NULL);
2150 if (now->tok == tok_semicolon)
2151 now = lr_token (ldfile, charmap, NULL);
2153 break;
2155 case tok_translit_start:
2156 /* Ignore the rest of the line if we don't need the input of
2157 this line. */
2158 if (ignore_content)
2160 lr_ignore_rest (ldfile, 0);
2161 break;
2164 /* The rest of the line better should be empty. */
2165 lr_ignore_rest (ldfile, 1);
2167 /* We count here the number of allocated entries in the `translit'
2168 array. */
2169 cnt = 0;
2171 /* We proceed until we see the `translit_end' token. */
2172 while (now = lr_token (ldfile, charmap, repertoire),
2173 now->tok != tok_translit_end && now->tok != tok_eof)
2175 if (now->tok == tok_eol)
2176 /* Ignore empty lines. */
2177 continue;
/* NOTE(review): the loop condition above already excludes
   tok_translit_end, so this test looks unreachable -- confirm. */
2179 if (now->tok == tok_translit_end)
2181 lr_ignore_rest (ldfile, 0);
2182 break;
2185 if (now->tok == tok_include)
2187 /* We have to include locale. */
2188 const char *locale_name;
/* This local shadows the REPERTOIRE_NAME parameter inside the block. */
2189 const char *repertoire_name;
2191 now = lr_token (ldfile, charmap, NULL);
2192 /* This should be a string or an identifier. In any
2193 case something to name a locale. */
2194 if (now->tok != tok_string && now->tok != tok_ident)
2196 translit_syntax:
2197 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2198 lr_ignore_rest (ldfile, 0);
2199 continue;
2201 locale_name = now->val.str.startmb;
2203 /* Next should be a semicolon. */
2204 now = lr_token (ldfile, charmap, NULL);
2205 if (now->tok != tok_semicolon)
2206 goto translit_syntax;
2208 /* Now the repertoire name. */
2209 now = lr_token (ldfile, charmap, NULL);
2210 if ((now->tok != tok_string && now->tok != tok_ident)
2211 || now->val.str.startmb == NULL)
2212 goto translit_syntax;
2213 repertoire_name = now->val.str.startmb;
2215 /* We must not have more than one `include'. */
2216 if (ctype->translit_copy_locale != NULL)
2218 lr_error (ldfile, _("\
2219 %s: only one `include' instruction allowed"), "LC_CTYPE");
2220 lr_ignore_rest (ldfile, 0);
2221 continue;
2224 ctype->translit_copy_locale = locale_name;
2225 ctype->translit_copy_repertoire = repertoire_name;
2227 /* The rest of the line must be empty. */
2228 lr_ignore_rest (ldfile, 1);
2229 continue;
/* Anything else is taken as the start of a transliteration entry. */
2232 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2234 break;
2236 case tok_ident:
2237 /* Ignore the rest of the line if we don't need the input of
2238 this line. */
2239 if (ignore_content)
2241 lr_ignore_rest (ldfile, 0);
2242 break;
2245 /* This could mean one of several things. First test whether
2246 it's a character class name. */
2247 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2248 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2249 break;
2250 if (cnt < ctype->nr_charclass)
2252 class_bit = _ISwbit (cnt);
/* Only class indices up to 11 get a bit in the 8-bit table. */
2253 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2254 free (now->val.str.startmb);
2255 goto read_charclass;
/* Second possibility: the identifier names a known map. */
2257 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2258 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2259 break;
2260 if (cnt < ctype->map_collection_nr)
2262 mapidx = cnt;
2263 free (now->val.str.startmb);
2264 goto read_mapping;
/* Third possibility: one of the three special class names. */
2266 if (strcmp (now->val.str.startmb, "special1") == 0)
2268 class_bit = _ISwspecial1;
2269 free (now->val.str.startmb);
2270 goto read_charclass;
2272 if (strcmp (now->val.str.startmb, "special2") == 0)
2274 class_bit = _ISwspecial2;
2275 free (now->val.str.startmb);
2276 goto read_charclass;
2278 if (strcmp (now->val.str.startmb, "special3") == 0)
2280 class_bit = _ISwspecial3;
2281 free (now->val.str.startmb);
2282 goto read_charclass;
/* NOTE(review): this hard-codes map index 2 for `tosymmetric' and, unlike
   the branches above, does not free the identifier string -- confirm that
   slot 2 always exists and that the missing free is intended. */
2284 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2286 mapidx = 2;
2287 goto read_mapping;
2289 break;
2291 case tok_end:
2292 /* Next we assume `LC_CTYPE'. */
2293 now = lr_token (ldfile, charmap, NULL);
2294 if (now->tok == tok_eof)
2295 break;
2296 if (now->tok == tok_eol)
2297 lr_error (ldfile, _("%s: incomplete `END' line"),
2298 "LC_CTYPE");
2299 else if (now->tok != tok_lc_ctype)
2300 lr_error (ldfile, _("\
2301 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2302 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2303 return;
/* Every syntax error detected above jumps here. */
2305 default:
2306 err_label:
2307 if (now->tok != tok_eof)
2308 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2311 /* Prepare for the next round. */
2312 now = lr_token (ldfile, charmap, NULL);
2313 nowtok = now->tok;
2316 /* When we come here we reached the end of the file. */
2317 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2321 static void
2322 set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2323 struct repertoire_t *repertoire)
2325 size_t cnt;
2327 /* These function defines the default values for the classes and conversions
2328 according to POSIX.2 2.5.2.1.
2329 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2330 Don't move them unless you know what you do! */
2332 void set_default (int bitpos, int from, int to)
2334 char tmp[2];
2335 int ch;
2336 int bit = _ISbit (bitpos);
2337 int bitw = _ISwbit (bitpos);
2338 /* Define string. */
2339 strcpy (tmp, "?");
2341 for (ch = from; ch <= to; ++ch)
2343 uint32_t value;
2344 struct charseq *seq;
2345 tmp[0] = ch;
2347 value = repertoire_find_value (repertoire, tmp, 1);
2348 if (value == ILLEGAL_CHAR_VALUE)
2350 if (!be_quiet)
2351 error (0, 0, _("\
2352 %s: character `%s' not defined in repertoire while needed as default value"),
2353 "LC_CTYPE", tmp);
2355 else
2356 ELEM (ctype, class_collection, , value) |= bitw;
2358 seq = charmap_find_value (charmap, tmp, 1);
2359 if (seq == NULL)
2361 if (!be_quiet)
2362 error (0, 0, _("\
2363 %s: character `%s' not defined in charmap while needed as default value"),
2364 "LC_CTYPE", tmp);
2366 else if (seq->nbytes != 1)
2367 error (0, 0, _("\
2368 %s: character `%s' in charmap not representable with one byte"),
2369 "LC_CTYPE", tmp);
2370 else
2371 ctype->class256_collection[seq->bytes[0]] |= bit;
2375 /* Set default values if keyword was not present. */
2376 if ((ctype->class_done & BITw (tok_upper)) == 0)
2377 /* "If this keyword [lower] is not specified, the lowercase letters
2378 `A' through `Z', ..., shall automatically belong to this class,
2379 with implementation defined character values." [P1003.2, 2.5.2.1] */
2380 set_default (BITPOS (tok_upper), 'A', 'Z');
2382 if ((ctype->class_done & BITw (tok_lower)) == 0)
2383 /* "If this keyword [lower] is not specified, the lowercase letters
2384 `a' through `z', ..., shall automatically belong to this class,
2385 with implementation defined character values." [P1003.2, 2.5.2.1] */
2386 set_default (BITPOS (tok_lower), 'a', 'z');
2388 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2390 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2391 class `lower' *must* be in class `alpha'. */
2392 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2393 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2395 for (cnt = 0; cnt < 256; ++cnt)
2396 if ((ctype->class256_collection[cnt] & mask) != 0)
2397 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2399 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2400 if ((ctype->class_collection[cnt] & maskw) != 0)
2401 ctype->class_collection[cnt] |= BITw (tok_alpha);
2404 if ((ctype->class_done & BITw (tok_digit)) == 0)
2405 /* "If this keyword [digit] is not specified, the digits `0' through
2406 `9', ..., shall automatically belong to this class, with
2407 implementation-defined character values." [P1003.2, 2.5.2.1] */
2408 set_default (BITPOS (tok_digit), '0', '9');
2410 /* "Only characters specified for the `alpha' and `digit' keyword
2411 shall be specified. Characters specified for the keyword `alpha'
2412 and `digit' are automatically included in this class. */
2414 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2415 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2417 for (cnt = 0; cnt < 256; ++cnt)
2418 if ((ctype->class256_collection[cnt] & mask) != 0)
2419 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2421 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2422 if ((ctype->class_collection[cnt] & maskw) != 0)
2423 ctype->class_collection[cnt] |= BITw (tok_alnum);
2426 if ((ctype->class_done & BITw (tok_space)) == 0)
2427 /* "If this keyword [space] is not specified, the characters <space>,
2428 <form-feed>, <newline>, <carriage-return>, <tab>, and
2429 <vertical-tab>, ..., shall automatically belong to this class,
2430 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2432 uint32_t value;
2433 struct charseq *seq;
2435 value = repertoire_find_value (repertoire, "space", 5);
2436 if (value == ILLEGAL_CHAR_VALUE)
2438 if (!be_quiet)
2439 error (0, 0, _("\
2440 %s: character `%s' not defined while needed as default value"),
2441 "LC_CTYPE", "<space>");
2443 else
2444 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2446 seq = charmap_find_value (charmap, "space", 5);
2447 if (seq == NULL)
2449 if (!be_quiet)
2450 error (0, 0, _("\
2451 %s: character `%s' not defined while needed as default value"),
2452 "LC_CTYPE", "<space>");
2454 else if (seq->nbytes != 1)
2455 error (0, 0, _("\
2456 %s: character `%s' in charmap not representable with one byte"),
2457 "LC_CTYPE", "<space>");
2458 else
2459 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2462 value = repertoire_find_value (repertoire, "form-feed", 9);
2463 if (value == ILLEGAL_CHAR_VALUE)
2465 if (!be_quiet)
2466 error (0, 0, _("\
2467 %s: character `%s' not defined while needed as default value"),
2468 "LC_CTYPE", "<form-feed>");
2470 else
2471 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2473 seq = charmap_find_value (charmap, "form-feed", 9);
2474 if (seq == NULL)
2476 if (!be_quiet)
2477 error (0, 0, _("\
2478 %s: character `%s' not defined while needed as default value"),
2479 "LC_CTYPE", "<form-feed>");
2481 else if (seq->nbytes != 1)
2482 error (0, 0, _("\
2483 %s: character `%s' in charmap not representable with one byte"),
2484 "LC_CTYPE", "<form-feed>");
2485 else
2486 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2489 value = repertoire_find_value (repertoire, "newline", 7);
2490 if (value == ILLEGAL_CHAR_VALUE)
2492 if (!be_quiet)
2493 error (0, 0, _("\
2494 %s: character `%s' not defined while needed as default value"),
2495 "LC_CTYPE", "<newline>");
2497 else
2498 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2500 seq = charmap_find_value (charmap, "newline", 7);
2501 if (seq == NULL)
2503 if (!be_quiet)
2504 error (0, 0, _("\
2505 character `%s' not defined while needed as default value"),
2506 "<newline>");
2508 else if (seq->nbytes != 1)
2509 error (0, 0, _("\
2510 %s: character `%s' in charmap not representable with one byte"),
2511 "LC_CTYPE", "<newline>");
2512 else
2513 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2516 value = repertoire_find_value (repertoire, "carriage-return", 15);
2517 if (value == ILLEGAL_CHAR_VALUE)
2519 if (!be_quiet)
2520 error (0, 0, _("\
2521 %s: character `%s' not defined while needed as default value"),
2522 "LC_CTYPE", "<carriage-return>");
2524 else
2525 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2527 seq = charmap_find_value (charmap, "carriage-return", 15);
2528 if (seq == NULL)
2530 if (!be_quiet)
2531 error (0, 0, _("\
2532 %s: character `%s' not defined while needed as default value"),
2533 "LC_CTYPE", "<carriage-return>");
2535 else if (seq->nbytes != 1)
2536 error (0, 0, _("\
2537 %s: character `%s' in charmap not representable with one byte"),
2538 "LC_CTYPE", "<carriage-return>");
2539 else
2540 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2543 value = repertoire_find_value (repertoire, "tab", 3);
2544 if (value == ILLEGAL_CHAR_VALUE)
2546 if (!be_quiet)
2547 error (0, 0, _("\
2548 %s: character `%s' not defined while needed as default value"),
2549 "LC_CTYPE", "<tab>");
2551 else
2552 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2554 seq = charmap_find_value (charmap, "tab", 3);
2555 if (seq == NULL)
2557 if (!be_quiet)
2558 error (0, 0, _("\
2559 %s: character `%s' not defined while needed as default value"),
2560 "LC_CTYPE", "<tab>");
2562 else if (seq->nbytes != 1)
2563 error (0, 0, _("\
2564 %s: character `%s' in charmap not representable with one byte"),
2565 "LC_CTYPE", "<tab>");
2566 else
2567 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2570 value = repertoire_find_value (repertoire, "vertical-tab", 12);
2571 if (value == ILLEGAL_CHAR_VALUE)
2573 if (!be_quiet)
2574 error (0, 0, _("\
2575 %s: character `%s' not defined while needed as default value"),
2576 "LC_CTYPE", "<vertical-tab>");
2578 else
2579 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2581 seq = charmap_find_value (charmap, "vertical-tab", 12);
2582 if (seq == NULL)
2584 if (!be_quiet)
2585 error (0, 0, _("\
2586 %s: character `%s' not defined while needed as default value"),
2587 "LC_CTYPE", "<vertical-tab>");
2589 else if (seq->nbytes != 1)
2590 error (0, 0, _("\
2591 %s: character `%s' in charmap not representable with one byte"),
2592 "LC_CTYPE", "<vertical-tab>");
2593 else
2594 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2597 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
2598 /* "If this keyword is not specified, the digits `0' to `9', the
2599 uppercase letters `A' through `F', and the lowercase letters `a'
2600 through `f', ..., shell automatically belong to this class, with
2601 implementation defined character values." [P1003.2, 2.5.2.1] */
2603 set_default (BITPOS (tok_xdigit), '0', '9');
2604 set_default (BITPOS (tok_xdigit), 'A', 'F');
2605 set_default (BITPOS (tok_xdigit), 'a', 'f');
2608 if ((ctype->class_done & BITw (tok_blank)) == 0)
2609 /* "If this keyword [blank] is unspecified, the characters <space> and
2610 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2612 uint32_t value;
2613 struct charseq *seq;
2615 value = repertoire_find_value (repertoire, "space", 5);
2616 if (value == ILLEGAL_CHAR_VALUE)
2618 if (!be_quiet)
2619 error (0, 0, _("\
2620 %s: character `%s' not defined while needed as default value"),
2621 "LC_CTYPE", "<space>");
2623 else
2624 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2626 seq = charmap_find_value (charmap, "space", 5);
2627 if (seq == NULL)
2629 if (!be_quiet)
2630 error (0, 0, _("\
2631 %s: character `%s' not defined while needed as default value"),
2632 "LC_CTYPE", "<space>");
2634 else if (seq->nbytes != 1)
2635 error (0, 0, _("\
2636 %s: character `%s' in charmap not representable with one byte"),
2637 "LC_CTYPE", "<space>");
2638 else
2639 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2642 value = repertoire_find_value (repertoire, "tab", 3);
2643 if (value == ILLEGAL_CHAR_VALUE)
2645 if (!be_quiet)
2646 error (0, 0, _("\
2647 %s: character `%s' not defined while needed as default value"),
2648 "LC_CTYPE", "<tab>");
2650 else
2651 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2653 seq = charmap_find_value (charmap, "tab", 3);
2654 if (seq == NULL)
2656 if (!be_quiet)
2657 error (0, 0, _("\
2658 %s: character `%s' not defined while needed as default value"),
2659 "LC_CTYPE", "<tab>");
2661 else if (seq->nbytes != 1)
2662 error (0, 0, _("\
2663 %s: character `%s' in charmap not representable with one byte"),
2664 "LC_CTYPE", "<tab>");
2665 else
2666 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2669 if ((ctype->class_done & BITw (tok_graph)) == 0)
2670 /* "If this keyword [graph] is not specified, characters specified for
2671 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2672 shall belong to this character class." [P1003.2, 2.5.2.1] */
2674 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2675 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2676 size_t cnt;
2678 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2679 if ((ctype->class_collection[cnt] & mask) != 0)
2680 ctype->class_collection[cnt] |= BIT (tok_graph);
2682 for (cnt = 0; cnt < 256; ++cnt)
2683 if ((ctype->class256_collection[cnt] & mask) != 0)
2684 ctype->class256_collection[cnt] |= BIT (tok_graph);
2687 if ((ctype->class_done & BITw (tok_print)) == 0)
2688 /* "If this keyword [print] is not provided, characters specified for
2689 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2690 and the <space> character shall belong to this character class."
2691 [P1003.2, 2.5.2.1] */
2693 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2694 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2695 size_t cnt;
2696 uint32_t space;
2697 struct charseq *seq;
2699 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2700 if ((ctype->class_collection[cnt] & mask) != 0)
2701 ctype->class_collection[cnt] |= BIT (tok_print);
2703 for (cnt = 0; cnt < 256; ++cnt)
2704 if ((ctype->class256_collection[cnt] & mask) != 0)
2705 ctype->class256_collection[cnt] |= BIT (tok_print);
2708 space = repertoire_find_value (repertoire, "space", 5);
2709 if (space == ILLEGAL_CHAR_VALUE)
2711 if (!be_quiet)
2712 error (0, 0, _("\
2713 %s: character `%s' not defined while needed as default value"),
2714 "LC_CTYPE", "<space>");
2716 else
2717 ELEM (ctype, class_collection, , space) |= BIT (tok_print);
2719 seq = charmap_find_value (charmap, "space", 5);
2720 if (seq == NULL)
2722 if (!be_quiet)
2723 error (0, 0, _("\
2724 %s: character `%s' not defined while needed as default value"),
2725 "LC_CTYPE", "<space>");
2727 else if (seq->nbytes != 1)
2728 error (0, 0, _("\
2729 %s: character `%s' in charmap not representable with one byte"),
2730 "LC_CTYPE", "<space>");
2731 else
2732 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
2735 if (ctype->tomap_done[0] == 0)
2736 /* "If this keyword [toupper] is not specified, the lowercase letters
2737 `a' through `z', and their corresponding uppercase letters `A' to
2738 `Z', ..., shall automatically be included, with implementation-
2739 defined character values." [P1003.2, 2.5.2.1] */
2741 char tmp[4];
2742 int ch;
2744 strcpy (tmp, "<?>");
2746 for (ch = 'a'; ch <= 'z'; ++ch)
2748 uint32_t value_from, value_to;
2749 struct charseq *seq_from, *seq_to;
2751 tmp[1] = (char) ch;
2753 value_from = repertoire_find_value (repertoire, &tmp[1], 1);
2754 if (value_from == ILLEGAL_CHAR_VALUE)
2756 if (!be_quiet)
2757 error (0, 0, _("\
2758 %s: character `%s' not defined while needed as default value"),
2759 "LC_CTYPE", tmp);
2761 else
2763 /* This conversion is implementation defined. */
2764 tmp[1] = (char) (ch + ('A' - 'a'));
2765 value_to = repertoire_find_value (repertoire, &tmp[1], 1);
2766 if (value_to == ILLEGAL_CHAR_VALUE)
2768 if (!be_quiet)
2769 error (0, 0, _("\
2770 %s: character `%s' not defined while needed as default value"),
2771 "LC_CTYPE", tmp);
2773 else
2774 /* The index [0] is determined by the order of the
2775 `ctype_map_newP' calls in `ctype_startup'. */
2776 ELEM (ctype, map_collection, [0], value_from) = value_to;
2779 seq_from = charmap_find_value (charmap, &tmp[1], 1);
2780 if (seq_from == NULL)
2782 if (!be_quiet)
2783 error (0, 0, _("\
2784 %s: character `%s' not defined while needed as default value"),
2785 "LC_CTYPE", tmp);
2787 else if (seq_from->nbytes != 1)
2789 if (!be_quiet)
2790 error (0, 0, _("\
2791 %s: character `%s' needed as default value not representable with one byte"),
2792 "LC_CTYPE", tmp);
2794 else
2796 /* This conversion is implementation defined. */
2797 tmp[1] = (char) (ch + ('A' - 'a'));
2798 seq_to = charmap_find_value (charmap, &tmp[1], 1);
2799 if (seq_to == NULL)
2801 if (!be_quiet)
2802 error (0, 0, _("\
2803 %s: character `%s' not defined while needed as default value"),
2804 "LC_CTYPE", tmp);
2806 else if (seq_to->nbytes != 1)
2808 if (!be_quiet)
2809 error (0, 0, _("\
2810 %s: character `%s' needed as default value not representable with one byte"),
2811 "LC_CTYPE", tmp);
2813 else
2814 /* The index [0] is determined by the order of the
2815 `ctype_map_newP' calls in `ctype_startup'. */
2816 ctype->map256_collection[0][seq_from->bytes[0]]
2817 = seq_to->bytes[0];
2822 if (ctype->tomap_done[1] == 0)
2823 /* "If this keyword [tolower] is not specified, the mapping shall be
2824 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2826 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
2827 if (ctype->map_collection[0][cnt] != 0)
2828 ELEM (ctype, map_collection, [1],
2829 ctype->map_collection[0][cnt])
2830 = ctype->charnames[cnt];
2832 for (cnt = 0; cnt < 256; ++cnt)
2833 if (ctype->map256_collection[0][cnt] != 0)
2834 ctype->map_collection[1][ctype->map_collection[0][cnt]]
2835 = ctype->charnames[cnt];
2838 if (ctype->outdigits_act == 0)
2840 for (cnt = 0; cnt < 10; ++cnt)
2842 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2843 digits + cnt, 1);
2845 if (ctype->mboutdigits[cnt] == NULL)
2847 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2848 longnames[cnt],
2849 strlen (longnames[cnt]));
2851 if (ctype->mboutdigits[cnt] == NULL)
2853 /* Provide a replacement. */
2854 error (0, 0, _("\
2855 no output digits defined and none of the standard names in the charmap"));
2857 ctype->mboutdigits[cnt] = obstack_alloc (&charmap->mem_pool,
2858 sizeof (struct charseq) + 1);
2860 /* This is better than nothing. */
2861 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
2862 ctype->mboutdigits[cnt]->nbytes = 1;
2866 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2867 digits + cnt, 1);
2869 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2871 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2872 longnames[cnt],
2873 strlen (longnames[cnt]));
2875 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2877 /* Provide a replacement. */
2878 error (0, 0, _("\
2879 no output digits defined and none of the standard names in the repertoire"));
2881 /* This is better than nothing. */
2882 ctype->wcoutdigits[cnt] = (uint32_t) digits[cnt];
2887 ctype->outdigits_act = 10;
2892 static void
2893 allocate_arrays (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2894 struct repertoire_t *repertoire)
2896 size_t idx;
2898 /* First we have to decide how we organize the arrays. It is easy
2899 for a one-byte character set. But multi-byte character set
2900 cannot be stored flat because the chars might be sparsely used.
2901 So we determine an optimal hashing function for the used
2902 characters.
2904 We use a very trivial hashing function to store the sparse
2905 table. CH % TABSIZE is used as an index. To solve multiple hits
2906 we have N planes. This guarantees a fixed search time for a
2907 character [N / 2]. In the following code we determine the minimum
2908 value for TABSIZE * N, where TABSIZE >= 256. */
2909 size_t min_total = UINT_MAX;
2910 size_t act_size = 256;
2912 if (!be_quiet)
2913 fputs (_("\
2914 Computing table size for character classes might take a while..."),
2915 stderr);
2917 while (act_size < min_total)
2919 size_t cnt[act_size];
2920 size_t act_planes = 1;
2922 memset (cnt, '\0', sizeof cnt);
2924 for (idx = 0; idx < 256; ++idx)
2925 cnt[idx] = 1;
2927 for (idx = 0; idx < ctype->charnames_act; ++idx)
2928 if (ctype->charnames[idx] >= 256)
2930 size_t nr = ctype->charnames[idx] % act_size;
2932 if (++cnt[nr] > act_planes)
2934 act_planes = cnt[nr];
2935 if (act_size * act_planes >= min_total)
2936 break;
2940 if (act_size * act_planes < min_total)
2942 min_total = act_size * act_planes;
2943 ctype->plane_size = act_size;
2944 ctype->plane_cnt = act_planes;
2947 ++act_size;
2950 if (!be_quiet)
2951 fputs (_(" done\n"), stderr);
2954 ctype->names = (uint32_t *) xcalloc (ctype->plane_size
2955 * ctype->plane_cnt,
2956 sizeof (uint32_t));
2958 for (idx = 1; idx < 256; ++idx)
2959 ctype->names[idx] = idx;
2961 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
2962 ctype->names[0] = 1;
2964 for (idx = 256; idx < ctype->charnames_act; ++idx)
2966 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
2967 size_t depth = 0;
2969 while (ctype->names[nr + depth * ctype->plane_size])
2970 ++depth;
2971 assert (depth < ctype->plane_cnt);
2973 ctype->names[nr + depth * ctype->plane_size] = ctype->charnames[idx];
2975 /* Now for faster access remember the index in the NAMES_B array. */
2976 ctype->charnames[idx] = nr + depth * ctype->plane_size;
2978 ctype->names[0] = 0;
2981 /* You wonder about this amount of memory? This is only because some
2982 users do not manage to address the array with unsigned values or
2983 data types with range >= 256. '\200' would result in the array
2984 index -128. To help these poor people we duplicate the entries for
2985 128 up to 255 below the entry for \0. */
2986 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
2987 sizeof (char_class_t));
2988 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
2989 * ctype->plane_cnt,
2990 sizeof (char_class32_t));
2992 /* This is the array accessed using the multibyte string elements. */
2993 for (idx = 0; idx < 256; ++idx)
2994 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
2996 /* Mirror first 127 entries. We must take care that entry -1 is not
2997 mirrored because EOF == -1. */
2998 for (idx = 0; idx < 127; ++idx)
2999 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3001 /* The 32 bit array contains all characters. */
3002 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3003 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3005 /* Room for table of mappings. */
3006 ctype->map = (uint32_t **) xmalloc (ctype->map_collection_nr
3007 * sizeof (uint32_t *));
3009 /* Fill in all mappings. */
3010 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3012 unsigned int idx2;
3014 /* Allocate table. */
3015 ctype->map[idx] = (uint32_t *) xmalloc ((ctype->plane_size
3016 * ctype->plane_cnt + 128)
3017 * sizeof (uint32_t));
3019 /* Copy default value (identity mapping). */
3020 memcpy (&ctype->map[idx][128], ctype->names,
3021 ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t));
3023 /* Copy values from collection. */
3024 for (idx2 = 0; idx2 < 256; ++idx2)
3025 ctype->map[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3027 /* Mirror first 127 entries. We must take care not to map entry
3028 -1 because EOF == -1. */
3029 for (idx2 = 0; idx2 < 127; ++idx2)
3030 ctype->map[idx][idx2] = ctype->map[idx][256 + idx2];
3032 /* EOF must map to EOF. */
3033 ctype->map[idx][127] = EOF;
3035 /* The 32 bit map collection. */
3036 for (idx2 = 0; idx2 < ctype->map_collection_act[idx]; ++idx2)
3037 if (ctype->map_collection[idx][idx2] != 0)
3038 ctype->map[idx][128 + ctype->charnames[idx2]]
3039 = ctype->map_collection[idx][idx2];
3042 /* Extra array for class and map names. */
3043 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3044 * sizeof (uint32_t));
3045 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3046 * sizeof (uint32_t));
3048 /* Array for width information. Because the expected width are very
3049 small we use only one single byte. This save space and we need
3050 not provide the information twice with both endianesses. */
3051 ctype->width = (unsigned char *) xmalloc (ctype->plane_size
3052 * ctype->plane_cnt);
3053 /* Initialize with default width value. */
3054 memset (ctype->width, charmap->width_default,
3055 ctype->plane_size * ctype->plane_cnt);
3056 if (charmap->width_rules != NULL)
3058 size_t cnt;
3060 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3062 unsigned char bytes[charmap->mb_cur_max];
3063 int nbytes = charmap->width_rules[cnt].from->nbytes;
3065 /* We have the range of character for which the width is
3066 specified described using byte sequences of the multibyte
3067 charset. We have to convert this to UCS4 now. And we
3068 cannot simply convert the beginning and the end of the
3069 sequence, we have to iterate over the byte sequence and
3070 convert it for every single character. */
3071 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3073 while (nbytes < charmap->width_rules[cnt].to->nbytes
3074 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3075 nbytes) <= 0)
3077 /* Find the UCS value for `bytes'. */
3078 uint32_t wch = repertoire_find_value (ctype->repertoire, bytes,
3079 nbytes);
3080 int inner;
3082 if (wch != ILLEGAL_CHAR_VALUE)
3084 /* Store the value. */
3085 size_t nr = idx % ctype->plane_size;
3086 size_t depth = 0;
3088 while (ctype->names[nr + depth * ctype->plane_size] != nr)
3089 ++depth;
3090 assert (depth < ctype->plane_cnt);
3092 ctype->width[nr + depth * ctype->plane_size]
3093 = charmap->width_rules[cnt].width;
3096 /* "Increment" the bytes sequence. */
3097 inner = nbytes - 1;
3098 while (inner >= 0 && bytes[inner] == 0xff)
3099 --inner;
3101 if (inner < 0)
3103 /* We have to extend the byte sequence. */
3104 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3105 break;
3107 bytes[0] = 1;
3108 memset (&bytes[1], 0, nbytes);
3109 ++nbytes;
3111 else
3113 ++bytes[inner];
3114 while (++inner < nbytes)
3115 bytes[inner] = 0;
3121 /* Set MB_CUR_MAX. */
3122 ctype->mb_cur_max = charmap->mb_cur_max;
3124 /* We need the name of the currently used 8-bit character set to
3125 make correct conversion between this 8-bit representation and the
3126 ISO 10646 character set used internally for wide characters. */
3127 ctype->codeset_name = charmap->code_set_name;
3129 /* Now determine the table for the transliteration information.
3131 XXX It is not yet clear to me whether it is worth implementing a
3132 complicated algorithm which uses a hash table to locate the entries.
3133 For now I'll use a simple array which can be searching using binary
3134 search. */
3135 if (ctype->translit_copy_locale != NULL)
3137 /* Fold in the transliteration information from the locale mentioned
3138 in the `include' statement. */
3139 struct locale_ctype_t *here = ctype;
3143 struct localedef_t *other = find_locale (LC_CTYPE,
3144 here->translit_copy_locale,
3145 repertoire->name, charmap);
3147 if (other == NULL)
3149 error (0, 0, _("\
3150 %s: transliteration data from locale `%s' not available"),
3151 "LC_CTYPE", here->translit_copy_locale);
3152 break;
3155 here = other->categories[LC_CTYPE].ctype;
3157 /* Enqueue the information if necessary. */
3158 if (here->translit != NULL)
3160 struct translit_t *endp = here->translit;
3161 while (endp->next != NULL)
3162 endp = endp->next;
3164 endp->next = ctype->translit;
3165 ctype->translit = here->translit;
3168 while (here->translit_copy_locale != NULL);
3171 if (ctype->translit != NULL)
3173 /* First count how many entries we have. This is the upper limit
3174 since some entries from the included files might be overwritten. */
3175 size_t number = 0;
3176 size_t cnt;
3177 struct translit_t *runp = ctype->translit;
3178 struct translit_t **sorted;
3179 size_t from_len, to_len;
3181 while (runp != NULL)
3183 ++number;
3184 runp = runp->next;
3187 /* Next we allocate an array large enough and fill in the values. */
3188 sorted = (struct translit_t **) alloca (number
3189 * sizeof (struct translit_t **));
3190 runp = ctype->translit;
3191 number = 0;
3194 /* Search for the place where to insert this string.
3195 XXX Better use a real sorting algorithm later. */
3196 size_t idx = 0;
3197 int replace = 0;
3199 while (idx < number)
3201 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3202 (const wchar_t *) runp->from);
3203 if (res == 0)
3205 replace = 1;
3206 break;
3208 if (res > 0)
3209 break;
3210 ++idx;
3213 if (replace)
3214 sorted[idx] = runp;
3215 else
3217 memmove (&sorted[idx + 1], &sorted[idx],
3218 (number - idx) * sizeof (struct translit_t *));
3219 sorted[idx] = runp;
3220 ++number;
3223 runp = runp->next;
3225 while (runp != NULL);
3227 /* The next step is putting all the possible transliteration
3228 strings in one memory block so that we can write it out.
3229 We need several different blocks:
3230 - index to the tfromstring array
3231 - from-string array
3232 - index to the to-string array
3233 - to-string array.
3234 And this all must be available for both endianes variants.
3236 from_len = to_len = 0;
3237 for (cnt = 0; cnt < number; ++cnt)
3239 struct translit_to_t *srunp;
3240 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3241 srunp = sorted[cnt]->to;
3242 while (srunp != NULL)
3244 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3245 srunp = srunp->next;
3247 /* Plus one for the extra NUL character marking the end of
3248 the list for the current entry. */
3249 ++to_len;
3252 /* We can allocate the arrays for the results. */
3253 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3254 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3255 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3256 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3258 from_len = 0;
3259 to_len = 0;
3260 for (cnt = 0; cnt < number; ++cnt)
3262 size_t len;
3263 struct translit_to_t *srunp;
3265 ctype->translit_from_idx[cnt] = from_len;
3266 ctype->translit_to_idx[cnt] = to_len;
3268 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3269 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3270 (const wchar_t *) sorted[cnt]->from, len);
3271 from_len += len;
3273 ctype->translit_to_idx[cnt] = to_len;
3274 srunp = sorted[cnt]->to;
3275 while (srunp != NULL)
3277 len = wcslen ((const wchar_t *) srunp->str) + 1;
3278 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3279 (const wchar_t *) srunp->str, len);
3280 to_len += len;
3281 srunp = srunp->next;
3283 ctype->translit_to_tbl[to_len++] = L'\0';
3286 /* Store the information about the length. */
3287 ctype->translit_idx_size = number * sizeof (uint32_t);
3288 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3289 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3291 else
3293 /* Provide some dummy pointers since we have nothing to write out. */
3294 static uint32_t no_str = { 0 };
3296 ctype->translit_from_idx = &no_str;
3297 ctype->translit_from_tbl = &no_str;
3298 ctype->translit_to_tbl = &no_str;
3299 ctype->translit_idx_size = 0;
3300 ctype->translit_from_tbl_size = 0;
3301 ctype->translit_to_tbl_size = 0;