Use '%z' instead of '%Z' on printf functions
[glibc.git] / locale / programs / ld-ctype.c
blobe85820d3357ce44d3fb260a42c62f978a4dd2e4f
1 /* Copyright (C) 1995-2022 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
17 #ifdef HAVE_CONFIG_H
18 # include <config.h>
19 #endif
21 #include <alloca.h>
22 #include <byteswap.h>
23 #include <endian.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <obstack.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <wchar.h>
30 #include <wctype.h>
31 #include <stdint.h>
32 #include <sys/uio.h>
34 #include "localedef.h"
35 #include "charmap.h"
36 #include "localeinfo.h"
37 #include "langinfo.h"
38 #include "linereader.h"
39 #include "locfile-token.h"
40 #include "locfile.h"
42 #include <assert.h>
/* Position of a character class bit, counted from the first class token
   `tok_upper'.  */
#define BITPOS(class) ((class) - tok_upper)
/* Mask of a class as produced by _ISbit (BIT) and by _ISwbit (BITw).  */
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Lvalue of the slot for character VALUE in the growable array
   CTYPE->COLLECTION.  IDX is an optional subscript (it may be empty) that
   is appended to the array and to its `_max' / `_act' counters.  The slot
   is looked up through find_idx.  */
#define ELEM(ctype, collection, idx, value)                                   \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,     \
             &ctype->collection##_act idx, value)


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Types to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */

/* One to-string; further to-strings are linked through NEXT.  */
struct translit_to_t
{
  uint32_t *str;

  struct translit_to_t *next;
};
/* One transliteration rule: the from-string, the place (file name and
   line number) recorded for it, and the list of to-strings.  Rules are
   linked through NEXT.  */
struct translit_t
{
  uint32_t *from;

  const char *fname;
  size_t lineno;

  struct translit_to_t *to;

  struct translit_t *next;
};
/* One entry of the transliteration ignore list, described by the three
   values FROM, TO and STEP, together with the place (file name and line
   number) recorded for it.  Entries are linked through NEXT.  */
struct translit_ignore_t
{
  uint32_t from;
  uint32_t to;
  uint32_t step;

  const char *fname;
  size_t lineno;

  struct translit_ignore_t *next;
};
/* Type to describe a transliteration include statement: the names of the
   locale and of the repertoire to copy from.  Statements are linked
   through NEXT.  */
struct translit_include_t
{
  const char *copy_locale;
  const char *copy_repertoire;

  struct translit_include_t *next;
};
/* Dummy string consisting of the terminating zero only; it can be pointed
   to wherever an empty string is needed.  */
static uint32_t no_str[] = { 0 };
111 /* Sparse table of uint32_t. */
112 #define TABLE idx_table
113 #define ELEMENT uint32_t
114 #define DEFAULT ((uint32_t) ~0)
115 #define NO_ADD_LOCALE
116 #include "3level.h"
118 #define TABLE wcwidth_table
119 #define ELEMENT uint8_t
120 #define DEFAULT 0xff
121 #include "3level.h"
123 #define TABLE wctrans_table
124 #define ELEMENT int32_t
125 #define DEFAULT 0
126 #define wctrans_table_add wctrans_table_add_internal
127 #include "3level.h"
128 #undef wctrans_table_add
/* Record in T that WC maps to MAPPED_WC.  The wctrans_table does not
   store the mapped character itself but the difference between the
   desired result and the argument.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  const uint32_t delta = mapped_wc - wc;

  wctrans_table_add_internal (t, wc, delta);
}
/* Construction of sparse 3-level tables.
   See wchar-lookup.h for their structure and the meaning of p and q.  */

struct wctype_table
{
  /* Parameters.  */
  unsigned int p;
  unsigned int q;
  /* Working representation: for each of the three levels the allocated
     size, the used size and the data.  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;
  size_t result_size;
};
158 static void add_locale_wctype_table (struct locale_file *file,
159 struct wctype_table *t);
161 /* The real definition of the struct for the LC_CTYPE locale. */
162 struct locale_ctype_t
164 uint32_t *charnames;
165 size_t charnames_max;
166 size_t charnames_act;
167 /* An index lookup table, to speedup find_idx. */
168 struct idx_table charnames_idx;
170 struct repertoire_t *repertoire;
172 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
173 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
174 size_t nr_charclass;
175 const char *classnames[MAX_NR_CHARCLASS];
176 uint32_t last_class_char;
177 uint32_t class256_collection[256];
178 uint32_t *class_collection;
179 size_t class_collection_max;
180 size_t class_collection_act;
181 uint32_t class_done;
182 uint32_t class_offset;
184 struct charseq **mbdigits;
185 size_t mbdigits_act;
186 size_t mbdigits_max;
187 uint32_t *wcdigits;
188 size_t wcdigits_act;
189 size_t wcdigits_max;
191 struct charseq *mboutdigits[10];
192 uint32_t wcoutdigits[10];
193 size_t outdigits_act;
195 /* If the following number ever turns out to be too small simply
196 increase it. But I doubt it will. --drepper@gnu */
197 #define MAX_NR_CHARMAP 16
198 const char *mapnames[MAX_NR_CHARMAP];
199 uint32_t *map_collection[MAX_NR_CHARMAP];
200 uint32_t map256_collection[2][256];
201 size_t map_collection_max[MAX_NR_CHARMAP];
202 size_t map_collection_act[MAX_NR_CHARMAP];
203 size_t map_collection_nr;
204 size_t last_map_idx;
205 int tomap_done[MAX_NR_CHARMAP];
206 uint32_t map_offset;
208 /* Transliteration information. */
209 struct translit_include_t *translit_include;
210 struct translit_t *translit;
211 struct translit_ignore_t *translit_ignore;
212 uint32_t ntranslit_ignore;
214 uint32_t *default_missing;
215 const char *default_missing_file;
216 size_t default_missing_lineno;
218 uint32_t to_nonascii;
219 uint32_t nonascii_case;
221 /* The arrays for the binary representation. */
222 char_class_t *ctype_b;
223 char_class32_t *ctype32_b;
224 uint32_t **map_b;
225 uint32_t **map32_b;
226 uint32_t **class_b;
227 struct wctype_table *class_3level;
228 struct wctrans_table *map_3level;
229 uint32_t *class_name_ptr;
230 uint32_t *map_name_ptr;
231 struct wcwidth_table width;
232 uint32_t mb_cur_max;
233 const char *codeset_name;
234 uint32_t *translit_from_idx;
235 uint32_t *translit_from_tbl;
236 uint32_t *translit_to_idx;
237 uint32_t *translit_to_tbl;
238 uint32_t translit_idx_size;
239 size_t translit_from_tbl_size;
240 size_t translit_to_tbl_size;
242 struct obstack mempool;
/* Marker for an empty slot.  This has the value 0xFFFFFFFF, regardless
   whether 'int' is 16 bit, 32 bit, or 64 bit.  */
#define EMPTY ((uint32_t) ~0)


/* Allocation functions used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
255 /* Prototypes for local functions. */
256 static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
257 const struct charmap_t *charmap,
258 struct localedef_t *copy_locale,
259 int ignore_content);
260 static void ctype_class_new (struct linereader *lr,
261 struct locale_ctype_t *ctype, const char *name);
262 static void ctype_map_new (struct linereader *lr,
263 struct locale_ctype_t *ctype,
264 const char *name, const struct charmap_t *charmap);
265 static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
266 size_t *max, size_t *act, uint32_t idx);
267 static void set_class_defaults (struct locale_ctype_t *ctype,
268 const struct charmap_t *charmap,
269 struct repertoire_t *repertoire);
270 static void allocate_arrays (struct locale_ctype_t *ctype,
271 const struct charmap_t *charmap,
272 struct repertoire_t *repertoire);
/* Names under which the ten decimal digits may be found in a charmap:
   the spelled-out names and the Unicode symbol names.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
static const char *uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
/* The ten decimal digits as single bytes.  */
static const unsigned char digits[] = "0123456789";
288 static void
289 ctype_startup (struct linereader *lr, struct localedef_t *locale,
290 const struct charmap_t *charmap,
291 struct localedef_t *copy_locale, int ignore_content)
293 unsigned int cnt;
294 struct locale_ctype_t *ctype;
296 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
298 if (copy_locale == NULL)
300 /* Allocate the needed room. */
301 locale->categories[LC_CTYPE].ctype = ctype =
302 (struct locale_ctype_t *) xcalloc (1,
303 sizeof (struct locale_ctype_t));
305 /* We have seen no names yet. */
306 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
307 ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max
308 * sizeof (uint32_t));
309 for (cnt = 0; cnt < 256; ++cnt)
310 ctype->charnames[cnt] = cnt;
311 ctype->charnames_act = 256;
312 idx_table_init (&ctype->charnames_idx);
314 /* Fill character class information. */
315 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
316 /* The order of the following instructions determines the bit
317 positions! */
318 ctype_class_new (lr, ctype, "upper");
319 ctype_class_new (lr, ctype, "lower");
320 ctype_class_new (lr, ctype, "alpha");
321 ctype_class_new (lr, ctype, "digit");
322 ctype_class_new (lr, ctype, "xdigit");
323 ctype_class_new (lr, ctype, "space");
324 ctype_class_new (lr, ctype, "print");
325 ctype_class_new (lr, ctype, "graph");
326 ctype_class_new (lr, ctype, "blank");
327 ctype_class_new (lr, ctype, "cntrl");
328 ctype_class_new (lr, ctype, "punct");
329 ctype_class_new (lr, ctype, "alnum");
331 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
332 ctype->class_collection
333 = (uint32_t *) xcalloc (sizeof (unsigned long int),
334 ctype->class_collection_max);
335 ctype->class_collection_act = 256;
337 /* Fill character map information. */
338 ctype->last_map_idx = MAX_NR_CHARMAP;
339 ctype_map_new (lr, ctype, "toupper", charmap);
340 ctype_map_new (lr, ctype, "tolower", charmap);
342 /* Fill first 256 entries in `toXXX' arrays. */
343 for (cnt = 0; cnt < 256; ++cnt)
345 ctype->map_collection[0][cnt] = cnt;
346 ctype->map_collection[1][cnt] = cnt;
348 ctype->map256_collection[0][cnt] = cnt;
349 ctype->map256_collection[1][cnt] = cnt;
352 if (enc_not_ascii_compatible)
353 ctype->to_nonascii = 1;
355 obstack_init (&ctype->mempool);
357 else
358 ctype = locale->categories[LC_CTYPE].ctype =
359 copy_locale->categories[LC_CTYPE].ctype;
364 void
365 ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
367 /* See POSIX.2, table 2-6 for the meaning of the following table. */
368 #define NCLASS 12
369 static const struct
371 const char *name;
372 const char allow[NCLASS];
374 valid_table[NCLASS] =
376 /* The order is important. See token.h for more information.
377 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
378 { "upper", "--MX-XDDXXX-" },
379 { "lower", "--MX-XDDXXX-" },
380 { "alpha", "---X-XDDXXX-" },
381 { "digit", "XXX--XDDXXX-" },
382 { "xdigit", "-----XDDXXX-" },
383 { "space", "XXXXX------X" },
384 { "print", "---------X--" },
385 { "graph", "---------X--" },
386 { "blank", "XXXXXM-----X" },
387 { "cntrl", "XXXXX-XX--XX" },
388 { "punct", "XXXXX-DD-X-X" },
389 { "alnum", "-----XDDXXX-" }
391 size_t cnt;
392 int cls1, cls2;
393 uint32_t space_value;
394 struct charseq *space_seq;
395 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
396 int warned;
397 const void *key;
398 size_t len;
399 void *vdata;
400 void *curs;
402 /* Now resolve copying and also handle completely missing definitions. */
403 if (ctype == NULL)
405 const char *repertoire_name;
407 /* First see whether we were supposed to copy. If yes, find the
408 actual definition. */
409 if (locale->copy_name[LC_CTYPE] != NULL)
411 /* Find the copying locale. This has to happen transitively since
412 the locale we are copying from might also copying another one. */
413 struct localedef_t *from = locale;
416 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
417 from->repertoire_name, charmap);
418 while (from->categories[LC_CTYPE].ctype == NULL
419 && from->copy_name[LC_CTYPE] != NULL);
421 ctype = locale->categories[LC_CTYPE].ctype
422 = from->categories[LC_CTYPE].ctype;
425 /* If there is still no definition issue an warning and create an
426 empty one. */
427 if (ctype == NULL)
429 record_warning (_("\
430 No definition for %s category found"), "LC_CTYPE");
431 ctype_startup (NULL, locale, charmap, NULL, 0);
432 ctype = locale->categories[LC_CTYPE].ctype;
435 /* Get the repertoire we have to use. */
436 repertoire_name = locale->repertoire_name ?: repertoire_global;
437 if (repertoire_name != NULL)
438 ctype->repertoire = repertoire_read (repertoire_name);
441 /* We need the name of the currently used 8-bit character set to
442 make correct conversion between this 8-bit representation and the
443 ISO 10646 character set used internally for wide characters. */
444 ctype->codeset_name = charmap->code_set_name;
445 if (ctype->codeset_name == NULL)
447 record_error (0, 0, _("\
448 No character set name specified in charmap"));
449 ctype->codeset_name = "//UNKNOWN//";
452 /* Set default value for classes not specified. */
453 set_class_defaults (ctype, charmap, ctype->repertoire);
455 /* Check according to table. */
456 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
458 uint32_t tmp = ctype->class_collection[cnt];
460 if (tmp != 0)
462 for (cls1 = 0; cls1 < NCLASS; ++cls1)
463 if ((tmp & _ISwbit (cls1)) != 0)
464 for (cls2 = 0; cls2 < NCLASS; ++cls2)
465 if (valid_table[cls1].allow[cls2] != '-')
467 int eq = (tmp & _ISwbit (cls2)) != 0;
468 switch (valid_table[cls1].allow[cls2])
470 case 'M':
471 if (!eq)
473 uint32_t value = ctype->charnames[cnt];
475 record_error (0, 0, _("\
476 character L'\\u%0*x' in class `%s' must be in class `%s'"),
477 value > 0xffff ? 8 : 4,
478 value,
479 valid_table[cls1].name,
480 valid_table[cls2].name);
482 break;
484 case 'X':
485 if (eq)
487 uint32_t value = ctype->charnames[cnt];
489 record_error (0, 0, _("\
490 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
491 value > 0xffff ? 8 : 4,
492 value,
493 valid_table[cls1].name,
494 valid_table[cls2].name);
496 break;
498 case 'D':
499 ctype->class_collection[cnt] |= _ISwbit (cls2);
500 break;
502 default:
503 record_error (5, 0, _("\
504 internal error in %s, line %u"), __FUNCTION__, __LINE__);
510 for (cnt = 0; cnt < 256; ++cnt)
512 uint32_t tmp = ctype->class256_collection[cnt];
514 if (tmp != 0)
516 for (cls1 = 0; cls1 < NCLASS; ++cls1)
517 if ((tmp & _ISbit (cls1)) != 0)
518 for (cls2 = 0; cls2 < NCLASS; ++cls2)
519 if (valid_table[cls1].allow[cls2] != '-')
521 int eq = (tmp & _ISbit (cls2)) != 0;
522 switch (valid_table[cls1].allow[cls2])
524 case 'M':
525 if (!eq)
527 char buf[17];
529 snprintf (buf, sizeof buf, "\\%zo", cnt);
531 record_error (0, 0, _("\
532 character '%s' in class `%s' must be in class `%s'"),
533 buf,
534 valid_table[cls1].name,
535 valid_table[cls2].name);
537 break;
539 case 'X':
540 if (eq)
542 char buf[17];
544 snprintf (buf, sizeof buf, "\\%zo", cnt);
546 record_error (0, 0, _("\
547 character '%s' in class `%s' must not be in class `%s'"),
548 buf,
549 valid_table[cls1].name,
550 valid_table[cls2].name);
552 break;
554 case 'D':
555 ctype->class256_collection[cnt] |= _ISbit (cls2);
556 break;
558 default:
559 record_error (5, 0, _("\
560 internal error in %s, line %u"), __FUNCTION__, __LINE__);
566 /* ... and now test <SP> as a special case. */
567 space_value = 32;
568 if (((cnt = BITPOS (tok_space),
569 (ELEM (ctype, class_collection, , space_value)
570 & BITw (tok_space)) == 0)
571 || (cnt = BITPOS (tok_blank),
572 (ELEM (ctype, class_collection, , space_value)
573 & BITw (tok_blank)) == 0)))
575 record_error (0, 0, _("<SP> character not in class `%s'"),
576 valid_table[cnt].name);
578 else if (((cnt = BITPOS (tok_punct),
579 (ELEM (ctype, class_collection, , space_value)
580 & BITw (tok_punct)) != 0)
581 || (cnt = BITPOS (tok_graph),
582 (ELEM (ctype, class_collection, , space_value)
583 & BITw (tok_graph))
584 != 0)))
586 record_error (0, 0, _("\
587 <SP> character must not be in class `%s'"),
588 valid_table[cnt].name);
590 else
591 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
593 space_seq = charmap_find_value (charmap, "SP", 2);
594 if (space_seq == NULL)
595 space_seq = charmap_find_value (charmap, "space", 5);
596 if (space_seq == NULL)
597 space_seq = charmap_find_value (charmap, "U00000020", 9);
598 if (space_seq == NULL || space_seq->nbytes != 1)
600 record_error (0, 0, _("\
601 character <SP> not defined in character map"));
603 else if (((cnt = BITPOS (tok_space),
604 (ctype->class256_collection[space_seq->bytes[0]]
605 & BIT (tok_space)) == 0)
606 || (cnt = BITPOS (tok_blank),
607 (ctype->class256_collection[space_seq->bytes[0]]
608 & BIT (tok_blank)) == 0)))
610 record_error (0, 0, _("<SP> character not in class `%s'"),
611 valid_table[cnt].name);
613 else if (((cnt = BITPOS (tok_punct),
614 (ctype->class256_collection[space_seq->bytes[0]]
615 & BIT (tok_punct)) != 0)
616 || (cnt = BITPOS (tok_graph),
617 (ctype->class256_collection[space_seq->bytes[0]]
618 & BIT (tok_graph)) != 0)))
620 record_error (0, 0, _("\
621 <SP> character must not be in class `%s'"),
622 valid_table[cnt].name);
624 else
625 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
627 /* Check whether all single-byte characters make to their upper/lowercase
628 equivalent according to the ASCII rules. */
629 for (cnt = 'A'; cnt <= 'Z'; ++cnt)
631 uint32_t uppval = ctype->map256_collection[0][cnt];
632 uint32_t lowval = ctype->map256_collection[1][cnt];
633 uint32_t lowuppval = ctype->map256_collection[0][lowval];
634 uint32_t lowlowval = ctype->map256_collection[1][lowval];
636 if (uppval != cnt
637 || lowval != cnt + 0x20
638 || lowuppval != cnt
639 || lowlowval != cnt + 0x20)
640 ctype->nonascii_case = 1;
642 for (cnt = 0; cnt < 256; ++cnt)
643 if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
644 if (ctype->map256_collection[0][cnt] != cnt
645 || ctype->map256_collection[1][cnt] != cnt)
646 ctype->nonascii_case = 1;
648 /* Now that the tests are done make sure the name array contains all
649 characters which are handled in the WIDTH section of the
650 character set definition file. */
651 if (charmap->width_rules != NULL)
652 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
654 unsigned char bytes[charmap->mb_cur_max];
655 int nbytes = charmap->width_rules[cnt].from->nbytes;
657 /* We have the range of character for which the width is
658 specified described using byte sequences of the multibyte
659 charset. We have to convert this to UCS4 now. And we
660 cannot simply convert the beginning and the end of the
661 sequence, we have to iterate over the byte sequence and
662 convert it for every single character. */
663 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
665 while (nbytes < charmap->width_rules[cnt].to->nbytes
666 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
667 nbytes) <= 0)
669 /* Find the UCS value for `bytes'. */
670 int inner;
671 uint32_t wch;
672 struct charseq *seq
673 = charmap_find_symbol (charmap, (char *) bytes, nbytes);
675 if (seq == NULL)
676 wch = ILLEGAL_CHAR_VALUE;
677 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
678 wch = seq->ucs4;
679 else
680 wch = repertoire_find_value (ctype->repertoire, seq->name,
681 strlen (seq->name));
683 if (wch != ILLEGAL_CHAR_VALUE)
684 /* We are only interested in the side-effects of the
685 `find_idx' call. It will add appropriate entries in
686 the name array if this is necessary. */
687 (void) find_idx (ctype, NULL, NULL, NULL, wch);
689 /* "Increment" the bytes sequence. */
690 inner = nbytes - 1;
691 while (inner >= 0 && bytes[inner] == 0xff)
692 --inner;
694 if (inner < 0)
696 /* We have to extend the byte sequence. */
697 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
698 break;
700 bytes[0] = 1;
701 memset (&bytes[1], 0, nbytes);
702 ++nbytes;
704 else
706 ++bytes[inner];
707 while (++inner < nbytes)
708 bytes[inner] = 0;
713 /* Now set all the other characters of the character set to the
714 default width. */
715 curs = NULL;
716 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
718 struct charseq *data = (struct charseq *) vdata;
720 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
721 data->ucs4 = repertoire_find_value (ctype->repertoire,
722 data->name, len);
724 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
725 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
728 /* There must be a multiple of 10 digits. */
729 if (ctype->mbdigits_act % 10 != 0)
731 assert (ctype->mbdigits_act == ctype->wcdigits_act);
732 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
733 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
734 record_error (0, 0, _("\
735 `digit' category has not entries in groups of ten"));
738 /* Check the input digits. There must be a multiple of ten available.
739 In each group it could be that one or the other character is missing.
740 In this case the whole group must be removed. */
741 cnt = 0;
742 while (cnt < ctype->mbdigits_act)
744 size_t inner;
745 for (inner = 0; inner < 10; ++inner)
746 if (ctype->mbdigits[cnt + inner] == NULL)
747 break;
749 if (inner == 10)
750 cnt += 10;
751 else
753 /* Remove the group. */
754 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
755 ((ctype->wcdigits_act - cnt - 10)
756 * sizeof (ctype->mbdigits[0])));
757 ctype->mbdigits_act -= 10;
761 /* If no input digits are given use the default. */
762 if (ctype->mbdigits_act == 0)
764 if (ctype->mbdigits_max == 0)
766 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
767 10 * sizeof (struct charseq *));
768 ctype->mbdigits_max = 10;
771 for (cnt = 0; cnt < 10; ++cnt)
773 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
774 (char *) digits + cnt, 1);
775 if (ctype->mbdigits[cnt] == NULL)
777 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
778 longnames[cnt],
779 strlen (longnames[cnt]));
780 if (ctype->mbdigits[cnt] == NULL)
782 /* Hum, this ain't good. */
783 record_error (0, 0, _("\
784 no input digits defined and none of the standard names in the charmap"));
786 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
787 sizeof (struct charseq) + 1);
789 /* This is better than nothing. */
790 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
791 ctype->mbdigits[cnt]->nbytes = 1;
796 ctype->mbdigits_act = 10;
799 /* Check the wide character input digits. There must be a multiple
800 of ten available. In each group it could be that one or the other
801 character is missing. In this case the whole group must be
802 removed. */
803 cnt = 0;
804 while (cnt < ctype->wcdigits_act)
806 size_t inner;
807 for (inner = 0; inner < 10; ++inner)
808 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
809 break;
811 if (inner == 10)
812 cnt += 10;
813 else
815 /* Remove the group. */
816 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
817 ((ctype->wcdigits_act - cnt - 10)
818 * sizeof (ctype->wcdigits[0])));
819 ctype->wcdigits_act -= 10;
823 /* If no input digits are given use the default. */
824 if (ctype->wcdigits_act == 0)
826 if (ctype->wcdigits_max == 0)
828 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
829 10 * sizeof (uint32_t));
830 ctype->wcdigits_max = 10;
833 for (cnt = 0; cnt < 10; ++cnt)
834 ctype->wcdigits[cnt] = L'0' + cnt;
836 ctype->mbdigits_act = 10;
839 /* Check the outdigits. */
840 warned = 0;
841 for (cnt = 0; cnt < 10; ++cnt)
842 if (ctype->mboutdigits[cnt] == NULL)
844 if (!warned)
846 record_error (0, 0, _("\
847 not all characters used in `outdigit' are available in the charmap"));
848 warned = 1;
851 static const struct charseq replace =
853 .nbytes = 1,
854 .bytes = "?",
856 ctype->mboutdigits[cnt] = (struct charseq *) &replace;
859 warned = 0;
860 for (cnt = 0; cnt < 10; ++cnt)
861 if (ctype->wcoutdigits[cnt] == 0)
863 if (!warned)
865 record_error (0, 0, _("\
866 not all characters used in `outdigit' are available in the repertoire"));
867 warned = 1;
870 ctype->wcoutdigits[cnt] = L'?';
873 /* Sort the entries in the translit_ignore list. */
874 if (ctype->translit_ignore != NULL)
876 struct translit_ignore_t *firstp = ctype->translit_ignore;
877 struct translit_ignore_t *runp;
879 ctype->ntranslit_ignore = 1;
881 for (runp = firstp->next; runp != NULL; runp = runp->next)
883 struct translit_ignore_t *lastp = NULL;
884 struct translit_ignore_t *cmpp;
886 ++ctype->ntranslit_ignore;
888 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
889 if (runp->from < cmpp->from)
890 break;
892 runp->next = lastp;
893 if (lastp == NULL)
894 firstp = runp;
897 ctype->translit_ignore = firstp;
902 void
903 ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
904 const char *output_path)
906 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
907 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
908 + ctype->nr_charclass + ctype->map_collection_nr);
909 struct locale_file file;
910 uint32_t default_missing_len;
911 size_t elem, cnt;
913 /* Now prepare the output: Find the sizes of the table we can use. */
914 allocate_arrays (ctype, charmap, ctype->repertoire);
916 default_missing_len = (ctype->default_missing
917 ? wcslen ((wchar_t *) ctype->default_missing)
918 : 0);
920 init_locale_data (&file, nelems);
921 for (elem = 0; elem < nelems; ++elem)
923 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
924 switch (elem)
926 #define CTYPE_EMPTY(name) \
927 case name: \
928 add_locale_empty (&file); \
929 break
931 CTYPE_EMPTY(_NL_CTYPE_GAP1);
932 CTYPE_EMPTY(_NL_CTYPE_GAP2);
933 CTYPE_EMPTY(_NL_CTYPE_GAP3);
934 CTYPE_EMPTY(_NL_CTYPE_GAP4);
935 CTYPE_EMPTY(_NL_CTYPE_GAP5);
936 CTYPE_EMPTY(_NL_CTYPE_GAP6);
938 #define CTYPE_RAW_DATA(name, base, size) \
939 case _NL_ITEM_INDEX (name): \
940 add_locale_raw_data (&file, base, size); \
941 break
943 CTYPE_RAW_DATA (_NL_CTYPE_CLASS,
944 ctype->ctype_b,
945 (256 + 128) * sizeof (char_class_t));
947 #define CTYPE_UINT32_ARRAY(name, base, n_elems) \
948 case _NL_ITEM_INDEX (name): \
949 add_locale_uint32_array (&file, base, n_elems); \
950 break
952 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128);
953 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128);
954 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256);
955 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256);
956 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32,
957 ctype->ctype32_b,
958 256 * sizeof (char_class32_t));
960 #define CTYPE_UINT32(name, value) \
961 case _NL_ITEM_INDEX (name): \
962 add_locale_uint32 (&file, value); \
963 break
965 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset);
966 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset);
967 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size);
969 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX,
970 ctype->translit_from_idx,
971 ctype->translit_idx_size);
973 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL,
974 ctype->translit_from_tbl,
975 ctype->translit_from_tbl_size
976 / sizeof (uint32_t));
978 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX,
979 ctype->translit_to_idx,
980 ctype->translit_idx_size);
982 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL,
983 ctype->translit_to_tbl,
984 ctype->translit_to_tbl_size / sizeof (uint32_t));
986 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
987 /* The class name array. */
988 start_locale_structure (&file);
989 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
990 add_locale_string (&file, ctype->classnames[cnt]);
991 add_locale_char (&file, 0);
992 align_locale_data (&file, LOCFILE_ALIGN);
993 end_locale_structure (&file);
994 break;
996 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
997 /* The class name array. */
998 start_locale_structure (&file);
999 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1000 add_locale_string (&file, ctype->mapnames[cnt]);
1001 add_locale_char (&file, 0);
1002 align_locale_data (&file, LOCFILE_ALIGN);
1003 end_locale_structure (&file);
1004 break;
1006 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH):
1007 add_locale_wcwidth_table (&file, &ctype->width);
1008 break;
1010 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max);
1012 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1013 add_locale_string (&file, ctype->codeset_name);
1014 break;
1016 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii);
1018 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case);
1020 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1021 add_locale_uint32 (&file, ctype->mbdigits_act / 10);
1022 break;
1024 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1025 add_locale_uint32 (&file, ctype->wcdigits_act / 10);
1026 break;
1028 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1029 start_locale_structure (&file);
1030 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1031 cnt < ctype->mbdigits_act; cnt += 10)
1033 add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes,
1034 ctype->mbdigits[cnt]->nbytes);
1035 add_locale_char (&file, 0);
1037 end_locale_structure (&file);
1038 break;
1040 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1041 start_locale_structure (&file);
1042 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1043 add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes,
1044 ctype->mboutdigits[cnt]->nbytes);
1045 add_locale_char (&file, 0);
1046 end_locale_structure (&file);
1047 break;
1049 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1050 start_locale_structure (&file);
1051 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1052 cnt < ctype->wcdigits_act; cnt += 10)
1053 add_locale_uint32 (&file, ctype->wcdigits[cnt]);
1054 end_locale_structure (&file);
1055 break;
1057 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1058 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1059 add_locale_uint32 (&file, ctype->wcoutdigits[cnt]);
1060 break;
1062 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1063 add_locale_uint32 (&file, default_missing_len);
1064 break;
1066 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1067 add_locale_uint32_array (&file, ctype->default_missing,
1068 default_missing_len);
1069 break;
1071 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1072 add_locale_uint32 (&file, ctype->ntranslit_ignore);
1073 break;
1075 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1076 start_locale_structure (&file);
1078 struct translit_ignore_t *runp;
1079 for (runp = ctype->translit_ignore; runp != NULL;
1080 runp = runp->next)
1082 add_locale_uint32 (&file, runp->from);
1083 add_locale_uint32 (&file, runp->to);
1084 add_locale_uint32 (&file, runp->step);
1087 end_locale_structure (&file);
1088 break;
1090 default:
1091 assert (! "unknown CTYPE element");
1093 else
1095 /* Handle extra maps. */
1096 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1097 if (nr < ctype->nr_charclass)
1099 start_locale_prelude (&file);
1100 add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32);
1101 end_locale_prelude (&file);
1102 add_locale_wctype_table (&file, &ctype->class_3level[nr]);
1104 else
1106 nr -= ctype->nr_charclass;
1107 assert (nr < ctype->map_collection_nr);
1108 add_locale_wctrans_table (&file, &ctype->map_3level[nr]);
1113 write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file);
1117 /* Local functions. */
1118 static void
1119 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1120 const char *name)
1122 size_t cnt;
1124 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1125 if (strcmp (ctype->classnames[cnt], name) == 0)
1126 break;
1128 if (cnt < ctype->nr_charclass)
1130 lr_error (lr, _("character class `%s' already defined"), name);
1131 return;
1134 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1135 /* Exit code 2 is prescribed in P1003.2b. */
1136 record_error (2, 0, _("\
1137 implementation limit: no more than %Zd character classes allowed"),
1138 MAX_NR_CHARCLASS);
1140 ctype->classnames[ctype->nr_charclass++] = name;
1144 static void
1145 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1146 const char *name, const struct charmap_t *charmap)
1148 size_t max_chars = 0;
1149 size_t cnt;
1151 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1153 if (strcmp (ctype->mapnames[cnt], name) == 0)
1154 break;
1156 if (max_chars < ctype->map_collection_max[cnt])
1157 max_chars = ctype->map_collection_max[cnt];
1160 if (cnt < ctype->map_collection_nr)
1162 lr_error (lr, _("character map `%s' already defined"), name);
1163 return;
1166 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1167 /* Exit code 2 is prescribed in P1003.2b. */
1168 record_error (2, 0, _("\
1169 implementation limit: no more than %d character maps allowed"),
1170 MAX_NR_CHARMAP);
1172 ctype->mapnames[cnt] = name;
1174 if (max_chars == 0)
1175 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1176 else
1177 ctype->map_collection_max[cnt] = max_chars;
1179 ctype->map_collection[cnt] = (uint32_t *)
1180 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1181 ctype->map_collection_act[cnt] = 256;
1183 ++ctype->map_collection_nr;
1187 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1188 is possible if we only want to extend the name array. */
1189 static uint32_t *
1190 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1191 size_t *act, uint32_t idx)
1193 size_t cnt;
1195 if (idx < 256)
1196 return table == NULL ? NULL : &(*table)[idx];
1198 /* Use the charnames_idx lookup table instead of the slow search loop. */
1199 #if 1
1200 cnt = idx_table_get (&ctype->charnames_idx, idx);
1201 if (cnt == EMPTY)
1202 /* Not found. */
1203 cnt = ctype->charnames_act;
1204 #else
1205 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1206 if (ctype->charnames[cnt] == idx)
1207 break;
1208 #endif
1210 /* We have to distinguish two cases: the name is found or not. */
1211 if (cnt == ctype->charnames_act)
1213 /* Extend the name array. */
1214 if (ctype->charnames_act == ctype->charnames_max)
1216 ctype->charnames_max *= 2;
1217 ctype->charnames = (uint32_t *)
1218 xrealloc (ctype->charnames,
1219 sizeof (uint32_t) * ctype->charnames_max);
1221 ctype->charnames[ctype->charnames_act++] = idx;
1222 idx_table_add (&ctype->charnames_idx, idx, cnt);
1225 if (table == NULL)
1226 /* We have done everything we are asked to do. */
1227 return NULL;
1229 if (max == NULL)
1230 /* The caller does not want to extend the table. */
1231 return (cnt >= *act ? NULL : &(*table)[cnt]);
1233 if (cnt >= *act)
1235 if (cnt >= *max)
1237 size_t old_max = *max;
1239 *max *= 2;
1240 while (*max <= cnt);
1242 *table =
1243 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1244 memset (&(*table)[old_max], '\0',
1245 (*max - old_max) * sizeof (uint32_t));
1248 *act = cnt + 1;
1251 return &(*table)[cnt];
1255 static int
1256 get_character (struct token *now, const struct charmap_t *charmap,
1257 struct repertoire_t *repertoire,
1258 struct charseq **seqp, uint32_t *wchp)
1260 if (now->tok == tok_bsymbol)
1262 /* This will hopefully be the normal case. */
1263 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1264 now->val.str.lenmb);
1265 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1266 now->val.str.lenmb);
1268 else if (now->tok == tok_ucs4)
1270 char utmp[10];
1272 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1273 *seqp = charmap_find_value (charmap, utmp, 9);
1275 if (*seqp == NULL)
1276 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1278 if (*seqp == NULL)
1280 /* Compute the value in the charmap from the UCS value. */
1281 const char *symbol = repertoire_find_symbol (repertoire,
1282 now->val.ucs4);
1284 if (symbol == NULL)
1285 *seqp = NULL;
1286 else
1287 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1289 if (*seqp == NULL)
1291 if (repertoire != NULL)
1293 /* Insert a negative entry. */
1294 static const struct charseq negative
1295 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1296 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1297 sizeof (uint32_t));
1298 *newp = now->val.ucs4;
1300 insert_entry (&repertoire->seq_table, newp,
1301 sizeof (uint32_t), (void *) &negative);
1304 else
1305 (*seqp)->ucs4 = now->val.ucs4;
1307 else if ((*seqp)->ucs4 != now->val.ucs4)
1308 *seqp = NULL;
1310 *wchp = now->val.ucs4;
1312 else if (now->tok == tok_charcode)
1314 /* We must map from the byte code to UCS4. */
1315 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1316 now->val.str.lenmb);
1318 if (*seqp == NULL)
1319 *wchp = ILLEGAL_CHAR_VALUE;
1320 else
1322 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1323 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1324 strlen ((*seqp)->name));
1325 *wchp = (*seqp)->ucs4;
1328 else
1329 return 1;
1331 return 0;
1335 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1336 the .(2). counterparts. */
1337 static void
1338 charclass_symbolic_ellipsis (struct linereader *ldfile,
1339 struct locale_ctype_t *ctype,
1340 const struct charmap_t *charmap,
1341 struct repertoire_t *repertoire,
1342 struct token *now,
1343 const char *last_str,
1344 unsigned long int class256_bit,
1345 unsigned long int class_bit, int base,
1346 int ignore_content, int handle_digits, int step)
1348 const char *nowstr = now->val.str.startmb;
1349 char tmp[now->val.str.lenmb + 1];
1350 const char *cp;
1351 char *endp;
1352 unsigned long int from;
1353 unsigned long int to;
1355 /* We have to compute the ellipsis values using the symbolic names. */
1356 assert (last_str != NULL);
1358 if (strlen (last_str) != now->val.str.lenmb)
1360 invalid_range:
1361 lr_error (ldfile,
1362 _("`%s' and `%.*s' are not valid names for symbolic range"),
1363 last_str, (int) now->val.str.lenmb, nowstr);
1364 return;
1367 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1368 /* Nothing to do, the names are the same. */
1369 return;
1371 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1374 errno = 0;
1375 from = strtoul (cp, &endp, base);
1376 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1377 goto invalid_range;
1379 to = strtoul (nowstr + (cp - last_str), &endp, base);
1380 if ((to == UINT_MAX && errno == ERANGE)
1381 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1382 goto invalid_range;
1384 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1385 if (!ignore_content)
1387 now->val.str.startmb = tmp;
1388 while ((from += step) <= to)
1390 struct charseq *seq;
1391 uint32_t wch;
1393 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1394 (int) (cp - last_str), last_str,
1395 (int) (now->val.str.lenmb - (cp - last_str)),
1396 from);
1398 if (get_character (now, charmap, repertoire, &seq, &wch))
1399 goto invalid_range;
1401 if (seq != NULL && seq->nbytes == 1)
1402 /* Yep, we can store information about this byte sequence. */
1403 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1405 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1406 /* We have the UCS4 position. */
1407 *find_idx (ctype, &ctype->class_collection,
1408 &ctype->class_collection_max,
1409 &ctype->class_collection_act, wch) |= class_bit;
1411 if (handle_digits == 1)
1413 /* We must store the digit values. */
1414 if (ctype->mbdigits_act == ctype->mbdigits_max)
1416 ctype->mbdigits_max *= 2;
1417 ctype->mbdigits = xrealloc (ctype->mbdigits,
1418 (ctype->mbdigits_max
1419 * sizeof (char *)));
1420 ctype->wcdigits_max *= 2;
1421 ctype->wcdigits = xrealloc (ctype->wcdigits,
1422 (ctype->wcdigits_max
1423 * sizeof (uint32_t)));
1426 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1427 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1429 else if (handle_digits == 2)
1431 /* We must store the digit values. */
1432 if (ctype->outdigits_act >= 10)
1434 lr_error (ldfile, _("\
1435 %s: field `%s' does not contain exactly ten entries"),
1436 "LC_CTYPE", "outdigit");
1437 return;
1440 ctype->mboutdigits[ctype->outdigits_act] = seq;
1441 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1442 ++ctype->outdigits_act;
1449 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1450 static void
1451 charclass_ucs4_ellipsis (struct linereader *ldfile,
1452 struct locale_ctype_t *ctype,
1453 const struct charmap_t *charmap,
1454 struct repertoire_t *repertoire,
1455 struct token *now, uint32_t last_wch,
1456 unsigned long int class256_bit,
1457 unsigned long int class_bit, int ignore_content,
1458 int handle_digits, int step)
1460 if (last_wch > now->val.ucs4)
1462 lr_error (ldfile, _("\
1463 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1464 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1465 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1466 return;
1469 if (!ignore_content)
1470 while ((last_wch += step) <= now->val.ucs4)
1472 /* We have to find out whether there is a byte sequence corresponding
1473 to this UCS4 value. */
1474 struct charseq *seq;
1475 char utmp[10];
1477 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1478 seq = charmap_find_value (charmap, utmp, 9);
1479 if (seq == NULL)
1481 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1482 seq = charmap_find_value (charmap, utmp, 5);
1485 if (seq == NULL)
1486 /* Try looking in the repertoire map. */
1487 seq = repertoire_find_seq (repertoire, last_wch);
1489 /* If this is the first time we look for this sequence create a new
1490 entry. */
1491 if (seq == NULL)
1493 static const struct charseq negative
1494 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1496 /* Find the symbolic name for this UCS4 value. */
1497 if (repertoire != NULL)
1499 const char *symbol = repertoire_find_symbol (repertoire,
1500 last_wch);
1501 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1502 sizeof (uint32_t));
1503 *newp = last_wch;
1505 if (symbol != NULL)
1506 /* We have a name, now search the multibyte value. */
1507 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1509 if (seq == NULL)
1510 /* We have to create a fake entry. */
1511 seq = (struct charseq *) &negative;
1512 else
1513 seq->ucs4 = last_wch;
1515 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1516 seq);
1518 else
1519 /* We have to create a fake entry. */
1520 seq = (struct charseq *) &negative;
1523 /* We have a name, now search the multibyte value. */
1524 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1525 /* Yep, we can store information about this byte sequence. */
1526 ctype->class256_collection[(size_t) seq->bytes[0]]
1527 |= class256_bit;
1529 /* And of course we have the UCS4 position. */
1530 if (class_bit != 0)
1531 *find_idx (ctype, &ctype->class_collection,
1532 &ctype->class_collection_max,
1533 &ctype->class_collection_act, last_wch) |= class_bit;
1535 if (handle_digits == 1)
1537 /* We must store the digit values. */
1538 if (ctype->mbdigits_act == ctype->mbdigits_max)
1540 ctype->mbdigits_max *= 2;
1541 ctype->mbdigits = xrealloc (ctype->mbdigits,
1542 (ctype->mbdigits_max
1543 * sizeof (char *)));
1544 ctype->wcdigits_max *= 2;
1545 ctype->wcdigits = xrealloc (ctype->wcdigits,
1546 (ctype->wcdigits_max
1547 * sizeof (uint32_t)));
1550 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1551 ? seq : NULL);
1552 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1554 else if (handle_digits == 2)
1556 /* We must store the digit values. */
1557 if (ctype->outdigits_act >= 10)
1559 lr_error (ldfile, _("\
1560 %s: field `%s' does not contain exactly ten entries"),
1561 "LC_CTYPE", "outdigit");
1562 return;
1565 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1566 ? seq : NULL);
1567 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1568 ++ctype->outdigits_act;
1574 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1575 static void
1576 charclass_charcode_ellipsis (struct linereader *ldfile,
1577 struct locale_ctype_t *ctype,
1578 const struct charmap_t *charmap,
1579 struct repertoire_t *repertoire,
1580 struct token *now, char *last_charcode,
1581 uint32_t last_charcode_len,
1582 unsigned long int class256_bit,
1583 unsigned long int class_bit, int ignore_content,
1584 int handle_digits)
1586 /* First check whether the to-value is larger. */
1587 if (now->val.charcode.nbytes != last_charcode_len)
1589 lr_error (ldfile, _("\
1590 start and end character sequence of range must have the same length"));
1591 return;
1594 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1596 lr_error (ldfile, _("\
1597 to-value character sequence is smaller than from-value sequence"));
1598 return;
1601 if (!ignore_content)
1605 /* Increment the byte sequence value. */
1606 struct charseq *seq;
1607 uint32_t wch;
1608 int i;
1610 for (i = last_charcode_len - 1; i >= 0; --i)
1611 if (++last_charcode[i] != 0)
1612 break;
1614 if (last_charcode_len == 1)
1615 /* Of course we have the charcode value. */
1616 ctype->class256_collection[(size_t) last_charcode[0]]
1617 |= class256_bit;
1619 /* Find the symbolic name. */
1620 seq = charmap_find_symbol (charmap, last_charcode,
1621 last_charcode_len);
1622 if (seq != NULL)
1624 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1625 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1626 strlen (seq->name));
1627 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1629 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1630 *find_idx (ctype, &ctype->class_collection,
1631 &ctype->class_collection_max,
1632 &ctype->class_collection_act, wch) |= class_bit;
1634 else
1635 wch = ILLEGAL_CHAR_VALUE;
1637 if (handle_digits == 1)
1639 /* We must store the digit values. */
1640 if (ctype->mbdigits_act == ctype->mbdigits_max)
1642 ctype->mbdigits_max *= 2;
1643 ctype->mbdigits = xrealloc (ctype->mbdigits,
1644 (ctype->mbdigits_max
1645 * sizeof (char *)));
1646 ctype->wcdigits_max *= 2;
1647 ctype->wcdigits = xrealloc (ctype->wcdigits,
1648 (ctype->wcdigits_max
1649 * sizeof (uint32_t)));
1652 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1653 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1654 seq->nbytes = last_charcode_len;
1656 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1657 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1659 else if (handle_digits == 2)
1661 struct charseq *seq;
1662 /* We must store the digit values. */
1663 if (ctype->outdigits_act >= 10)
1665 lr_error (ldfile, _("\
1666 %s: field `%s' does not contain exactly ten entries"),
1667 "LC_CTYPE", "outdigit");
1668 return;
1671 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1672 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1673 seq->nbytes = last_charcode_len;
1675 ctype->mboutdigits[ctype->outdigits_act] = seq;
1676 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1677 ++ctype->outdigits_act;
1680 while (memcmp (last_charcode, now->val.charcode.bytes,
1681 last_charcode_len) != 0);
1686 static uint32_t *
1687 find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1688 uint32_t wch)
1690 struct translit_t *trunp = ctype->translit;
1691 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1693 while (trunp != NULL)
1695 /* XXX We simplify things here. The transliterations we look
1696 for are only allowed to have one character. */
1697 if (trunp->from[0] == wch && trunp->from[1] == 0)
1699 /* Found it. Now look for a transliteration which can be
1700 represented with the character set. */
1701 struct translit_to_t *torunp = trunp->to;
1703 while (torunp != NULL)
1705 int i;
1707 for (i = 0; torunp->str[i] != 0; ++i)
1709 char utmp[10];
1711 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1712 if (charmap_find_value (charmap, utmp, 9) == NULL)
1713 /* This character cannot be represented. */
1714 break;
1717 if (torunp->str[i] == 0)
1718 return torunp->str;
1720 torunp = torunp->next;
1723 break;
1726 trunp = trunp->next;
1729 /* Check for ignored chars. */
1730 while (tirunp != NULL)
1732 if (tirunp->from <= wch && tirunp->to >= wch)
1734 uint32_t wi;
1736 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1737 if (wi == wch)
1738 return no_str;
1742 /* Nothing found. */
1743 return NULL;
1747 uint32_t *
1748 find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1749 uint32_t wch)
1751 struct locale_ctype_t *ctype;
1752 uint32_t *result = NULL;
1754 assert (locale != NULL);
1755 ctype = locale->categories[LC_CTYPE].ctype;
1757 if (ctype == NULL)
1758 return NULL;
1760 if (ctype->translit != NULL)
1761 result = find_translit2 (ctype, charmap, wch);
1763 if (result == NULL)
1765 struct translit_include_t *irunp = ctype->translit_include;
1767 while (irunp != NULL && result == NULL)
1769 result = find_translit (find_locale (CTYPE_LOCALE,
1770 irunp->copy_locale,
1771 irunp->copy_repertoire,
1772 charmap),
1773 charmap, wch);
1774 irunp = irunp->next;
1778 return result;
1782 /* Read one transliteration entry. */
1783 static uint32_t *
1784 read_widestring (struct linereader *ldfile, struct token *now,
1785 const struct charmap_t *charmap,
1786 struct repertoire_t *repertoire)
1788 uint32_t *wstr;
1790 if (now->tok == tok_default_missing)
1791 /* The special name "" will denote this case. */
1792 wstr = no_str;
1793 else if (now->tok == tok_bsymbol)
1795 /* Get the value from the repertoire. */
1796 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1797 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1798 now->val.str.lenmb);
1799 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1801 /* We cannot proceed, we don't know the UCS4 value. */
1802 free (wstr);
1803 return NULL;
1806 wstr[1] = 0;
1808 else if (now->tok == tok_ucs4)
1810 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1811 wstr[0] = now->val.ucs4;
1812 wstr[1] = 0;
1814 else if (now->tok == tok_charcode)
1816 /* Argh, we have to convert to the symbol name first and then to the
1817 UCS4 value. */
1818 struct charseq *seq = charmap_find_symbol (charmap,
1819 now->val.str.startmb,
1820 now->val.str.lenmb);
1821 if (seq == NULL)
1822 /* Cannot find the UCS4 value. */
1823 return NULL;
1825 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1826 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1827 strlen (seq->name));
1828 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1829 /* We cannot proceed, we don't know the UCS4 value. */
1830 return NULL;
1832 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1833 wstr[0] = seq->ucs4;
1834 wstr[1] = 0;
1836 else if (now->tok == tok_string)
1838 wstr = now->val.str.startwc;
1839 if (wstr == NULL || wstr[0] == 0)
1840 return NULL;
1842 else
1844 if (now->tok != tok_eol && now->tok != tok_eof)
1845 lr_ignore_rest (ldfile, 0);
1846 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1847 return (uint32_t *) -1l;
1850 return wstr;
1854 static void
1855 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1856 struct token *now, const struct charmap_t *charmap,
1857 struct repertoire_t *repertoire)
1859 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1860 struct translit_t *result;
1861 struct translit_to_t **top;
1862 struct obstack *ob = &ctype->mempool;
1863 int first;
1864 int ignore;
1866 if (from_wstr == NULL)
1867 /* There is no valid from string. */
1868 return;
1870 result = (struct translit_t *) obstack_alloc (ob,
1871 sizeof (struct translit_t));
1872 result->from = from_wstr;
1873 result->fname = ldfile->fname;
1874 result->lineno = ldfile->lineno;
1875 result->next = NULL;
1876 result->to = NULL;
1877 top = &result->to;
1878 first = 1;
1879 ignore = 0;
1881 while (1)
1883 uint32_t *to_wstr;
1885 /* Next we have one or more transliterations. They are
1886 separated by semicolons. */
1887 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1889 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1891 /* One string read. */
1892 const uint32_t zero = 0;
1894 if (!ignore)
1896 obstack_grow (ob, &zero, 4);
1897 to_wstr = obstack_finish (ob);
1899 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1900 (*top)->str = to_wstr;
1901 (*top)->next = NULL;
1904 if (now->tok == tok_eol)
1906 result->next = ctype->translit;
1907 ctype->translit = result;
1908 return;
1911 if (!ignore)
1912 top = &(*top)->next;
1913 ignore = 0;
1915 else
1917 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1918 if (to_wstr == (uint32_t *) -1l)
1920 /* An error occurred. */
1921 obstack_free (ob, result);
1922 return;
1925 if (to_wstr == NULL)
1926 ignore = 1;
1927 else
1928 /* This value is usable. */
1929 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1931 first = 0;
1937 static void
1938 read_translit_ignore_entry (struct linereader *ldfile,
1939 struct locale_ctype_t *ctype,
1940 const struct charmap_t *charmap,
1941 struct repertoire_t *repertoire)
1943 /* We expect a semicolon-separated list of characters we ignore. We are
1944 only interested in the wide character definitions. These must be
1945 single characters, possibly defining a range when an ellipsis is used. */
1946 while (1)
1948 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
1949 verbose);
1950 struct translit_ignore_t *newp;
1951 uint32_t from;
1953 if (now->tok == tok_eol || now->tok == tok_eof)
1955 lr_error (ldfile,
1956 _("premature end of `translit_ignore' definition"));
1957 return;
1960 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1962 lr_error (ldfile, _("syntax error"));
1963 lr_ignore_rest (ldfile, 0);
1964 return;
1967 if (now->tok == tok_ucs4)
1968 from = now->val.ucs4;
1969 else
1970 /* Try to get the value. */
1971 from = repertoire_find_value (repertoire, now->val.str.startmb,
1972 now->val.str.lenmb);
1974 if (from == ILLEGAL_CHAR_VALUE)
1976 lr_error (ldfile, "invalid character name");
1977 newp = NULL;
1979 else
1981 newp = (struct translit_ignore_t *)
1982 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
1983 newp->from = from;
1984 newp->to = from;
1985 newp->step = 1;
1987 newp->next = ctype->translit_ignore;
1988 ctype->translit_ignore = newp;
1991 /* Now we expect either a semicolon, an ellipsis, or the end of the
1992 line. */
1993 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1995 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
1997 /* XXX Should we bother implementing `....'? `...' certainly
1998 will not be implemented. */
1999 uint32_t to;
2000 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2002 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2004 if (now->tok == tok_eol || now->tok == tok_eof)
2006 lr_error (ldfile,
2007 _("premature end of `translit_ignore' definition"));
2008 return;
2011 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2013 lr_error (ldfile, _("syntax error"));
2014 lr_ignore_rest (ldfile, 0);
2015 return;
2018 if (now->tok == tok_ucs4)
2019 to = now->val.ucs4;
2020 else
2021 /* Try to get the value. */
2022 to = repertoire_find_value (repertoire, now->val.str.startmb,
2023 now->val.str.lenmb);
2025 if (to == ILLEGAL_CHAR_VALUE)
2026 lr_error (ldfile, "invalid character name");
2027 else
2029 /* Make sure the `to'-value is larger. */
2030 if (to >= from)
2032 newp->to = to;
2033 newp->step = step;
2035 else
2036 lr_error (ldfile, _("\
2037 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2038 (to | from) < 65536 ? 4 : 8, to,
2039 (to | from) < 65536 ? 4 : 8, from);
2042 /* And the next token. */
2043 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2046 if (now->tok == tok_eol || now->tok == tok_eof)
2047 /* We are done. */
2048 return;
2050 if (now->tok == tok_semicolon)
2051 /* Next round. */
2052 continue;
2054 /* If we come here something is wrong. */
2055 lr_error (ldfile, _("syntax error"));
2056 lr_ignore_rest (ldfile, 0);
2057 return;
2062 /* The parser for the LC_CTYPE section of the locale definition. */
2063 void
2064 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2065 const struct charmap_t *charmap, const char *repertoire_name,
2066 int ignore_content)
2068 struct repertoire_t *repertoire = NULL;
2069 struct locale_ctype_t *ctype;
2070 struct token *now;
2071 enum token_t nowtok;
2072 size_t cnt;
2073 uint32_t last_wch = 0;
2074 enum token_t last_token;
2075 enum token_t ellipsis_token;
2076 int step;
2077 char last_charcode[16];
2078 size_t last_charcode_len = 0;
2079 const char *last_str = NULL;
2080 int mapidx;
2081 struct localedef_t *copy_locale = NULL;
2083 /* Get the repertoire we have to use. */
2084 if (repertoire_name != NULL)
2085 repertoire = repertoire_read (repertoire_name);
2087 /* The rest of the line containing `LC_CTYPE' must be free. */
2088 lr_ignore_rest (ldfile, 1);
2093 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2094 nowtok = now->tok;
2096 while (nowtok == tok_eol);
2098 /* If we see `copy' now we are almost done. */
2099 if (nowtok == tok_copy)
2101 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2102 if (now->tok != tok_string)
2104 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2106 skip_category:
2108 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2109 while (now->tok != tok_eof && now->tok != tok_end);
2111 if (now->tok != tok_eof
2112 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2113 now->tok == tok_eof))
2114 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2115 else if (now->tok != tok_lc_ctype)
2117 lr_error (ldfile, _("\
2118 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2119 lr_ignore_rest (ldfile, 0);
2121 else
2122 lr_ignore_rest (ldfile, 1);
2124 return;
2127 if (! ignore_content)
2129 /* Get the locale definition. */
2130 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2131 repertoire_name, charmap, NULL);
2132 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2134 /* Not yet loaded. So do it now. */
2135 if (locfile_read (copy_locale, charmap) != 0)
2136 goto skip_category;
2139 if (copy_locale->categories[LC_CTYPE].ctype == NULL)
2140 return;
2143 lr_ignore_rest (ldfile, 1);
2145 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2146 nowtok = now->tok;
2149 /* Prepare the data structures. */
2150 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2151 ctype = result->categories[LC_CTYPE].ctype;
2153 /* Remember the repertoire we use. */
2154 if (!ignore_content)
2155 ctype->repertoire = repertoire;
2157 while (1)
2159 unsigned long int class_bit = 0;
2160 unsigned long int class256_bit = 0;
2161 int handle_digits = 0;
2163 /* Of course we don't proceed beyond the end of file. */
2164 if (nowtok == tok_eof)
2165 break;
2167 /* Ignore empty lines. */
2168 if (nowtok == tok_eol)
2170 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2171 nowtok = now->tok;
2172 continue;
2175 switch (nowtok)
2177 case tok_charclass:
2178 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2179 while (now->tok == tok_ident || now->tok == tok_string)
2181 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2182 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2183 if (now->tok != tok_semicolon)
2184 break;
2185 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2187 if (now->tok != tok_eol)
2188 SYNTAX_ERROR (_("\
2189 %s: syntax error in definition of new character class"), "LC_CTYPE");
2190 break;
2192 case tok_charconv:
2193 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2194 while (now->tok == tok_ident || now->tok == tok_string)
2196 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2197 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2198 if (now->tok != tok_semicolon)
2199 break;
2200 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2202 if (now->tok != tok_eol)
2203 SYNTAX_ERROR (_("\
2204 %s: syntax error in definition of new character map"), "LC_CTYPE");
2205 break;
2207 case tok_class:
2208 /* Ignore the rest of the line if we don't need the input of
2209 this line. */
2210 if (ignore_content)
2212 lr_ignore_rest (ldfile, 0);
2213 break;
2216 /* We simply forget the `class' keyword and use the following
2217 operand to determine the bit. */
2218 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2219 if (now->tok == tok_ident || now->tok == tok_string)
2221 /* Must can be one of the predefined class names. */
2222 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2223 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2224 break;
2225 if (cnt >= ctype->nr_charclass)
2227 /* OK, it's a new class. */
2228 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2230 class_bit = _ISwbit (ctype->nr_charclass - 1);
2232 else
2234 class_bit = _ISwbit (cnt);
2236 free (now->val.str.startmb);
2239 else if (now->tok == tok_digit)
2240 goto handle_tok_digit;
2241 else if (now->tok < tok_upper || now->tok > tok_blank)
2242 goto err_label;
2243 else
2245 class_bit = BITw (now->tok);
2246 class256_bit = BIT (now->tok);
2249 /* The next character must be a semicolon. */
2250 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2251 if (now->tok != tok_semicolon)
2252 goto err_label;
2253 goto read_charclass;
2255 case tok_upper:
2256 case tok_lower:
2257 case tok_alpha:
2258 case tok_alnum:
2259 case tok_space:
2260 case tok_cntrl:
2261 case tok_punct:
2262 case tok_graph:
2263 case tok_print:
2264 case tok_xdigit:
2265 case tok_blank:
2266 /* Ignore the rest of the line if we don't need the input of
2267 this line. */
2268 if (ignore_content)
2270 lr_ignore_rest (ldfile, 0);
2271 break;
2274 class_bit = BITw (now->tok);
2275 class256_bit = BIT (now->tok);
2276 handle_digits = 0;
2277 read_charclass:
2278 ctype->class_done |= class_bit;
2279 last_token = tok_none;
2280 ellipsis_token = tok_none;
2281 step = 1;
2282 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2283 while (now->tok != tok_eol && now->tok != tok_eof)
2285 uint32_t wch;
2286 struct charseq *seq;
2288 if (ellipsis_token == tok_none)
2290 if (get_character (now, charmap, repertoire, &seq, &wch))
2291 goto err_label;
2293 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2294 /* Yep, we can store information about this byte
2295 sequence. */
2296 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2298 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2299 && class_bit != 0)
2300 /* We have the UCS4 position. */
2301 *find_idx (ctype, &ctype->class_collection,
2302 &ctype->class_collection_max,
2303 &ctype->class_collection_act, wch) |= class_bit;
2305 last_token = now->tok;
2306 /* Terminate the string. */
2307 if (last_token == tok_bsymbol)
2309 now->val.str.startmb[now->val.str.lenmb] = '\0';
2310 last_str = now->val.str.startmb;
2312 else
2313 last_str = NULL;
2314 last_wch = wch;
2315 memcpy (last_charcode, now->val.charcode.bytes, 16);
2316 last_charcode_len = now->val.charcode.nbytes;
2318 if (!ignore_content && handle_digits == 1)
2320 /* We must store the digit values. */
2321 if (ctype->mbdigits_act == ctype->mbdigits_max)
2323 ctype->mbdigits_max += 10;
2324 ctype->mbdigits = xrealloc (ctype->mbdigits,
2325 (ctype->mbdigits_max
2326 * sizeof (char *)));
2327 ctype->wcdigits_max += 10;
2328 ctype->wcdigits = xrealloc (ctype->wcdigits,
2329 (ctype->wcdigits_max
2330 * sizeof (uint32_t)));
2333 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2334 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2336 else if (!ignore_content && handle_digits == 2)
2338 /* We must store the digit values. */
2339 if (ctype->outdigits_act >= 10)
2341 lr_error (ldfile, _("\
2342 %s: field `%s' does not contain exactly ten entries"),
2343 "LC_CTYPE", "outdigit");
2344 lr_ignore_rest (ldfile, 0);
2345 break;
2348 ctype->mboutdigits[ctype->outdigits_act] = seq;
2349 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2350 ++ctype->outdigits_act;
2353 else
2355 /* Now it gets complicated. We have to resolve the
2356 ellipsis problem. First we must distinguish between
2357 the different kind of ellipsis and this must match the
2358 tokens we have seen. */
2359 assert (last_token != tok_none);
2361 if (last_token != now->tok)
2363 lr_error (ldfile, _("\
2364 ellipsis range must be marked by two operands of same type"));
2365 lr_ignore_rest (ldfile, 0);
2366 break;
2369 if (last_token == tok_bsymbol)
2371 if (ellipsis_token == tok_ellipsis3)
2372 lr_error (ldfile, _("with symbolic name range values \
2373 the absolute ellipsis `...' must not be used"));
2375 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2376 repertoire, now, last_str,
2377 class256_bit, class_bit,
2378 (ellipsis_token
2379 == tok_ellipsis4
2380 ? 10 : 16),
2381 ignore_content,
2382 handle_digits, step);
2384 else if (last_token == tok_ucs4)
2386 if (ellipsis_token != tok_ellipsis2)
2387 lr_error (ldfile, _("\
2388 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2390 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2391 repertoire, now, last_wch,
2392 class256_bit, class_bit,
2393 ignore_content, handle_digits,
2394 step);
2396 else
2398 assert (last_token == tok_charcode);
2400 if (ellipsis_token != tok_ellipsis3)
2401 lr_error (ldfile, _("\
2402 with character code range values one must use the absolute ellipsis `...'"));
2404 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2405 repertoire, now,
2406 last_charcode,
2407 last_charcode_len,
2408 class256_bit, class_bit,
2409 ignore_content,
2410 handle_digits);
2413 /* Now we have used the last value. */
2414 last_token = tok_none;
2417 /* Next we expect a semicolon or the end of the line. */
2418 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2419 if (now->tok == tok_eol || now->tok == tok_eof)
2420 break;
2422 if (last_token != tok_none
2423 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2425 if (now->tok == tok_ellipsis2_2)
2427 now->tok = tok_ellipsis2;
2428 step = 2;
2430 else if (now->tok == tok_ellipsis4_2)
2432 now->tok = tok_ellipsis4;
2433 step = 2;
2436 ellipsis_token = now->tok;
2438 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2439 continue;
2442 if (now->tok != tok_semicolon)
2443 goto err_label;
2445 /* And get the next character. */
2446 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2448 ellipsis_token = tok_none;
2449 step = 1;
2451 break;
2453 case tok_digit:
2454 /* Ignore the rest of the line if we don't need the input of
2455 this line. */
2456 if (ignore_content)
2458 lr_ignore_rest (ldfile, 0);
2459 break;
2462 handle_tok_digit:
2463 class_bit = _ISwdigit;
2464 class256_bit = _ISdigit;
2465 handle_digits = 1;
2466 goto read_charclass;
2468 case tok_outdigit:
2469 /* Ignore the rest of the line if we don't need the input of
2470 this line. */
2471 if (ignore_content)
2473 lr_ignore_rest (ldfile, 0);
2474 break;
2477 if (ctype->outdigits_act != 0)
2478 lr_error (ldfile, _("\
2479 %s: field `%s' declared more than once"),
2480 "LC_CTYPE", "outdigit");
2481 class_bit = 0;
2482 class256_bit = 0;
2483 handle_digits = 2;
2484 goto read_charclass;
2486 case tok_toupper:
2487 /* Ignore the rest of the line if we don't need the input of
2488 this line. */
2489 if (ignore_content)
2491 lr_ignore_rest (ldfile, 0);
2492 break;
2495 mapidx = 0;
2496 goto read_mapping;
2498 case tok_tolower:
2499 /* Ignore the rest of the line if we don't need the input of
2500 this line. */
2501 if (ignore_content)
2503 lr_ignore_rest (ldfile, 0);
2504 break;
2507 mapidx = 1;
2508 goto read_mapping;
2510 case tok_map:
2511 /* Ignore the rest of the line if we don't need the input of
2512 this line. */
2513 if (ignore_content)
2515 lr_ignore_rest (ldfile, 0);
2516 break;
2519 /* We simply forget the `map' keyword and use the following
2520 operand to determine the mapping. */
2521 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2522 if (now->tok == tok_ident || now->tok == tok_string)
2524 size_t cnt;
2526 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2527 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2528 break;
2530 if (cnt < ctype->map_collection_nr)
2531 free (now->val.str.startmb);
2532 else
2533 /* OK, it's a new map. */
2534 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2536 mapidx = cnt;
2538 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2539 goto err_label;
2540 else
2541 mapidx = now->tok - tok_toupper;
2543 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2544 /* This better should be a semicolon. */
2545 if (now->tok != tok_semicolon)
2546 goto err_label;
2548 read_mapping:
2549 /* Test whether this mapping was already defined. */
2550 if (ctype->tomap_done[mapidx])
2552 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2553 ctype->mapnames[mapidx]);
2554 lr_ignore_rest (ldfile, 0);
2555 break;
2557 ctype->tomap_done[mapidx] = 1;
2559 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2560 while (now->tok != tok_eol && now->tok != tok_eof)
2562 struct charseq *from_seq;
2563 uint32_t from_wch;
2564 struct charseq *to_seq;
2565 uint32_t to_wch;
2567 /* Every pair starts with an opening brace. */
2568 if (now->tok != tok_open_brace)
2569 goto err_label;
2571 /* Next comes the from-value. */
2572 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2573 if (get_character (now, charmap, repertoire, &from_seq,
2574 &from_wch) != 0)
2575 goto err_label;
2577 /* The next is a comma. */
2578 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2579 if (now->tok != tok_comma)
2580 goto err_label;
2582 /* And the other value. */
2583 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2584 if (get_character (now, charmap, repertoire, &to_seq,
2585 &to_wch) != 0)
2586 goto err_label;
2588 /* And the last thing is the closing brace. */
2589 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2590 if (now->tok != tok_close_brace)
2591 goto err_label;
2593 if (!ignore_content)
2595 /* Check whether the mapping converts from an ASCII value
2596 to a non-ASCII value. */
2597 if (from_seq != NULL && from_seq->nbytes == 1
2598 && isascii (from_seq->bytes[0])
2599 && to_seq != NULL && (to_seq->nbytes != 1
2600 || !isascii (to_seq->bytes[0])))
2601 ctype->to_nonascii = 1;
2603 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2604 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2605 /* We can use this value. */
2606 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2607 = to_seq->bytes[0];
2609 if (from_wch != ILLEGAL_CHAR_VALUE
2610 && to_wch != ILLEGAL_CHAR_VALUE)
2611 /* Both correct values. */
2612 *find_idx (ctype, &ctype->map_collection[mapidx],
2613 &ctype->map_collection_max[mapidx],
2614 &ctype->map_collection_act[mapidx],
2615 from_wch) = to_wch;
2618 /* Now comes a semicolon or the end of the line/file. */
2619 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2620 if (now->tok == tok_semicolon)
2621 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2623 break;
2625 case tok_translit_start:
2626 /* Ignore the entire translit section with its peculiar syntax
2627 if we don't need the input. */
2628 if (ignore_content)
2632 lr_ignore_rest (ldfile, 0);
2633 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2635 while (now->tok != tok_translit_end && now->tok != tok_eof);
2637 if (now->tok == tok_eof)
2638 lr_error (ldfile, _(\
2639 "%s: `translit_start' section does not end with `translit_end'"),
2640 "LC_CTYPE");
2642 break;
2645 /* The rest of the line better should be empty. */
2646 lr_ignore_rest (ldfile, 1);
2648 /* We count here the number of allocated entries in the `translit'
2649 array. */
2650 cnt = 0;
2652 ldfile->translate_strings = 1;
2653 ldfile->return_widestr = 1;
2655 /* We proceed until we see the `translit_end' token. */
2656 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2657 now->tok != tok_translit_end && now->tok != tok_eof)
2659 if (now->tok == tok_eol)
2660 /* Ignore empty lines. */
2661 continue;
2663 if (now->tok == tok_include)
2665 /* We have to include locale. */
2666 const char *locale_name;
2667 const char *repertoire_name;
2668 struct translit_include_t *include_stmt, **include_ptr;
2670 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2671 /* This should be a string or an identifier. In any
2672 case something to name a locale. */
2673 if (now->tok != tok_string && now->tok != tok_ident)
2675 translit_syntax:
2676 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2677 lr_ignore_rest (ldfile, 0);
2678 continue;
2680 locale_name = now->val.str.startmb;
2682 /* Next should be a semicolon. */
2683 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2684 if (now->tok != tok_semicolon)
2685 goto translit_syntax;
2687 /* Now the repertoire name. */
2688 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2689 if ((now->tok != tok_string && now->tok != tok_ident)
2690 || now->val.str.startmb == NULL)
2691 goto translit_syntax;
2692 repertoire_name = now->val.str.startmb;
2693 if (repertoire_name[0] == '\0')
2694 /* Ignore the empty string. */
2695 repertoire_name = NULL;
2697 /* Save the include statement for later processing. */
2698 include_stmt = (struct translit_include_t *)
2699 xmalloc (sizeof (struct translit_include_t));
2700 include_stmt->copy_locale = locale_name;
2701 include_stmt->copy_repertoire = repertoire_name;
2702 include_stmt->next = NULL;
2704 include_ptr = &ctype->translit_include;
2705 while (*include_ptr != NULL)
2706 include_ptr = &(*include_ptr)->next;
2707 *include_ptr = include_stmt;
2709 /* The rest of the line must be empty. */
2710 lr_ignore_rest (ldfile, 1);
2712 /* Make sure the locale is read. */
2713 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2714 1, NULL);
2715 continue;
2717 else if (now->tok == tok_default_missing)
2719 uint32_t *wstr;
2721 while (1)
2723 /* We expect a single character or string as the
2724 argument. */
2725 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2726 wstr = read_widestring (ldfile, now, charmap,
2727 repertoire);
2729 if (wstr != NULL)
2731 if (ctype->default_missing != NULL)
2733 lr_error (ldfile, _("\
2734 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2735 record_error_at_line (0, 0,
2736 ctype->default_missing_file,
2737 ctype->default_missing_lineno,
2738 _("\
2739 previous definition was here"));
2741 else
2743 ctype->default_missing = wstr;
2744 ctype->default_missing_file = ldfile->fname;
2745 ctype->default_missing_lineno = ldfile->lineno;
2747 /* We can have more entries, ignore them. */
2748 lr_ignore_rest (ldfile, 0);
2749 break;
2751 else if (wstr == (uint32_t *) -1l)
2752 /* This was an syntax error. */
2753 break;
2755 /* Maybe there is another replacement we can use. */
2756 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2757 if (now->tok == tok_eol || now->tok == tok_eof)
2759 /* Nothing found. We tell the user. */
2760 lr_error (ldfile, _("\
2761 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2762 break;
2764 if (now->tok != tok_semicolon)
2765 goto translit_syntax;
2768 continue;
2770 else if (now->tok == tok_translit_ignore)
2772 read_translit_ignore_entry (ldfile, ctype, charmap,
2773 repertoire);
2774 continue;
2777 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2779 ldfile->return_widestr = 0;
2781 if (now->tok == tok_eof)
2782 lr_error (ldfile, _(\
2783 "%s: `translit_start' section does not end with `translit_end'"),
2784 "LC_CTYPE");
2786 break;
2788 case tok_ident:
2789 /* Ignore the rest of the line if we don't need the input of
2790 this line. */
2791 if (ignore_content)
2793 lr_ignore_rest (ldfile, 0);
2794 break;
2797 /* This could mean one of several things. First test whether
2798 it's a character class name. */
2799 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2800 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2801 break;
2802 if (cnt < ctype->nr_charclass)
2804 class_bit = _ISwbit (cnt);
2805 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2806 free (now->val.str.startmb);
2807 goto read_charclass;
2809 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2810 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2811 break;
2812 if (cnt < ctype->map_collection_nr)
2814 mapidx = cnt;
2815 free (now->val.str.startmb);
2816 goto read_mapping;
2818 break;
2820 case tok_end:
2821 /* Next we assume `LC_CTYPE'. */
2822 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2823 if (now->tok == tok_eof)
2824 break;
2825 if (now->tok == tok_eol)
2826 lr_error (ldfile, _("%s: incomplete `END' line"),
2827 "LC_CTYPE");
2828 else if (now->tok != tok_lc_ctype)
2829 lr_error (ldfile, _("\
2830 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2831 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2832 return;
2834 default:
2835 err_label:
2836 if (now->tok != tok_eof)
2837 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2840 /* Prepare for the next round. */
2841 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2842 nowtok = now->tok;
2845 /* When we come here we reached the end of the file. */
2846 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2850 /* Subroutine of set_class_defaults, below. */
2851 static void
2852 set_one_default (struct locale_ctype_t *ctype,
2853 const struct charmap_t *charmap,
2854 int bitpos, int from, int to)
2856 char tmp[2];
2857 int ch;
2858 int bit = _ISbit (bitpos);
2859 int bitw = _ISwbit (bitpos);
2860 /* Define string. */
2861 strcpy (tmp, "?");
2863 for (ch = from; ch <= to; ++ch)
2865 struct charseq *seq;
2866 tmp[0] = ch;
2868 seq = charmap_find_value (charmap, tmp, 1);
2869 if (seq == NULL)
2871 char buf[10];
2872 sprintf (buf, "U%08X", ch);
2873 seq = charmap_find_value (charmap, buf, 9);
2875 if (seq == NULL)
2877 record_error (0, 0, _("\
2878 %s: character `%s' not defined while needed as default value"),
2879 "LC_CTYPE", tmp);
2881 else if (seq->nbytes != 1)
2882 record_error (0, 0, _("\
2883 %s: character `%s' in charmap not representable with one byte"),
2884 "LC_CTYPE", tmp);
2885 else
2886 ctype->class256_collection[seq->bytes[0]] |= bit;
2888 /* No need to search here, the ASCII value is also the Unicode
2889 value. */
2890 ELEM (ctype, class_collection, , ch) |= bitw;
2894 static void
2895 set_class_defaults (struct locale_ctype_t *ctype,
2896 const struct charmap_t *charmap,
2897 struct repertoire_t *repertoire)
2899 #define set_default(bitpos, from, to) \
2900 set_one_default (ctype, charmap, bitpos, from, to)
2902 /* These function defines the default values for the classes and conversions
2903 according to POSIX.2 2.5.2.1.
2904 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2905 Don't move them unless you know what you do! */
2907 /* Set default values if keyword was not present. */
2908 if ((ctype->class_done & BITw (tok_upper)) == 0)
2909 /* "If this keyword [lower] is not specified, the lowercase letters
2910 `A' through `Z', ..., shall automatically belong to this class,
2911 with implementation defined character values." [P1003.2, 2.5.2.1] */
2912 set_default (BITPOS (tok_upper), 'A', 'Z');
2914 if ((ctype->class_done & BITw (tok_lower)) == 0)
2915 /* "If this keyword [lower] is not specified, the lowercase letters
2916 `a' through `z', ..., shall automatically belong to this class,
2917 with implementation defined character values." [P1003.2, 2.5.2.1] */
2918 set_default (BITPOS (tok_lower), 'a', 'z');
2920 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2922 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2923 class `lower' *must* be in class `alpha'. */
2924 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2925 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2927 for (size_t cnt = 0; cnt < 256; ++cnt)
2928 if ((ctype->class256_collection[cnt] & mask) != 0)
2929 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2931 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2932 if ((ctype->class_collection[cnt] & maskw) != 0)
2933 ctype->class_collection[cnt] |= BITw (tok_alpha);
2936 if ((ctype->class_done & BITw (tok_digit)) == 0)
2937 /* "If this keyword [digit] is not specified, the digits `0' through
2938 `9', ..., shall automatically belong to this class, with
2939 implementation-defined character values." [P1003.2, 2.5.2.1] */
2940 set_default (BITPOS (tok_digit), '0', '9');
2942 /* "Only characters specified for the `alpha' and `digit' keyword
2943 shall be specified. Characters specified for the keyword `alpha'
2944 and `digit' are automatically included in this class. */
2946 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2947 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2949 for (size_t cnt = 0; cnt < 256; ++cnt)
2950 if ((ctype->class256_collection[cnt] & mask) != 0)
2951 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2953 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2954 if ((ctype->class_collection[cnt] & maskw) != 0)
2955 ctype->class_collection[cnt] |= BITw (tok_alnum);
2958 if ((ctype->class_done & BITw (tok_space)) == 0)
2959 /* "If this keyword [space] is not specified, the characters <space>,
2960 <form-feed>, <newline>, <carriage-return>, <tab>, and
2961 <vertical-tab>, ..., shall automatically belong to this class,
2962 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2964 struct charseq *seq;
2966 seq = charmap_find_value (charmap, "space", 5);
2967 if (seq == NULL)
2968 seq = charmap_find_value (charmap, "SP", 2);
2969 if (seq == NULL)
2970 seq = charmap_find_value (charmap, "U00000020", 9);
2971 if (seq == NULL)
2973 record_error (0, 0, _("\
2974 %s: character `%s' not defined while needed as default value"),
2975 "LC_CTYPE", "<space>");
2977 else if (seq->nbytes != 1)
2978 record_error (0, 0, _("\
2979 %s: character `%s' in charmap not representable with one byte"),
2980 "LC_CTYPE", "<space>");
2981 else
2982 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2984 /* No need to search. */
2985 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
2987 seq = charmap_find_value (charmap, "form-feed", 9);
2988 if (seq == NULL)
2989 seq = charmap_find_value (charmap, "U0000000C", 9);
2990 if (seq == NULL)
2992 record_error (0, 0, _("\
2993 %s: character `%s' not defined while needed as default value"),
2994 "LC_CTYPE", "<form-feed>");
2996 else if (seq->nbytes != 1)
2997 record_error (0, 0, _("\
2998 %s: character `%s' in charmap not representable with one byte"),
2999 "LC_CTYPE", "<form-feed>");
3000 else
3001 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3003 /* No need to search. */
3004 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3007 seq = charmap_find_value (charmap, "newline", 7);
3008 if (seq == NULL)
3009 seq = charmap_find_value (charmap, "U0000000A", 9);
3010 if (seq == NULL)
3012 record_error (0, 0, _("\
3013 %s: character `%s' not defined while needed as default value"),
3014 "LC_CTYPE", "<newline>");
3016 else if (seq->nbytes != 1)
3017 record_error (0, 0, _("\
3018 %s: character `%s' in charmap not representable with one byte"),
3019 "LC_CTYPE", "<newline>");
3020 else
3021 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3023 /* No need to search. */
3024 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3027 seq = charmap_find_value (charmap, "carriage-return", 15);
3028 if (seq == NULL)
3029 seq = charmap_find_value (charmap, "U0000000D", 9);
3030 if (seq == NULL)
3032 record_error (0, 0, _("\
3033 %s: character `%s' not defined while needed as default value"),
3034 "LC_CTYPE", "<carriage-return>");
3036 else if (seq->nbytes != 1)
3037 record_error (0, 0, _("\
3038 %s: character `%s' in charmap not representable with one byte"),
3039 "LC_CTYPE", "<carriage-return>");
3040 else
3041 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3043 /* No need to search. */
3044 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3047 seq = charmap_find_value (charmap, "tab", 3);
3048 if (seq == NULL)
3049 seq = charmap_find_value (charmap, "U00000009", 9);
3050 if (seq == NULL)
3052 record_error (0, 0, _("\
3053 %s: character `%s' not defined while needed as default value"),
3054 "LC_CTYPE", "<tab>");
3056 else if (seq->nbytes != 1)
3057 record_error (0, 0, _("\
3058 %s: character `%s' in charmap not representable with one byte"),
3059 "LC_CTYPE", "<tab>");
3060 else
3061 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3063 /* No need to search. */
3064 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3067 seq = charmap_find_value (charmap, "vertical-tab", 12);
3068 if (seq == NULL)
3069 seq = charmap_find_value (charmap, "U0000000B", 9);
3070 if (seq == NULL)
3072 record_error (0, 0, _("\
3073 %s: character `%s' not defined while needed as default value"),
3074 "LC_CTYPE", "<vertical-tab>");
3076 else if (seq->nbytes != 1)
3077 record_error (0, 0, _("\
3078 %s: character `%s' in charmap not representable with one byte"),
3079 "LC_CTYPE", "<vertical-tab>");
3080 else
3081 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3083 /* No need to search. */
3084 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3087 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3088 /* "If this keyword is not specified, the digits `0' to `9', the
3089 uppercase letters `A' through `F', and the lowercase letters `a'
3090 through `f', ..., shell automatically belong to this class, with
3091 implementation defined character values." [P1003.2, 2.5.2.1] */
3093 set_default (BITPOS (tok_xdigit), '0', '9');
3094 set_default (BITPOS (tok_xdigit), 'A', 'F');
3095 set_default (BITPOS (tok_xdigit), 'a', 'f');
3098 if ((ctype->class_done & BITw (tok_blank)) == 0)
3099 /* "If this keyword [blank] is unspecified, the characters <space> and
3100 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3102 struct charseq *seq;
3104 seq = charmap_find_value (charmap, "space", 5);
3105 if (seq == NULL)
3106 seq = charmap_find_value (charmap, "SP", 2);
3107 if (seq == NULL)
3108 seq = charmap_find_value (charmap, "U00000020", 9);
3109 if (seq == NULL)
3111 record_error (0, 0, _("\
3112 %s: character `%s' not defined while needed as default value"),
3113 "LC_CTYPE", "<space>");
3115 else if (seq->nbytes != 1)
3116 record_error (0, 0, _("\
3117 %s: character `%s' in charmap not representable with one byte"),
3118 "LC_CTYPE", "<space>");
3119 else
3120 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3122 /* No need to search. */
3123 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3126 seq = charmap_find_value (charmap, "tab", 3);
3127 if (seq == NULL)
3128 seq = charmap_find_value (charmap, "U00000009", 9);
3129 if (seq == NULL)
3131 record_error (0, 0, _("\
3132 %s: character `%s' not defined while needed as default value"),
3133 "LC_CTYPE", "<tab>");
3135 else if (seq->nbytes != 1)
3136 record_error (0, 0, _("\
3137 %s: character `%s' in charmap not representable with one byte"),
3138 "LC_CTYPE", "<tab>");
3139 else
3140 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3142 /* No need to search. */
3143 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3146 if ((ctype->class_done & BITw (tok_graph)) == 0)
3147 /* "If this keyword [graph] is not specified, characters specified for
3148 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3149 shall belong to this character class." [P1003.2, 2.5.2.1] */
3151 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower)
3152 | BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit)
3153 | BIT (tok_punct);
3154 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower)
3155 | BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit)
3156 | BITw (tok_punct);
3158 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3159 if ((ctype->class_collection[cnt] & maskw) != 0)
3160 ctype->class_collection[cnt] |= BITw (tok_graph);
3162 for (size_t cnt = 0; cnt < 256; ++cnt)
3163 if ((ctype->class256_collection[cnt] & mask) != 0)
3164 ctype->class256_collection[cnt] |= BIT (tok_graph);
3167 if ((ctype->class_done & BITw (tok_print)) == 0)
3168 /* "If this keyword [print] is not provided, characters specified for
3169 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3170 and the <space> character shall belong to this character class."
3171 [P1003.2, 2.5.2.1] */
3173 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower)
3174 | BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit)
3175 | BIT (tok_punct);
3176 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower)
3177 | BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit)
3178 | BITw (tok_punct);
3179 struct charseq *seq;
3181 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3182 if ((ctype->class_collection[cnt] & maskw) != 0)
3183 ctype->class_collection[cnt] |= BITw (tok_print);
3185 for (size_t cnt = 0; cnt < 256; ++cnt)
3186 if ((ctype->class256_collection[cnt] & mask) != 0)
3187 ctype->class256_collection[cnt] |= BIT (tok_print);
3190 seq = charmap_find_value (charmap, "space", 5);
3191 if (seq == NULL)
3192 seq = charmap_find_value (charmap, "SP", 2);
3193 if (seq == NULL)
3194 seq = charmap_find_value (charmap, "U00000020", 9);
3195 if (seq == NULL)
3197 record_error (0, 0, _("\
3198 %s: character `%s' not defined while needed as default value"),
3199 "LC_CTYPE", "<space>");
3201 else if (seq->nbytes != 1)
3202 record_error (0, 0, _("\
3203 %s: character `%s' in charmap not representable with one byte"),
3204 "LC_CTYPE", "<space>");
3205 else
3206 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3208 /* No need to search. */
3209 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3212 if (ctype->tomap_done[0] == 0)
3213 /* "If this keyword [toupper] is not specified, the lowercase letters
3214 `a' through `z', and their corresponding uppercase letters `A' to
3215 `Z', ..., shall automatically be included, with implementation-
3216 defined character values." [P1003.2, 2.5.2.1] */
3218 char tmp[4];
3219 int ch;
3221 strcpy (tmp, "<?>");
3223 for (ch = 'a'; ch <= 'z'; ++ch)
3225 struct charseq *seq_from, *seq_to;
3227 tmp[1] = (char) ch;
3229 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3230 if (seq_from == NULL)
3232 char buf[10];
3233 sprintf (buf, "U%08X", ch);
3234 seq_from = charmap_find_value (charmap, buf, 9);
3236 if (seq_from == NULL)
3238 record_error (0, 0, _("\
3239 %s: character `%s' not defined while needed as default value"),
3240 "LC_CTYPE", tmp);
3242 else if (seq_from->nbytes != 1)
3244 record_error (0, 0, _("\
3245 %s: character `%s' needed as default value not representable with one byte"),
3246 "LC_CTYPE", tmp);
3248 else
3250 /* This conversion is implementation defined. */
3251 tmp[1] = (char) (ch + ('A' - 'a'));
3252 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3253 if (seq_to == NULL)
3255 char buf[10];
3256 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3257 seq_to = charmap_find_value (charmap, buf, 9);
3259 if (seq_to == NULL)
3261 record_error (0, 0, _("\
3262 %s: character `%s' not defined while needed as default value"),
3263 "LC_CTYPE", tmp);
3265 else if (seq_to->nbytes != 1)
3267 record_error (0, 0, _("\
3268 %s: character `%s' needed as default value not representable with one byte"),
3269 "LC_CTYPE", tmp);
3271 else
3272 /* The index [0] is determined by the order of the
3273 `ctype_map_newP' calls in `ctype_startup'. */
3274 ctype->map256_collection[0][seq_from->bytes[0]]
3275 = seq_to->bytes[0];
3278 /* No need to search. */
3279 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3283 if (ctype->tomap_done[1] == 0)
3284 /* "If this keyword [tolower] is not specified, the mapping shall be
3285 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3287 for (size_t cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3288 if (ctype->map_collection[0][cnt] != 0)
3289 ELEM (ctype, map_collection, [1],
3290 ctype->map_collection[0][cnt])
3291 = ctype->charnames[cnt];
3293 for (size_t cnt = 0; cnt < 256; ++cnt)
3294 if (ctype->map256_collection[0][cnt] != 0)
3295 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3298 if (ctype->outdigits_act != 10)
3300 if (ctype->outdigits_act != 0)
3301 record_error (0, 0, _("\
3302 %s: field `%s' does not contain exactly ten entries"),
3303 "LC_CTYPE", "outdigit");
3305 for (size_t cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3307 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3308 (char *) digits + cnt,
3311 if (ctype->mboutdigits[cnt] == NULL)
3312 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3313 longnames[cnt],
3314 strlen (longnames[cnt]));
3316 if (ctype->mboutdigits[cnt] == NULL)
3317 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3318 uninames[cnt], 9);
3320 if (ctype->mboutdigits[cnt] == NULL)
3322 /* Provide a replacement. */
3323 record_error (0, 0, _("\
3324 no output digits defined and none of the standard names in the charmap"));
3326 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3327 sizeof (struct charseq)
3328 + 1);
3330 /* This is better than nothing. */
3331 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3332 ctype->mboutdigits[cnt]->nbytes = 1;
3335 ctype->wcoutdigits[cnt] = L'0' + cnt;
3338 ctype->outdigits_act = 10;
3341 #undef set_default
3345 /* Initialize. Assumes t->p and t->q have already been set. */
3346 static inline void
3347 wctype_table_init (struct wctype_table *t)
3349 t->level1 = NULL;
3350 t->level1_alloc = t->level1_size = 0;
3351 t->level2 = NULL;
3352 t->level2_alloc = t->level2_size = 0;
3353 t->level3 = NULL;
3354 t->level3_alloc = t->level3_size = 0;
3357 /* Add one entry. */
3358 static void
3359 wctype_table_add (struct wctype_table *t, uint32_t wc)
3361 uint32_t index1 = wc >> (t->q + t->p + 5);
3362 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3363 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3364 uint32_t index4 = wc & 0x1f;
3365 size_t i, i1, i2;
3367 if (index1 >= t->level1_size)
3369 if (index1 >= t->level1_alloc)
3371 size_t alloc = 2 * t->level1_alloc;
3372 if (alloc <= index1)
3373 alloc = index1 + 1;
3374 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3375 alloc * sizeof (uint32_t));
3376 t->level1_alloc = alloc;
3378 while (index1 >= t->level1_size)
3379 t->level1[t->level1_size++] = EMPTY;
3382 if (t->level1[index1] == EMPTY)
3384 if (t->level2_size == t->level2_alloc)
3386 size_t alloc = 2 * t->level2_alloc + 1;
3387 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3388 (alloc << t->q) * sizeof (uint32_t));
3389 t->level2_alloc = alloc;
3391 i1 = t->level2_size << t->q;
3392 i2 = (t->level2_size + 1) << t->q;
3393 for (i = i1; i < i2; i++)
3394 t->level2[i] = EMPTY;
3395 t->level1[index1] = t->level2_size++;
3398 index2 += t->level1[index1] << t->q;
3400 if (t->level2[index2] == EMPTY)
3402 if (t->level3_size == t->level3_alloc)
3404 size_t alloc = 2 * t->level3_alloc + 1;
3405 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3406 (alloc << t->p) * sizeof (uint32_t));
3407 t->level3_alloc = alloc;
3409 i1 = t->level3_size << t->p;
3410 i2 = (t->level3_size + 1) << t->p;
3411 for (i = i1; i < i2; i++)
3412 t->level3[i] = 0;
3413 t->level2[index2] = t->level3_size++;
3416 index3 += t->level2[index2] << t->p;
3418 t->level3[index3] |= (uint32_t)1 << index4;
3421 /* Finalize and shrink. */
3422 static void
3423 add_locale_wctype_table (struct locale_file *file, struct wctype_table *t)
3425 size_t i, j, k;
3426 uint32_t reorder3[t->level3_size];
3427 uint32_t reorder2[t->level2_size];
3428 uint32_t level2_offset, level3_offset;
3430 /* Uniquify level3 blocks. */
3431 k = 0;
3432 for (j = 0; j < t->level3_size; j++)
3434 for (i = 0; i < k; i++)
3435 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3436 (1 << t->p) * sizeof (uint32_t)) == 0)
3437 break;
3438 /* Relocate block j to block i. */
3439 reorder3[j] = i;
3440 if (i == k)
3442 if (i != j)
3443 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3444 (1 << t->p) * sizeof (uint32_t));
3445 k++;
3448 t->level3_size = k;
3450 for (i = 0; i < (t->level2_size << t->q); i++)
3451 if (t->level2[i] != EMPTY)
3452 t->level2[i] = reorder3[t->level2[i]];
3454 /* Uniquify level2 blocks. */
3455 k = 0;
3456 for (j = 0; j < t->level2_size; j++)
3458 for (i = 0; i < k; i++)
3459 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3460 (1 << t->q) * sizeof (uint32_t)) == 0)
3461 break;
3462 /* Relocate block j to block i. */
3463 reorder2[j] = i;
3464 if (i == k)
3466 if (i != j)
3467 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3468 (1 << t->q) * sizeof (uint32_t));
3469 k++;
3472 t->level2_size = k;
3474 for (i = 0; i < t->level1_size; i++)
3475 if (t->level1[i] != EMPTY)
3476 t->level1[i] = reorder2[t->level1[i]];
3478 t->result_size =
3479 5 * sizeof (uint32_t)
3480 + t->level1_size * sizeof (uint32_t)
3481 + (t->level2_size << t->q) * sizeof (uint32_t)
3482 + (t->level3_size << t->p) * sizeof (uint32_t);
3484 level2_offset =
3485 5 * sizeof (uint32_t)
3486 + t->level1_size * sizeof (uint32_t);
3487 level3_offset =
3488 5 * sizeof (uint32_t)
3489 + t->level1_size * sizeof (uint32_t)
3490 + (t->level2_size << t->q) * sizeof (uint32_t);
3492 start_locale_structure (file);
3493 add_locale_uint32 (file, t->q + t->p + 5);
3494 add_locale_uint32 (file, t->level1_size);
3495 add_locale_uint32 (file, t->p + 5);
3496 add_locale_uint32 (file, (1 << t->q) - 1);
3497 add_locale_uint32 (file, (1 << t->p) - 1);
3499 for (i = 0; i < t->level1_size; i++)
3500 add_locale_uint32
3501 (file,
3502 t->level1[i] == EMPTY
3504 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3506 for (i = 0; i < (t->level2_size << t->q); i++)
3507 add_locale_uint32
3508 (file,
3509 t->level2[i] == EMPTY
3511 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3513 add_locale_uint32_array (file, t->level3, t->level3_size << t->p);
3514 end_locale_structure (file);
3516 if (t->level1_alloc > 0)
3517 free (t->level1);
3518 if (t->level2_alloc > 0)
3519 free (t->level2);
3520 if (t->level3_alloc > 0)
3521 free (t->level3);
3524 /* Flattens the included transliterations into a translit list.
3525 Inserts them in the list at `cursor', and returns the new cursor. */
3526 static struct translit_t **
3527 translit_flatten (struct locale_ctype_t *ctype,
3528 const struct charmap_t *charmap,
3529 struct translit_t **cursor)
3531 while (ctype->translit_include != NULL)
3533 const char *copy_locale = ctype->translit_include->copy_locale;
3534 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3535 struct localedef_t *other;
3537 /* Unchain the include statement. During the depth-first traversal
3538 we don't want to visit any locale more than once. */
3539 ctype->translit_include = ctype->translit_include->next;
3541 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3543 if (other == NULL || other->categories[LC_CTYPE].ctype == NULL)
3545 record_error (0, 0, _("\
3546 %s: transliteration data from locale `%s' not available"),
3547 "LC_CTYPE", copy_locale);
3549 else
3551 struct locale_ctype_t *other_ctype =
3552 other->categories[LC_CTYPE].ctype;
3554 cursor = translit_flatten (other_ctype, charmap, cursor);
3555 assert (other_ctype->translit_include == NULL);
3557 if (other_ctype->translit != NULL)
3559 /* Insert the other_ctype->translit list at *cursor. */
3560 struct translit_t *endp = other_ctype->translit;
3561 while (endp->next != NULL)
3562 endp = endp->next;
3564 endp->next = *cursor;
3565 *cursor = other_ctype->translit;
3567 /* Avoid any risk of circular lists. */
3568 other_ctype->translit = NULL;
3570 cursor = &endp->next;
3573 if (ctype->default_missing == NULL)
3574 ctype->default_missing = other_ctype->default_missing;
3578 return cursor;
3581 static void
3582 allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3583 struct repertoire_t *repertoire)
3585 size_t idx, nr;
3586 const void *key;
3587 size_t len;
3588 void *vdata;
3589 void *curs;
3591 /* You wonder about this amount of memory? This is only because some
3592 users do not manage to address the array with unsigned values or
3593 data types with range >= 256. '\200' would result in the array
3594 index -128. To help these poor people we duplicate the entries for
3595 128 up to 255 below the entry for \0. */
3596 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3597 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
3598 ctype->class_b = (uint32_t **)
3599 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3600 ctype->class_3level = (struct wctype_table *)
3601 xmalloc (ctype->nr_charclass * sizeof (struct wctype_table));
3603 /* This is the array accessed using the multibyte string elements. */
3604 for (idx = 0; idx < 256; ++idx)
3605 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3607 /* Mirror first 127 entries. We must take care that entry -1 is not
3608 mirrored because EOF == -1. */
3609 for (idx = 0; idx < 127; ++idx)
3610 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3612 /* The 32 bit array contains all characters < 0x100. */
3613 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3614 if (ctype->charnames[idx] < 0x100)
3615 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3617 for (nr = 0; nr < ctype->nr_charclass; nr++)
3619 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3621 /* We only set CLASS_B for the bits in the ISO C classes, not
3622 the user defined classes. The number should not change but
3623 who knows. */
3624 #define LAST_ISO_C_BIT 11
3625 if (nr <= LAST_ISO_C_BIT)
3626 for (idx = 0; idx < 256; ++idx)
3627 if (ctype->class256_collection[idx] & _ISbit (nr))
3628 ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f);
3631 for (nr = 0; nr < ctype->nr_charclass; nr++)
3633 struct wctype_table *t;
3635 t = &ctype->class_3level[nr];
3636 t->p = 4; /* or: 5 */
3637 t->q = 7; /* or: 6 */
3638 wctype_table_init (t);
3640 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3641 if (ctype->class_collection[idx] & _ISwbit (nr))
3642 wctype_table_add (t, ctype->charnames[idx]);
3644 record_verbose (stderr, _("\
3645 %s: table for class \"%s\": %lu bytes"),
3646 "LC_CTYPE", ctype->classnames[nr],
3647 (unsigned long int) t->result_size);
3650 /* Room for table of mappings. */
3651 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3652 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3653 * sizeof (uint32_t *));
3654 ctype->map_3level = (struct wctrans_table *)
3655 xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table));
3657 /* Fill in all mappings. */
3658 for (idx = 0; idx < 2; ++idx)
3660 unsigned int idx2;
3662 /* Allocate table. */
3663 ctype->map_b[idx] = (uint32_t *)
3664 xmalloc ((256 + 128) * sizeof (uint32_t));
3666 /* Copy values from collection. */
3667 for (idx2 = 0; idx2 < 256; ++idx2)
3668 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3670 /* Mirror first 127 entries. We must take care not to map entry
3671 -1 because EOF == -1. */
3672 for (idx2 = 0; idx2 < 127; ++idx2)
3673 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3675 /* EOF must map to EOF. */
3676 ctype->map_b[idx][127] = EOF;
3679 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3681 unsigned int idx2;
3683 /* Allocate table. */
3684 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3686 /* Copy values from collection. Default is identity mapping. */
3687 for (idx2 = 0; idx2 < 256; ++idx2)
3688 ctype->map32_b[idx][idx2] =
3689 (ctype->map_collection[idx][idx2] != 0
3690 ? ctype->map_collection[idx][idx2]
3691 : idx2);
3694 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3696 struct wctrans_table *t;
3698 t = &ctype->map_3level[nr];
3699 t->p = 7;
3700 t->q = 9;
3701 wctrans_table_init (t);
3703 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3704 if (ctype->map_collection[nr][idx] != 0)
3705 wctrans_table_add (t, ctype->charnames[idx],
3706 ctype->map_collection[nr][idx]);
3708 record_verbose (stderr, _("\
3709 %s: table for map \"%s\": %lu bytes"),
3710 "LC_CTYPE", ctype->mapnames[nr],
3711 (unsigned long int) t->result_size);
3714 /* Extra array for class and map names. */
3715 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3716 * sizeof (uint32_t));
3717 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3718 * sizeof (uint32_t));
3720 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3721 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3723 /* Array for width information. Because the expected widths are very
3724 small (never larger than 2) we use only one single byte. This
3725 saves space.
3726 We put only printable characters in the table. wcwidth is specified
3727 to return -1 for non-printable characters. Doing the check here
3728 saves a run-time check.
3729 But we put L'\0' in the table. This again saves a run-time check. */
3731 struct wcwidth_table *t;
3733 t = &ctype->width;
3734 t->p = 7;
3735 t->q = 9;
3736 wcwidth_table_init (t);
3738 /* First set all the printable characters of the character set to
3739 the default width. */
3740 curs = NULL;
3741 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3743 struct charseq *data = (struct charseq *) vdata;
3745 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3746 data->ucs4 = repertoire_find_value (ctype->repertoire,
3747 data->name, len);
3749 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3751 uint32_t *class_bits =
3752 find_idx (ctype, &ctype->class_collection, NULL,
3753 &ctype->class_collection_act, data->ucs4);
3755 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3756 wcwidth_table_add (t, data->ucs4, charmap->width_default);
3760 /* Now add the explicitly specified widths. */
3761 if (charmap->width_rules != NULL)
3762 for (size_t cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3764 unsigned char bytes[charmap->mb_cur_max];
3765 int nbytes = charmap->width_rules[cnt].from->nbytes;
3767 /* We have the range of character for which the width is
3768 specified described using byte sequences of the multibyte
3769 charset. We have to convert this to UCS4 now. And we
3770 cannot simply convert the beginning and the end of the
3771 sequence, we have to iterate over the byte sequence and
3772 convert it for every single character. */
3773 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3775 while (nbytes < charmap->width_rules[cnt].to->nbytes
3776 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3777 nbytes) <= 0)
3779 /* Find the UCS value for `bytes'. */
3780 int inner;
3781 uint32_t wch;
3782 struct charseq *seq =
3783 charmap_find_symbol (charmap, (char *) bytes, nbytes);
3785 if (seq == NULL)
3786 wch = ILLEGAL_CHAR_VALUE;
3787 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3788 wch = seq->ucs4;
3789 else
3790 wch = repertoire_find_value (ctype->repertoire, seq->name,
3791 strlen (seq->name));
3793 if (wch != ILLEGAL_CHAR_VALUE)
3795 /* Store the value. */
3796 uint32_t *class_bits =
3797 find_idx (ctype, &ctype->class_collection, NULL,
3798 &ctype->class_collection_act, wch);
3800 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3801 wcwidth_table_add (t, wch,
3802 charmap->width_rules[cnt].width);
3805 /* "Increment" the bytes sequence. */
3806 inner = nbytes - 1;
3807 while (inner >= 0 && bytes[inner] == 0xff)
3808 --inner;
3810 if (inner < 0)
3812 /* We have to extend the byte sequence. */
3813 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3814 break;
3816 bytes[0] = 1;
3817 memset (&bytes[1], 0, nbytes);
3818 ++nbytes;
3820 else
3822 ++bytes[inner];
3823 while (++inner < nbytes)
3824 bytes[inner] = 0;
3829 /* Set the width of L'\0' to 0. */
3830 wcwidth_table_add (t, 0, 0);
3832 record_verbose (stderr, _("%s: table for width: %lu bytes"),
3833 "LC_CTYPE", (unsigned long int) t->result_size);
3836 /* Set MB_CUR_MAX. */
3837 ctype->mb_cur_max = charmap->mb_cur_max;
3839 /* Now determine the table for the transliteration information.
3841 XXX It is not yet clear to me whether it is worth implementing a
3842 complicated algorithm which uses a hash table to locate the entries.
3843 For now I'll use a simple array which can be searching using binary
3844 search. */
3845 if (ctype->translit_include != NULL)
3846 /* Traverse the locales mentioned in the `include' statements in a
3847 depth-first way and fold in their transliteration information. */
3848 translit_flatten (ctype, charmap, &ctype->translit);
3850 if (ctype->translit != NULL)
3852 /* First count how many entries we have. This is the upper limit
3853 since some entries from the included files might be overwritten. */
3854 size_t number = 0;
3855 struct translit_t *runp = ctype->translit;
3856 struct translit_t **sorted;
3857 size_t from_len, to_len;
3859 while (runp != NULL)
3861 ++number;
3862 runp = runp->next;
3865 /* Next we allocate an array large enough and fill in the values. */
3866 sorted = (struct translit_t **) alloca (number
3867 * sizeof (struct translit_t **));
3868 runp = ctype->translit;
3869 number = 0;
3872 /* Search for the place where to insert this string.
3873 XXX Better use a real sorting algorithm later. */
3874 size_t idx = 0;
3875 int replace = 0;
3877 while (idx < number)
3879 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3880 (const wchar_t *) runp->from);
3881 if (res == 0)
3883 replace = 1;
3884 break;
3886 if (res > 0)
3887 break;
3888 ++idx;
3891 if (replace)
3892 sorted[idx] = runp;
3893 else
3895 memmove (&sorted[idx + 1], &sorted[idx],
3896 (number - idx) * sizeof (struct translit_t *));
3897 sorted[idx] = runp;
3898 ++number;
3901 runp = runp->next;
3903 while (runp != NULL);
3905 /* The next step is putting all the possible transliteration
3906 strings in one memory block so that we can write it out.
3907 We need several different blocks:
3908 - index to the from-string array
3909 - from-string array
3910 - index to the to-string array
3911 - to-string array.
3913 from_len = to_len = 0;
3914 for (size_t cnt = 0; cnt < number; ++cnt)
3916 struct translit_to_t *srunp;
3917 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3918 srunp = sorted[cnt]->to;
3919 while (srunp != NULL)
3921 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3922 srunp = srunp->next;
3924 /* Plus one for the extra NUL character marking the end of
3925 the list for the current entry. */
3926 ++to_len;
3929 /* We can allocate the arrays for the results. */
3930 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3931 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3932 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3933 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3935 from_len = 0;
3936 to_len = 0;
3937 for (size_t cnt = 0; cnt < number; ++cnt)
3939 size_t len;
3940 struct translit_to_t *srunp;
3942 ctype->translit_from_idx[cnt] = from_len;
3943 ctype->translit_to_idx[cnt] = to_len;
3945 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3946 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3947 (const wchar_t *) sorted[cnt]->from, len);
3948 from_len += len;
3950 ctype->translit_to_idx[cnt] = to_len;
3951 srunp = sorted[cnt]->to;
3952 while (srunp != NULL)
3954 len = wcslen ((const wchar_t *) srunp->str) + 1;
3955 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3956 (const wchar_t *) srunp->str, len);
3957 to_len += len;
3958 srunp = srunp->next;
3960 ctype->translit_to_tbl[to_len++] = L'\0';
3963 /* Store the information about the length. */
3964 ctype->translit_idx_size = number;
3965 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3966 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3968 else
3970 ctype->translit_from_idx = no_str;
3971 ctype->translit_from_tbl = no_str;
3972 ctype->translit_to_tbl = no_str;
3973 ctype->translit_idx_size = 0;
3974 ctype->translit_from_tbl_size = 0;
3975 ctype->translit_to_tbl_size = 0;