Update copyright dates with scripts/update-copyrights.
[glibc.git] / locale / programs / ld-ctype.c
blobe8690f3e3e134211b37c85861ff0e0bd6291476d
1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <alloca.h>
23 #include <byteswap.h>
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <obstack.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <wchar.h>
31 #include <wctype.h>
32 #include <stdint.h>
33 #include <sys/uio.h>
35 #include "localedef.h"
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
43 #include <assert.h>
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  */
/* NOTE(review): `1 << 31' shifts into the sign bit of a signed int.  This
   only matters when PREDEFINED_CLASSES is defined -- confirm before
   enabling it.  */
# define _ISwspecial1 (1 << 29)
# define _ISwspecial2 (1 << 30)
# define _ISwspecial3 (1 << 31)
#endif


/* The bit used for representing a special class.  BITPOS turns a class
   token into its position counted from tok_upper; BIT and BITw hand that
   position to _ISbit and _ISwbit respectively.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Expand to the entry of CTYPE->COLLECTION that find_idx returns for VALUE,
   usable as an lvalue.  IDX is pasted after the collection name and after
   the matching `_max' and `_act' members; callers pass it empty for a plain
   collection.  */
#define ELEM(ctype, collection, idx, value)                                   \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,     \
             &ctype->collection##_act idx, value)


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Type to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */

/* One replacement string of a transliteration rule.  The alternatives of
   a rule are chained through NEXT.  */
struct translit_to_t
{
  /* The replacement text as 32-bit characters.  */
  uint32_t *str;

  /* Next alternative, or NULL at the end of the chain.  */
  struct translit_to_t *next;
};
/* One transliteration rule: a from-string together with the chain of its
   replacement strings.  Rules are linked through NEXT.  */
struct translit_t
{
  /* The string to be replaced, as 32-bit characters.  */
  uint32_t *from;

  /* File name and line number kept with the rule (presumably the place of
     its definition, for diagnostics -- confirm against the parser).  */
  const char *fname;
  size_t lineno;

  /* Head of the list of replacement strings.  */
  struct translit_to_t *to;

  /* Next rule, or NULL at the end of the list.  */
  struct translit_t *next;
};
/* One entry of the translit_ignore list.  ctype_finish orders the list by
   FROM and ctype_output writes each entry as the triple FROM, TO, STEP.  */
struct translit_ignore_t
{
  /* NOTE(review): presumably the characters FROM .. TO, taken in
     increments of STEP, are the ones to ignore -- confirm against the
     parser and the runtime consumer.  */
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* File name and line number kept with the entry.  */
  const char *fname;
  size_t lineno;

  /* Next entry, or NULL at the end of the list.  */
  struct translit_ignore_t *next;
};
/* Type to describe a transliteration include statement.  */
struct translit_include_t
{
  /* Names of the locale and of the repertoire given in the statement.  */
  const char *copy_locale;
  const char *copy_repertoire;

  /* Next include statement, or NULL at the end of the list.  */
  struct translit_include_t *next;
};
/* Provide some dummy pointer for empty string.  The array holds a single
   zero element, i.e. just the terminator of a 32-bit character string.  */
static uint32_t no_str[] = { 0 };
/* The header "3level.h" is included three times below.  Each time the
   macros TABLE, ELEMENT and DEFAULT are set beforehand to select the name
   of the generated table type, its element type and the value used for
   entries that were never set.  */

/* Sparse table of uint32_t.  */
#define TABLE idx_table
#define ELEMENT uint32_t
#define DEFAULT ((uint32_t) ~0)
/* NOTE(review): presumably suppresses the add_locale_* output routine for
   this instantiation -- confirm in 3level.h.  */
#define NO_ADD_LOCALE
#include "3level.h"

/* Sparse table of uint8_t; 0xff marks an entry that was never set.  */
#define TABLE wcwidth_table
#define ELEMENT uint8_t
#define DEFAULT 0xff
#include "3level.h"

/* Sparse table of int32_t.  The generated add routine is renamed to
   wctrans_table_add_internal for the duration of the include so that this
   file can supply its own wctrans_table_add afterwards.  */
#define TABLE wctrans_table
#define ELEMENT int32_t
#define DEFAULT 0
#define wctrans_table_add wctrans_table_add_internal
#include "3level.h"
#undef wctrans_table_add
/* Enter the mapping WC -> MAPPED_WC into T.  A wctrans_table does not keep
   the target character itself but its distance from the argument, so that
   distance is what gets handed to the generated add routine.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  const uint32_t delta = mapped_wc - wc;

  wctrans_table_add_internal (t, wc, delta);
}
/* Construction of sparse 3-level tables.
   See wchar-lookup.h for their structure and the meaning of p and q.  */

struct wctype_table
{
  /* Parameters.  */
  unsigned int p;
  unsigned int q;
  /* Working representation.  Each level has an array together with the
     number of elements allocated for it (`_alloc') and the number in use
     (`_size').  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;
  /* NOTE(review): presumably the size of the finished, serialized table --
     confirm against the code that fills it in.  */
  size_t result_size;
};
/* Append the class table T to FILE.  The definition follows later in this
   file; ctype_output calls it once per character class.  */
static void add_locale_wctype_table (struct locale_file *file,
                                     struct wctype_table *t);
171 /* The real definition of the struct for the LC_CTYPE locale. */
172 struct locale_ctype_t
174 uint32_t *charnames;
175 size_t charnames_max;
176 size_t charnames_act;
177 /* An index lookup table, to speedup find_idx. */
178 struct idx_table charnames_idx;
180 struct repertoire_t *repertoire;
182 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
183 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
184 size_t nr_charclass;
185 const char *classnames[MAX_NR_CHARCLASS];
186 uint32_t last_class_char;
187 uint32_t class256_collection[256];
188 uint32_t *class_collection;
189 size_t class_collection_max;
190 size_t class_collection_act;
191 uint32_t class_done;
192 uint32_t class_offset;
194 struct charseq **mbdigits;
195 size_t mbdigits_act;
196 size_t mbdigits_max;
197 uint32_t *wcdigits;
198 size_t wcdigits_act;
199 size_t wcdigits_max;
201 struct charseq *mboutdigits[10];
202 uint32_t wcoutdigits[10];
203 size_t outdigits_act;
205 /* If the following number ever turns out to be too small simply
206 increase it. But I doubt it will. --drepper@gnu */
207 #define MAX_NR_CHARMAP 16
208 const char *mapnames[MAX_NR_CHARMAP];
209 uint32_t *map_collection[MAX_NR_CHARMAP];
210 uint32_t map256_collection[2][256];
211 size_t map_collection_max[MAX_NR_CHARMAP];
212 size_t map_collection_act[MAX_NR_CHARMAP];
213 size_t map_collection_nr;
214 size_t last_map_idx;
215 int tomap_done[MAX_NR_CHARMAP];
216 uint32_t map_offset;
218 /* Transliteration information. */
219 struct translit_include_t *translit_include;
220 struct translit_t *translit;
221 struct translit_ignore_t *translit_ignore;
222 uint32_t ntranslit_ignore;
224 uint32_t *default_missing;
225 const char *default_missing_file;
226 size_t default_missing_lineno;
228 uint32_t to_nonascii;
229 uint32_t nonascii_case;
231 /* The arrays for the binary representation. */
232 char_class_t *ctype_b;
233 char_class32_t *ctype32_b;
234 uint32_t **map_b;
235 uint32_t **map32_b;
236 uint32_t **class_b;
237 struct wctype_table *class_3level;
238 struct wctrans_table *map_3level;
239 uint32_t *class_name_ptr;
240 uint32_t *map_name_ptr;
241 struct wcwidth_table width;
242 uint32_t mb_cur_max;
243 const char *codeset_name;
244 uint32_t *translit_from_idx;
245 uint32_t *translit_from_tbl;
246 uint32_t *translit_to_idx;
247 uint32_t *translit_to_tbl;
248 uint32_t translit_idx_size;
249 size_t translit_from_tbl_size;
250 size_t translit_to_tbl_size;
252 struct obstack mempool;
/* Marker for an empty slot.  This has the value 0xFFFFFFFF, regardless
   whether 'int' is 16 bit, 32 bit, or 64 bit.  */
#define EMPTY ((uint32_t) ~0)


/* Allocation hooks used by the obstack macros: chunks come from xmalloc
   and are released with free.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Prototypes for local functions.  */

/* Set up the LC_CTYPE data of LOCALE, either freshly allocated or shared
   with COPY_LOCALE.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           const struct charmap_t *charmap,
                           struct localedef_t *copy_locale,
                           int ignore_content);
/* Register the character class NAME in CTYPE.  */
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
/* Register the character mapping NAME in CTYPE.  */
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, const struct charmap_t *charmap);
/* Return a pointer to the entry for character IDX in *TABLE.  Callers in
   this file also pass NULL for TABLE, MAX and ACT when only the side
   effect on the name array is wanted.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
/* Set default value for classes not specified.  */
static void set_class_defaults (struct locale_ctype_t *ctype,
                                const struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
/* Build the arrays for the binary representation; called by ctype_output
   before anything is written.  */
static void allocate_arrays (struct locale_ctype_t *ctype,
                             const struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Names under which the ten decimal digits are looked up: the spelled-out
   names and the names formed from the Unicode code points U+0030..U+0039.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
static const char *uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
/* The digits themselves; entry N is the character for the value N.  */
static const unsigned char digits[] = "0123456789";
298 static void
299 ctype_startup (struct linereader *lr, struct localedef_t *locale,
300 const struct charmap_t *charmap,
301 struct localedef_t *copy_locale, int ignore_content)
303 unsigned int cnt;
304 struct locale_ctype_t *ctype;
306 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
308 if (copy_locale == NULL)
310 /* Allocate the needed room. */
311 locale->categories[LC_CTYPE].ctype = ctype =
312 (struct locale_ctype_t *) xcalloc (1,
313 sizeof (struct locale_ctype_t));
315 /* We have seen no names yet. */
316 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
317 ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max
318 * sizeof (uint32_t));
319 for (cnt = 0; cnt < 256; ++cnt)
320 ctype->charnames[cnt] = cnt;
321 ctype->charnames_act = 256;
322 idx_table_init (&ctype->charnames_idx);
324 /* Fill character class information. */
325 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
326 /* The order of the following instructions determines the bit
327 positions! */
328 ctype_class_new (lr, ctype, "upper");
329 ctype_class_new (lr, ctype, "lower");
330 ctype_class_new (lr, ctype, "alpha");
331 ctype_class_new (lr, ctype, "digit");
332 ctype_class_new (lr, ctype, "xdigit");
333 ctype_class_new (lr, ctype, "space");
334 ctype_class_new (lr, ctype, "print");
335 ctype_class_new (lr, ctype, "graph");
336 ctype_class_new (lr, ctype, "blank");
337 ctype_class_new (lr, ctype, "cntrl");
338 ctype_class_new (lr, ctype, "punct");
339 ctype_class_new (lr, ctype, "alnum");
340 #ifdef PREDEFINED_CLASSES
341 /* The following are extensions from ISO 14652. */
342 ctype_class_new (lr, ctype, "left_to_right");
343 ctype_class_new (lr, ctype, "right_to_left");
344 ctype_class_new (lr, ctype, "num_terminator");
345 ctype_class_new (lr, ctype, "num_separator");
346 ctype_class_new (lr, ctype, "segment_separator");
347 ctype_class_new (lr, ctype, "block_separator");
348 ctype_class_new (lr, ctype, "direction_control");
349 ctype_class_new (lr, ctype, "sym_swap_layout");
350 ctype_class_new (lr, ctype, "char_shape_selector");
351 ctype_class_new (lr, ctype, "num_shape_selector");
352 ctype_class_new (lr, ctype, "non_spacing");
353 ctype_class_new (lr, ctype, "non_spacing_level3");
354 ctype_class_new (lr, ctype, "normal_connect");
355 ctype_class_new (lr, ctype, "r_connect");
356 ctype_class_new (lr, ctype, "no_connect");
357 ctype_class_new (lr, ctype, "no_connect-space");
358 ctype_class_new (lr, ctype, "vowel_connect");
359 #endif
361 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
362 ctype->class_collection
363 = (uint32_t *) xcalloc (sizeof (unsigned long int),
364 ctype->class_collection_max);
365 ctype->class_collection_act = 256;
367 /* Fill character map information. */
368 ctype->last_map_idx = MAX_NR_CHARMAP;
369 ctype_map_new (lr, ctype, "toupper", charmap);
370 ctype_map_new (lr, ctype, "tolower", charmap);
371 #ifdef PREDEFINED_CLASSES
372 ctype_map_new (lr, ctype, "tosymmetric", charmap);
373 #endif
375 /* Fill first 256 entries in `toXXX' arrays. */
376 for (cnt = 0; cnt < 256; ++cnt)
378 ctype->map_collection[0][cnt] = cnt;
379 ctype->map_collection[1][cnt] = cnt;
380 #ifdef PREDEFINED_CLASSES
381 ctype->map_collection[2][cnt] = cnt;
382 #endif
383 ctype->map256_collection[0][cnt] = cnt;
384 ctype->map256_collection[1][cnt] = cnt;
387 if (enc_not_ascii_compatible)
388 ctype->to_nonascii = 1;
390 obstack_init (&ctype->mempool);
392 else
393 ctype = locale->categories[LC_CTYPE].ctype =
394 copy_locale->categories[LC_CTYPE].ctype;
399 void
400 ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
402 /* See POSIX.2, table 2-6 for the meaning of the following table. */
403 #define NCLASS 12
404 static const struct
406 const char *name;
407 const char allow[NCLASS];
409 valid_table[NCLASS] =
411 /* The order is important. See token.h for more information.
412 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
413 { "upper", "--MX-XDDXXX-" },
414 { "lower", "--MX-XDDXXX-" },
415 { "alpha", "---X-XDDXXX-" },
416 { "digit", "XXX--XDDXXX-" },
417 { "xdigit", "-----XDDXXX-" },
418 { "space", "XXXXX------X" },
419 { "print", "---------X--" },
420 { "graph", "---------X--" },
421 { "blank", "XXXXXM-----X" },
422 { "cntrl", "XXXXX-XX--XX" },
423 { "punct", "XXXXX-DD-X-X" },
424 { "alnum", "-----XDDXXX-" }
426 size_t cnt;
427 int cls1, cls2;
428 uint32_t space_value;
429 struct charseq *space_seq;
430 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
431 int warned;
432 const void *key;
433 size_t len;
434 void *vdata;
435 void *curs;
437 /* Now resolve copying and also handle completely missing definitions. */
438 if (ctype == NULL)
440 const char *repertoire_name;
442 /* First see whether we were supposed to copy. If yes, find the
443 actual definition. */
444 if (locale->copy_name[LC_CTYPE] != NULL)
446 /* Find the copying locale. This has to happen transitively since
447 the locale we are copying from might also copying another one. */
448 struct localedef_t *from = locale;
451 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
452 from->repertoire_name, charmap);
453 while (from->categories[LC_CTYPE].ctype == NULL
454 && from->copy_name[LC_CTYPE] != NULL);
456 ctype = locale->categories[LC_CTYPE].ctype
457 = from->categories[LC_CTYPE].ctype;
460 /* If there is still no definition issue an warning and create an
461 empty one. */
462 if (ctype == NULL)
464 if (! be_quiet)
465 WITH_CUR_LOCALE (error (0, 0, _("\
466 No definition for %s category found"), "LC_CTYPE"));
467 ctype_startup (NULL, locale, charmap, NULL, 0);
468 ctype = locale->categories[LC_CTYPE].ctype;
471 /* Get the repertoire we have to use. */
472 repertoire_name = locale->repertoire_name ?: repertoire_global;
473 if (repertoire_name != NULL)
474 ctype->repertoire = repertoire_read (repertoire_name);
477 /* We need the name of the currently used 8-bit character set to
478 make correct conversion between this 8-bit representation and the
479 ISO 10646 character set used internally for wide characters. */
480 ctype->codeset_name = charmap->code_set_name;
481 if (ctype->codeset_name == NULL)
483 if (! be_quiet)
484 WITH_CUR_LOCALE (error (0, 0, _("\
485 No character set name specified in charmap")));
486 ctype->codeset_name = "//UNKNOWN//";
489 /* Set default value for classes not specified. */
490 set_class_defaults (ctype, charmap, ctype->repertoire);
492 /* Check according to table. */
493 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
495 uint32_t tmp = ctype->class_collection[cnt];
497 if (tmp != 0)
499 for (cls1 = 0; cls1 < NCLASS; ++cls1)
500 if ((tmp & _ISwbit (cls1)) != 0)
501 for (cls2 = 0; cls2 < NCLASS; ++cls2)
502 if (valid_table[cls1].allow[cls2] != '-')
504 int eq = (tmp & _ISwbit (cls2)) != 0;
505 switch (valid_table[cls1].allow[cls2])
507 case 'M':
508 if (!eq)
510 uint32_t value = ctype->charnames[cnt];
512 if (!be_quiet)
513 WITH_CUR_LOCALE (error (0, 0, _("\
514 character L'\\u%0*x' in class `%s' must be in class `%s'"),
515 value > 0xffff ? 8 : 4,
516 value,
517 valid_table[cls1].name,
518 valid_table[cls2].name));
520 break;
522 case 'X':
523 if (eq)
525 uint32_t value = ctype->charnames[cnt];
527 if (!be_quiet)
528 WITH_CUR_LOCALE (error (0, 0, _("\
529 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
530 value > 0xffff ? 8 : 4,
531 value,
532 valid_table[cls1].name,
533 valid_table[cls2].name));
535 break;
537 case 'D':
538 ctype->class_collection[cnt] |= _ISwbit (cls2);
539 break;
541 default:
542 WITH_CUR_LOCALE (error (5, 0, _("\
543 internal error in %s, line %u"), __FUNCTION__, __LINE__));
549 for (cnt = 0; cnt < 256; ++cnt)
551 uint32_t tmp = ctype->class256_collection[cnt];
553 if (tmp != 0)
555 for (cls1 = 0; cls1 < NCLASS; ++cls1)
556 if ((tmp & _ISbit (cls1)) != 0)
557 for (cls2 = 0; cls2 < NCLASS; ++cls2)
558 if (valid_table[cls1].allow[cls2] != '-')
560 int eq = (tmp & _ISbit (cls2)) != 0;
561 switch (valid_table[cls1].allow[cls2])
563 case 'M':
564 if (!eq)
566 char buf[17];
568 snprintf (buf, sizeof buf, "\\%Zo", cnt);
570 if (!be_quiet)
571 WITH_CUR_LOCALE (error (0, 0, _("\
572 character '%s' in class `%s' must be in class `%s'"),
573 buf,
574 valid_table[cls1].name,
575 valid_table[cls2].name));
577 break;
579 case 'X':
580 if (eq)
582 char buf[17];
584 snprintf (buf, sizeof buf, "\\%Zo", cnt);
586 if (!be_quiet)
587 WITH_CUR_LOCALE (error (0, 0, _("\
588 character '%s' in class `%s' must not be in class `%s'"),
589 buf,
590 valid_table[cls1].name,
591 valid_table[cls2].name));
593 break;
595 case 'D':
596 ctype->class256_collection[cnt] |= _ISbit (cls2);
597 break;
599 default:
600 WITH_CUR_LOCALE (error (5, 0, _("\
601 internal error in %s, line %u"), __FUNCTION__, __LINE__));
607 /* ... and now test <SP> as a special case. */
608 space_value = 32;
609 if (((cnt = BITPOS (tok_space),
610 (ELEM (ctype, class_collection, , space_value)
611 & BITw (tok_space)) == 0)
612 || (cnt = BITPOS (tok_blank),
613 (ELEM (ctype, class_collection, , space_value)
614 & BITw (tok_blank)) == 0)))
616 if (!be_quiet)
617 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
618 valid_table[cnt].name));
620 else if (((cnt = BITPOS (tok_punct),
621 (ELEM (ctype, class_collection, , space_value)
622 & BITw (tok_punct)) != 0)
623 || (cnt = BITPOS (tok_graph),
624 (ELEM (ctype, class_collection, , space_value)
625 & BITw (tok_graph))
626 != 0)))
628 if (!be_quiet)
629 WITH_CUR_LOCALE (error (0, 0, _("\
630 <SP> character must not be in class `%s'"),
631 valid_table[cnt].name));
633 else
634 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
636 space_seq = charmap_find_value (charmap, "SP", 2);
637 if (space_seq == NULL)
638 space_seq = charmap_find_value (charmap, "space", 5);
639 if (space_seq == NULL)
640 space_seq = charmap_find_value (charmap, "U00000020", 9);
641 if (space_seq == NULL || space_seq->nbytes != 1)
643 if (!be_quiet)
644 WITH_CUR_LOCALE (error (0, 0, _("\
645 character <SP> not defined in character map")));
647 else if (((cnt = BITPOS (tok_space),
648 (ctype->class256_collection[space_seq->bytes[0]]
649 & BIT (tok_space)) == 0)
650 || (cnt = BITPOS (tok_blank),
651 (ctype->class256_collection[space_seq->bytes[0]]
652 & BIT (tok_blank)) == 0)))
654 if (!be_quiet)
655 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
656 valid_table[cnt].name));
658 else if (((cnt = BITPOS (tok_punct),
659 (ctype->class256_collection[space_seq->bytes[0]]
660 & BIT (tok_punct)) != 0)
661 || (cnt = BITPOS (tok_graph),
662 (ctype->class256_collection[space_seq->bytes[0]]
663 & BIT (tok_graph)) != 0)))
665 if (!be_quiet)
666 WITH_CUR_LOCALE (error (0, 0, _("\
667 <SP> character must not be in class `%s'"),
668 valid_table[cnt].name));
670 else
671 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
673 /* Check whether all single-byte characters make to their upper/lowercase
674 equivalent according to the ASCII rules. */
675 for (cnt = 'A'; cnt <= 'Z'; ++cnt)
677 uint32_t uppval = ctype->map256_collection[0][cnt];
678 uint32_t lowval = ctype->map256_collection[1][cnt];
679 uint32_t lowuppval = ctype->map256_collection[0][lowval];
680 uint32_t lowlowval = ctype->map256_collection[1][lowval];
682 if (uppval != cnt
683 || lowval != cnt + 0x20
684 || lowuppval != cnt
685 || lowlowval != cnt + 0x20)
686 ctype->nonascii_case = 1;
688 for (cnt = 0; cnt < 256; ++cnt)
689 if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
690 if (ctype->map256_collection[0][cnt] != cnt
691 || ctype->map256_collection[1][cnt] != cnt)
692 ctype->nonascii_case = 1;
694 /* Now that the tests are done make sure the name array contains all
695 characters which are handled in the WIDTH section of the
696 character set definition file. */
697 if (charmap->width_rules != NULL)
698 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
700 unsigned char bytes[charmap->mb_cur_max];
701 int nbytes = charmap->width_rules[cnt].from->nbytes;
703 /* We have the range of character for which the width is
704 specified described using byte sequences of the multibyte
705 charset. We have to convert this to UCS4 now. And we
706 cannot simply convert the beginning and the end of the
707 sequence, we have to iterate over the byte sequence and
708 convert it for every single character. */
709 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
711 while (nbytes < charmap->width_rules[cnt].to->nbytes
712 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
713 nbytes) <= 0)
715 /* Find the UCS value for `bytes'. */
716 int inner;
717 uint32_t wch;
718 struct charseq *seq
719 = charmap_find_symbol (charmap, (char *) bytes, nbytes);
721 if (seq == NULL)
722 wch = ILLEGAL_CHAR_VALUE;
723 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
724 wch = seq->ucs4;
725 else
726 wch = repertoire_find_value (ctype->repertoire, seq->name,
727 strlen (seq->name));
729 if (wch != ILLEGAL_CHAR_VALUE)
730 /* We are only interested in the side-effects of the
731 `find_idx' call. It will add appropriate entries in
732 the name array if this is necessary. */
733 (void) find_idx (ctype, NULL, NULL, NULL, wch);
735 /* "Increment" the bytes sequence. */
736 inner = nbytes - 1;
737 while (inner >= 0 && bytes[inner] == 0xff)
738 --inner;
740 if (inner < 0)
742 /* We have to extend the byte sequence. */
743 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
744 break;
746 bytes[0] = 1;
747 memset (&bytes[1], 0, nbytes);
748 ++nbytes;
750 else
752 ++bytes[inner];
753 while (++inner < nbytes)
754 bytes[inner] = 0;
759 /* Now set all the other characters of the character set to the
760 default width. */
761 curs = NULL;
762 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
764 struct charseq *data = (struct charseq *) vdata;
766 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
767 data->ucs4 = repertoire_find_value (ctype->repertoire,
768 data->name, len);
770 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
771 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
774 /* There must be a multiple of 10 digits. */
775 if (ctype->mbdigits_act % 10 != 0)
777 assert (ctype->mbdigits_act == ctype->wcdigits_act);
778 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
779 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
780 WITH_CUR_LOCALE (error (0, 0, _("\
781 `digit' category has not entries in groups of ten")));
784 /* Check the input digits. There must be a multiple of ten available.
785 In each group it could be that one or the other character is missing.
786 In this case the whole group must be removed. */
787 cnt = 0;
788 while (cnt < ctype->mbdigits_act)
790 size_t inner;
791 for (inner = 0; inner < 10; ++inner)
792 if (ctype->mbdigits[cnt + inner] == NULL)
793 break;
795 if (inner == 10)
796 cnt += 10;
797 else
799 /* Remove the group. */
800 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
801 ((ctype->wcdigits_act - cnt - 10)
802 * sizeof (ctype->mbdigits[0])));
803 ctype->mbdigits_act -= 10;
807 /* If no input digits are given use the default. */
808 if (ctype->mbdigits_act == 0)
810 if (ctype->mbdigits_max == 0)
812 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
813 10 * sizeof (struct charseq *));
814 ctype->mbdigits_max = 10;
817 for (cnt = 0; cnt < 10; ++cnt)
819 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
820 (char *) digits + cnt, 1);
821 if (ctype->mbdigits[cnt] == NULL)
823 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
824 longnames[cnt],
825 strlen (longnames[cnt]));
826 if (ctype->mbdigits[cnt] == NULL)
828 /* Hum, this ain't good. */
829 WITH_CUR_LOCALE (error (0, 0, _("\
830 no input digits defined and none of the standard names in the charmap")));
832 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
833 sizeof (struct charseq) + 1);
835 /* This is better than nothing. */
836 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
837 ctype->mbdigits[cnt]->nbytes = 1;
842 ctype->mbdigits_act = 10;
845 /* Check the wide character input digits. There must be a multiple
846 of ten available. In each group it could be that one or the other
847 character is missing. In this case the whole group must be
848 removed. */
849 cnt = 0;
850 while (cnt < ctype->wcdigits_act)
852 size_t inner;
853 for (inner = 0; inner < 10; ++inner)
854 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
855 break;
857 if (inner == 10)
858 cnt += 10;
859 else
861 /* Remove the group. */
862 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
863 ((ctype->wcdigits_act - cnt - 10)
864 * sizeof (ctype->wcdigits[0])));
865 ctype->wcdigits_act -= 10;
869 /* If no input digits are given use the default. */
870 if (ctype->wcdigits_act == 0)
872 if (ctype->wcdigits_max == 0)
874 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
875 10 * sizeof (uint32_t));
876 ctype->wcdigits_max = 10;
879 for (cnt = 0; cnt < 10; ++cnt)
880 ctype->wcdigits[cnt] = L'0' + cnt;
882 ctype->mbdigits_act = 10;
885 /* Check the outdigits. */
886 warned = 0;
887 for (cnt = 0; cnt < 10; ++cnt)
888 if (ctype->mboutdigits[cnt] == NULL)
890 static struct charseq replace[2];
892 if (!warned)
894 WITH_CUR_LOCALE (error (0, 0, _("\
895 not all characters used in `outdigit' are available in the charmap")));
896 warned = 1;
899 replace[0].nbytes = 1;
900 replace[0].bytes[0] = '?';
901 replace[0].bytes[1] = '\0';
902 ctype->mboutdigits[cnt] = &replace[0];
905 warned = 0;
906 for (cnt = 0; cnt < 10; ++cnt)
907 if (ctype->wcoutdigits[cnt] == 0)
909 if (!warned)
911 WITH_CUR_LOCALE (error (0, 0, _("\
912 not all characters used in `outdigit' are available in the repertoire")));
913 warned = 1;
916 ctype->wcoutdigits[cnt] = L'?';
919 /* Sort the entries in the translit_ignore list. */
920 if (ctype->translit_ignore != NULL)
922 struct translit_ignore_t *firstp = ctype->translit_ignore;
923 struct translit_ignore_t *runp;
925 ctype->ntranslit_ignore = 1;
927 for (runp = firstp->next; runp != NULL; runp = runp->next)
929 struct translit_ignore_t *lastp = NULL;
930 struct translit_ignore_t *cmpp;
932 ++ctype->ntranslit_ignore;
934 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
935 if (runp->from < cmpp->from)
936 break;
938 runp->next = lastp;
939 if (lastp == NULL)
940 firstp = runp;
943 ctype->translit_ignore = firstp;
948 void
949 ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
950 const char *output_path)
952 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
953 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
954 + ctype->nr_charclass + ctype->map_collection_nr);
955 struct locale_file file;
956 uint32_t default_missing_len;
957 size_t elem, cnt;
959 /* Now prepare the output: Find the sizes of the table we can use. */
960 allocate_arrays (ctype, charmap, ctype->repertoire);
962 default_missing_len = (ctype->default_missing
963 ? wcslen ((wchar_t *) ctype->default_missing)
964 : 0);
966 init_locale_data (&file, nelems);
967 for (elem = 0; elem < nelems; ++elem)
969 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
970 switch (elem)
972 #define CTYPE_EMPTY(name) \
973 case name: \
974 add_locale_empty (&file); \
975 break
977 CTYPE_EMPTY(_NL_CTYPE_GAP1);
978 CTYPE_EMPTY(_NL_CTYPE_GAP2);
979 CTYPE_EMPTY(_NL_CTYPE_GAP3);
980 CTYPE_EMPTY(_NL_CTYPE_GAP4);
981 CTYPE_EMPTY(_NL_CTYPE_GAP5);
982 CTYPE_EMPTY(_NL_CTYPE_GAP6);
984 #define CTYPE_RAW_DATA(name, base, size) \
985 case _NL_ITEM_INDEX (name): \
986 add_locale_raw_data (&file, base, size); \
987 break
989 CTYPE_RAW_DATA (_NL_CTYPE_CLASS,
990 ctype->ctype_b,
991 (256 + 128) * sizeof (char_class_t));
993 #define CTYPE_UINT32_ARRAY(name, base, n_elems) \
994 case _NL_ITEM_INDEX (name): \
995 add_locale_uint32_array (&file, base, n_elems); \
996 break
998 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128);
999 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128);
1000 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256);
1001 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256);
1002 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32,
1003 ctype->ctype32_b,
1004 256 * sizeof (char_class32_t));
1006 #define CTYPE_UINT32(name, value) \
1007 case _NL_ITEM_INDEX (name): \
1008 add_locale_uint32 (&file, value); \
1009 break
1011 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset);
1012 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset);
1013 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size);
1015 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX,
1016 ctype->translit_from_idx,
1017 ctype->translit_idx_size);
1019 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL,
1020 ctype->translit_from_tbl,
1021 ctype->translit_from_tbl_size
1022 / sizeof (uint32_t));
1024 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX,
1025 ctype->translit_to_idx,
1026 ctype->translit_idx_size);
1028 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL,
1029 ctype->translit_to_tbl,
1030 ctype->translit_to_tbl_size / sizeof (uint32_t));
1032 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
1033 /* The class name array. */
1034 start_locale_structure (&file);
1035 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1036 add_locale_string (&file, ctype->classnames[cnt]);
1037 add_locale_char (&file, 0);
1038 align_locale_data (&file, LOCFILE_ALIGN);
1039 end_locale_structure (&file);
1040 break;
1042 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
1043 /* The class name array. */
1044 start_locale_structure (&file);
1045 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1046 add_locale_string (&file, ctype->mapnames[cnt]);
1047 add_locale_char (&file, 0);
1048 align_locale_data (&file, LOCFILE_ALIGN);
1049 end_locale_structure (&file);
1050 break;
1052 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH):
1053 add_locale_wcwidth_table (&file, &ctype->width);
1054 break;
1056 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max);
1058 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1059 add_locale_string (&file, ctype->codeset_name);
1060 break;
1062 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii);
1064 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case);
1066 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1067 add_locale_uint32 (&file, ctype->mbdigits_act / 10);
1068 break;
1070 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1071 add_locale_uint32 (&file, ctype->wcdigits_act / 10);
1072 break;
1074 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1075 start_locale_structure (&file);
1076 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1077 cnt < ctype->mbdigits_act; cnt += 10)
1079 add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes,
1080 ctype->mbdigits[cnt]->nbytes);
1081 add_locale_char (&file, 0);
1083 end_locale_structure (&file);
1084 break;
1086 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1087 start_locale_structure (&file);
1088 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1089 add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes,
1090 ctype->mboutdigits[cnt]->nbytes);
1091 add_locale_char (&file, 0);
1092 end_locale_structure (&file);
1093 break;
1095 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1096 start_locale_structure (&file);
1097 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1098 cnt < ctype->wcdigits_act; cnt += 10)
1099 add_locale_uint32 (&file, ctype->wcdigits[cnt]);
1100 end_locale_structure (&file);
1101 break;
1103 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1104 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1105 add_locale_uint32 (&file, ctype->wcoutdigits[cnt]);
1106 break;
1108 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1109 add_locale_uint32 (&file, default_missing_len);
1110 break;
1112 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1113 add_locale_uint32_array (&file, ctype->default_missing,
1114 default_missing_len);
1115 break;
1117 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1118 add_locale_uint32 (&file, ctype->ntranslit_ignore);
1119 break;
1121 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1122 start_locale_structure (&file);
1124 struct translit_ignore_t *runp;
1125 for (runp = ctype->translit_ignore; runp != NULL;
1126 runp = runp->next)
1128 add_locale_uint32 (&file, runp->from);
1129 add_locale_uint32 (&file, runp->to);
1130 add_locale_uint32 (&file, runp->step);
1133 end_locale_structure (&file);
1134 break;
1136 default:
1137 assert (! "unknown CTYPE element");
1139 else
1141 /* Handle extra maps. */
1142 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1143 if (nr < ctype->nr_charclass)
1145 start_locale_prelude (&file);
1146 add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32);
1147 end_locale_prelude (&file);
1148 add_locale_wctype_table (&file, &ctype->class_3level[nr]);
1150 else
1152 nr -= ctype->nr_charclass;
1153 assert (nr < ctype->map_collection_nr);
1154 add_locale_wctrans_table (&file, &ctype->map_3level[nr]);
1159 write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file);
1163 /* Local functions. */
1164 static void
1165 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1166 const char *name)
1168 size_t cnt;
1170 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1171 if (strcmp (ctype->classnames[cnt], name) == 0)
1172 break;
1174 if (cnt < ctype->nr_charclass)
1176 lr_error (lr, _("character class `%s' already defined"), name);
1177 return;
1180 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1181 /* Exit code 2 is prescribed in P1003.2b. */
1182 WITH_CUR_LOCALE (error (2, 0, _("\
1183 implementation limit: no more than %Zd character classes allowed"),
1184 MAX_NR_CHARCLASS));
1186 ctype->classnames[ctype->nr_charclass++] = name;
1190 static void
1191 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1192 const char *name, const struct charmap_t *charmap)
1194 size_t max_chars = 0;
1195 size_t cnt;
1197 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1199 if (strcmp (ctype->mapnames[cnt], name) == 0)
1200 break;
1202 if (max_chars < ctype->map_collection_max[cnt])
1203 max_chars = ctype->map_collection_max[cnt];
1206 if (cnt < ctype->map_collection_nr)
1208 lr_error (lr, _("character map `%s' already defined"), name);
1209 return;
1212 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1213 /* Exit code 2 is prescribed in P1003.2b. */
1214 WITH_CUR_LOCALE (error (2, 0, _("\
1215 implementation limit: no more than %d character maps allowed"),
1216 MAX_NR_CHARMAP));
1218 ctype->mapnames[cnt] = name;
1220 if (max_chars == 0)
1221 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1222 else
1223 ctype->map_collection_max[cnt] = max_chars;
1225 ctype->map_collection[cnt] = (uint32_t *)
1226 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1227 ctype->map_collection_act[cnt] = 256;
1229 ++ctype->map_collection_nr;
1233 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1234 is possible if we only want to extend the name array. */
1235 static uint32_t *
1236 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1237 size_t *act, uint32_t idx)
1239 size_t cnt;
1241 if (idx < 256)
1242 return table == NULL ? NULL : &(*table)[idx];
1244 /* Use the charnames_idx lookup table instead of the slow search loop. */
1245 #if 1
1246 cnt = idx_table_get (&ctype->charnames_idx, idx);
1247 if (cnt == EMPTY)
1248 /* Not found. */
1249 cnt = ctype->charnames_act;
1250 #else
1251 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1252 if (ctype->charnames[cnt] == idx)
1253 break;
1254 #endif
1256 /* We have to distinguish two cases: the name is found or not. */
1257 if (cnt == ctype->charnames_act)
1259 /* Extend the name array. */
1260 if (ctype->charnames_act == ctype->charnames_max)
1262 ctype->charnames_max *= 2;
1263 ctype->charnames = (uint32_t *)
1264 xrealloc (ctype->charnames,
1265 sizeof (uint32_t) * ctype->charnames_max);
1267 ctype->charnames[ctype->charnames_act++] = idx;
1268 idx_table_add (&ctype->charnames_idx, idx, cnt);
1271 if (table == NULL)
1272 /* We have done everything we are asked to do. */
1273 return NULL;
1275 if (max == NULL)
1276 /* The caller does not want to extend the table. */
1277 return (cnt >= *act ? NULL : &(*table)[cnt]);
1279 if (cnt >= *act)
1281 if (cnt >= *max)
1283 size_t old_max = *max;
1285 *max *= 2;
1286 while (*max <= cnt);
1288 *table =
1289 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1290 memset (&(*table)[old_max], '\0',
1291 (*max - old_max) * sizeof (uint32_t));
1294 *act = cnt + 1;
1297 return &(*table)[cnt];
1301 static int
1302 get_character (struct token *now, const struct charmap_t *charmap,
1303 struct repertoire_t *repertoire,
1304 struct charseq **seqp, uint32_t *wchp)
1306 if (now->tok == tok_bsymbol)
1308 /* This will hopefully be the normal case. */
1309 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1310 now->val.str.lenmb);
1311 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1312 now->val.str.lenmb);
1314 else if (now->tok == tok_ucs4)
1316 char utmp[10];
1318 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1319 *seqp = charmap_find_value (charmap, utmp, 9);
1321 if (*seqp == NULL)
1322 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1324 if (*seqp == NULL)
1326 /* Compute the value in the charmap from the UCS value. */
1327 const char *symbol = repertoire_find_symbol (repertoire,
1328 now->val.ucs4);
1330 if (symbol == NULL)
1331 *seqp = NULL;
1332 else
1333 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1335 if (*seqp == NULL)
1337 if (repertoire != NULL)
1339 /* Insert a negative entry. */
1340 static const struct charseq negative
1341 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1342 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1343 sizeof (uint32_t));
1344 *newp = now->val.ucs4;
1346 insert_entry (&repertoire->seq_table, newp,
1347 sizeof (uint32_t), (void *) &negative);
1350 else
1351 (*seqp)->ucs4 = now->val.ucs4;
1353 else if ((*seqp)->ucs4 != now->val.ucs4)
1354 *seqp = NULL;
1356 *wchp = now->val.ucs4;
1358 else if (now->tok == tok_charcode)
1360 /* We must map from the byte code to UCS4. */
1361 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1362 now->val.str.lenmb);
1364 if (*seqp == NULL)
1365 *wchp = ILLEGAL_CHAR_VALUE;
1366 else
1368 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1369 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1370 strlen ((*seqp)->name));
1371 *wchp = (*seqp)->ucs4;
1374 else
1375 return 1;
1377 return 0;
1381 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1382 the .(2). counterparts. */
1383 static void
1384 charclass_symbolic_ellipsis (struct linereader *ldfile,
1385 struct locale_ctype_t *ctype,
1386 const struct charmap_t *charmap,
1387 struct repertoire_t *repertoire,
1388 struct token *now,
1389 const char *last_str,
1390 unsigned long int class256_bit,
1391 unsigned long int class_bit, int base,
1392 int ignore_content, int handle_digits, int step)
1394 const char *nowstr = now->val.str.startmb;
1395 char tmp[now->val.str.lenmb + 1];
1396 const char *cp;
1397 char *endp;
1398 unsigned long int from;
1399 unsigned long int to;
1401 /* We have to compute the ellipsis values using the symbolic names. */
1402 assert (last_str != NULL);
1404 if (strlen (last_str) != now->val.str.lenmb)
1406 invalid_range:
1407 lr_error (ldfile,
1408 _("`%s' and `%.*s' are not valid names for symbolic range"),
1409 last_str, (int) now->val.str.lenmb, nowstr);
1410 return;
1413 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1414 /* Nothing to do, the names are the same. */
1415 return;
1417 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1420 errno = 0;
1421 from = strtoul (cp, &endp, base);
1422 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1423 goto invalid_range;
1425 to = strtoul (nowstr + (cp - last_str), &endp, base);
1426 if ((to == UINT_MAX && errno == ERANGE)
1427 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1428 goto invalid_range;
1430 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1431 if (!ignore_content)
1433 now->val.str.startmb = tmp;
1434 while ((from += step) <= to)
1436 struct charseq *seq;
1437 uint32_t wch;
1439 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1440 (int) (cp - last_str), last_str,
1441 (int) (now->val.str.lenmb - (cp - last_str)),
1442 from);
1444 get_character (now, charmap, repertoire, &seq, &wch);
1446 if (seq != NULL && seq->nbytes == 1)
1447 /* Yep, we can store information about this byte sequence. */
1448 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1450 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1451 /* We have the UCS4 position. */
1452 *find_idx (ctype, &ctype->class_collection,
1453 &ctype->class_collection_max,
1454 &ctype->class_collection_act, wch) |= class_bit;
1456 if (handle_digits == 1)
1458 /* We must store the digit values. */
1459 if (ctype->mbdigits_act == ctype->mbdigits_max)
1461 ctype->mbdigits_max *= 2;
1462 ctype->mbdigits = xrealloc (ctype->mbdigits,
1463 (ctype->mbdigits_max
1464 * sizeof (char *)));
1465 ctype->wcdigits_max *= 2;
1466 ctype->wcdigits = xrealloc (ctype->wcdigits,
1467 (ctype->wcdigits_max
1468 * sizeof (uint32_t)));
1471 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1472 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1474 else if (handle_digits == 2)
1476 /* We must store the digit values. */
1477 if (ctype->outdigits_act >= 10)
1479 lr_error (ldfile, _("\
1480 %s: field `%s' does not contain exactly ten entries"),
1481 "LC_CTYPE", "outdigit");
1482 return;
1485 ctype->mboutdigits[ctype->outdigits_act] = seq;
1486 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1487 ++ctype->outdigits_act;
1494 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1495 static void
1496 charclass_ucs4_ellipsis (struct linereader *ldfile,
1497 struct locale_ctype_t *ctype,
1498 const struct charmap_t *charmap,
1499 struct repertoire_t *repertoire,
1500 struct token *now, uint32_t last_wch,
1501 unsigned long int class256_bit,
1502 unsigned long int class_bit, int ignore_content,
1503 int handle_digits, int step)
1505 if (last_wch > now->val.ucs4)
1507 lr_error (ldfile, _("\
1508 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1509 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1510 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1511 return;
1514 if (!ignore_content)
1515 while ((last_wch += step) <= now->val.ucs4)
1517 /* We have to find out whether there is a byte sequence corresponding
1518 to this UCS4 value. */
1519 struct charseq *seq;
1520 char utmp[10];
1522 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1523 seq = charmap_find_value (charmap, utmp, 9);
1524 if (seq == NULL)
1526 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1527 seq = charmap_find_value (charmap, utmp, 5);
1530 if (seq == NULL)
1531 /* Try looking in the repertoire map. */
1532 seq = repertoire_find_seq (repertoire, last_wch);
1534 /* If this is the first time we look for this sequence create a new
1535 entry. */
1536 if (seq == NULL)
1538 static const struct charseq negative
1539 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1541 /* Find the symbolic name for this UCS4 value. */
1542 if (repertoire != NULL)
1544 const char *symbol = repertoire_find_symbol (repertoire,
1545 last_wch);
1546 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1547 sizeof (uint32_t));
1548 *newp = last_wch;
1550 if (symbol != NULL)
1551 /* We have a name, now search the multibyte value. */
1552 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1554 if (seq == NULL)
1555 /* We have to create a fake entry. */
1556 seq = (struct charseq *) &negative;
1557 else
1558 seq->ucs4 = last_wch;
1560 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1561 seq);
1563 else
1564 /* We have to create a fake entry. */
1565 seq = (struct charseq *) &negative;
1568 /* We have a name, now search the multibyte value. */
1569 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1570 /* Yep, we can store information about this byte sequence. */
1571 ctype->class256_collection[(size_t) seq->bytes[0]]
1572 |= class256_bit;
1574 /* And of course we have the UCS4 position. */
1575 if (class_bit != 0)
1576 *find_idx (ctype, &ctype->class_collection,
1577 &ctype->class_collection_max,
1578 &ctype->class_collection_act, last_wch) |= class_bit;
1580 if (handle_digits == 1)
1582 /* We must store the digit values. */
1583 if (ctype->mbdigits_act == ctype->mbdigits_max)
1585 ctype->mbdigits_max *= 2;
1586 ctype->mbdigits = xrealloc (ctype->mbdigits,
1587 (ctype->mbdigits_max
1588 * sizeof (char *)));
1589 ctype->wcdigits_max *= 2;
1590 ctype->wcdigits = xrealloc (ctype->wcdigits,
1591 (ctype->wcdigits_max
1592 * sizeof (uint32_t)));
1595 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1596 ? seq : NULL);
1597 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1599 else if (handle_digits == 2)
1601 /* We must store the digit values. */
1602 if (ctype->outdigits_act >= 10)
1604 lr_error (ldfile, _("\
1605 %s: field `%s' does not contain exactly ten entries"),
1606 "LC_CTYPE", "outdigit");
1607 return;
1610 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1611 ? seq : NULL);
1612 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1613 ++ctype->outdigits_act;
1619 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1620 static void
1621 charclass_charcode_ellipsis (struct linereader *ldfile,
1622 struct locale_ctype_t *ctype,
1623 const struct charmap_t *charmap,
1624 struct repertoire_t *repertoire,
1625 struct token *now, char *last_charcode,
1626 uint32_t last_charcode_len,
1627 unsigned long int class256_bit,
1628 unsigned long int class_bit, int ignore_content,
1629 int handle_digits)
1631 /* First check whether the to-value is larger. */
1632 if (now->val.charcode.nbytes != last_charcode_len)
1634 lr_error (ldfile, _("\
1635 start and end character sequence of range must have the same length"));
1636 return;
1639 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1641 lr_error (ldfile, _("\
1642 to-value character sequence is smaller than from-value sequence"));
1643 return;
1646 if (!ignore_content)
1650 /* Increment the byte sequence value. */
1651 struct charseq *seq;
1652 uint32_t wch;
1653 int i;
1655 for (i = last_charcode_len - 1; i >= 0; --i)
1656 if (++last_charcode[i] != 0)
1657 break;
1659 if (last_charcode_len == 1)
1660 /* Of course we have the charcode value. */
1661 ctype->class256_collection[(size_t) last_charcode[0]]
1662 |= class256_bit;
1664 /* Find the symbolic name. */
1665 seq = charmap_find_symbol (charmap, last_charcode,
1666 last_charcode_len);
1667 if (seq != NULL)
1669 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1670 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1671 strlen (seq->name));
1672 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1674 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1675 *find_idx (ctype, &ctype->class_collection,
1676 &ctype->class_collection_max,
1677 &ctype->class_collection_act, wch) |= class_bit;
1679 else
1680 wch = ILLEGAL_CHAR_VALUE;
1682 if (handle_digits == 1)
1684 /* We must store the digit values. */
1685 if (ctype->mbdigits_act == ctype->mbdigits_max)
1687 ctype->mbdigits_max *= 2;
1688 ctype->mbdigits = xrealloc (ctype->mbdigits,
1689 (ctype->mbdigits_max
1690 * sizeof (char *)));
1691 ctype->wcdigits_max *= 2;
1692 ctype->wcdigits = xrealloc (ctype->wcdigits,
1693 (ctype->wcdigits_max
1694 * sizeof (uint32_t)));
1697 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1698 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1699 seq->nbytes = last_charcode_len;
1701 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1702 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1704 else if (handle_digits == 2)
1706 struct charseq *seq;
1707 /* We must store the digit values. */
1708 if (ctype->outdigits_act >= 10)
1710 lr_error (ldfile, _("\
1711 %s: field `%s' does not contain exactly ten entries"),
1712 "LC_CTYPE", "outdigit");
1713 return;
1716 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1717 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1718 seq->nbytes = last_charcode_len;
1720 ctype->mboutdigits[ctype->outdigits_act] = seq;
1721 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1722 ++ctype->outdigits_act;
1725 while (memcmp (last_charcode, now->val.charcode.bytes,
1726 last_charcode_len) != 0);
1731 static uint32_t *
1732 find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1733 uint32_t wch)
1735 struct translit_t *trunp = ctype->translit;
1736 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1738 while (trunp != NULL)
1740 /* XXX We simplify things here. The transliterations we look
1741 for are only allowed to have one character. */
1742 if (trunp->from[0] == wch && trunp->from[1] == 0)
1744 /* Found it. Now look for a transliteration which can be
1745 represented with the character set. */
1746 struct translit_to_t *torunp = trunp->to;
1748 while (torunp != NULL)
1750 int i;
1752 for (i = 0; torunp->str[i] != 0; ++i)
1754 char utmp[10];
1756 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1757 if (charmap_find_value (charmap, utmp, 9) == NULL)
1758 /* This character cannot be represented. */
1759 break;
1762 if (torunp->str[i] == 0)
1763 return torunp->str;
1765 torunp = torunp->next;
1768 break;
1771 trunp = trunp->next;
1774 /* Check for ignored chars. */
1775 while (tirunp != NULL)
1777 if (tirunp->from <= wch && tirunp->to >= wch)
1779 uint32_t wi;
1781 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1782 if (wi == wch)
1783 return no_str;
1787 /* Nothing found. */
1788 return NULL;
1792 uint32_t *
1793 find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1794 uint32_t wch)
1796 struct locale_ctype_t *ctype;
1797 uint32_t *result = NULL;
1799 assert (locale != NULL);
1800 ctype = locale->categories[LC_CTYPE].ctype;
1802 if (ctype == NULL)
1803 return NULL;
1805 if (ctype->translit != NULL)
1806 result = find_translit2 (ctype, charmap, wch);
1808 if (result == NULL)
1810 struct translit_include_t *irunp = ctype->translit_include;
1812 while (irunp != NULL && result == NULL)
1814 result = find_translit (find_locale (CTYPE_LOCALE,
1815 irunp->copy_locale,
1816 irunp->copy_repertoire,
1817 charmap),
1818 charmap, wch);
1819 irunp = irunp->next;
1823 return result;
1827 /* Read one transliteration entry. */
1828 static uint32_t *
1829 read_widestring (struct linereader *ldfile, struct token *now,
1830 const struct charmap_t *charmap,
1831 struct repertoire_t *repertoire)
1833 uint32_t *wstr;
1835 if (now->tok == tok_default_missing)
1836 /* The special name "" will denote this case. */
1837 wstr = no_str;
1838 else if (now->tok == tok_bsymbol)
1840 /* Get the value from the repertoire. */
1841 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1842 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1843 now->val.str.lenmb);
1844 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1846 /* We cannot proceed, we don't know the UCS4 value. */
1847 free (wstr);
1848 return NULL;
1851 wstr[1] = 0;
1853 else if (now->tok == tok_ucs4)
1855 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1856 wstr[0] = now->val.ucs4;
1857 wstr[1] = 0;
1859 else if (now->tok == tok_charcode)
1861 /* Argh, we have to convert to the symbol name first and then to the
1862 UCS4 value. */
1863 struct charseq *seq = charmap_find_symbol (charmap,
1864 now->val.str.startmb,
1865 now->val.str.lenmb);
1866 if (seq == NULL)
1867 /* Cannot find the UCS4 value. */
1868 return NULL;
1870 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1871 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1872 strlen (seq->name));
1873 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1874 /* We cannot proceed, we don't know the UCS4 value. */
1875 return NULL;
1877 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1878 wstr[0] = seq->ucs4;
1879 wstr[1] = 0;
1881 else if (now->tok == tok_string)
1883 wstr = now->val.str.startwc;
1884 if (wstr == NULL || wstr[0] == 0)
1885 return NULL;
1887 else
1889 if (now->tok != tok_eol && now->tok != tok_eof)
1890 lr_ignore_rest (ldfile, 0);
1891 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1892 return (uint32_t *) -1l;
1895 return wstr;
1899 static void
1900 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1901 struct token *now, const struct charmap_t *charmap,
1902 struct repertoire_t *repertoire)
1904 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1905 struct translit_t *result;
1906 struct translit_to_t **top;
1907 struct obstack *ob = &ctype->mempool;
1908 int first;
1909 int ignore;
1911 if (from_wstr == NULL)
1912 /* There is no valid from string. */
1913 return;
1915 result = (struct translit_t *) obstack_alloc (ob,
1916 sizeof (struct translit_t));
1917 result->from = from_wstr;
1918 result->fname = ldfile->fname;
1919 result->lineno = ldfile->lineno;
1920 result->next = NULL;
1921 result->to = NULL;
1922 top = &result->to;
1923 first = 1;
1924 ignore = 0;
1926 while (1)
1928 uint32_t *to_wstr;
1930 /* Next we have one or more transliterations. They are
1931 separated by semicolons. */
1932 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1934 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1936 /* One string read. */
1937 const uint32_t zero = 0;
1939 if (!ignore)
1941 obstack_grow (ob, &zero, 4);
1942 to_wstr = obstack_finish (ob);
1944 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1945 (*top)->str = to_wstr;
1946 (*top)->next = NULL;
1949 if (now->tok == tok_eol)
1951 result->next = ctype->translit;
1952 ctype->translit = result;
1953 return;
1956 if (!ignore)
1957 top = &(*top)->next;
1958 ignore = 0;
1960 else
1962 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1963 if (to_wstr == (uint32_t *) -1l)
1965 /* An error occurred. */
1966 obstack_free (ob, result);
1967 return;
1970 if (to_wstr == NULL)
1971 ignore = 1;
1972 else
1973 /* This value is usable. */
1974 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1976 first = 0;
1982 static void
1983 read_translit_ignore_entry (struct linereader *ldfile,
1984 struct locale_ctype_t *ctype,
1985 const struct charmap_t *charmap,
1986 struct repertoire_t *repertoire)
1988 /* We expect a semicolon-separated list of characters we ignore. We are
1989 only interested in the wide character definitions. These must be
1990 single characters, possibly defining a range when an ellipsis is used. */
1991 while (1)
1993 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
1994 verbose);
1995 struct translit_ignore_t *newp;
1996 uint32_t from;
1998 if (now->tok == tok_eol || now->tok == tok_eof)
2000 lr_error (ldfile,
2001 _("premature end of `translit_ignore' definition"));
2002 return;
2005 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2007 lr_error (ldfile, _("syntax error"));
2008 lr_ignore_rest (ldfile, 0);
2009 return;
2012 if (now->tok == tok_ucs4)
2013 from = now->val.ucs4;
2014 else
2015 /* Try to get the value. */
2016 from = repertoire_find_value (repertoire, now->val.str.startmb,
2017 now->val.str.lenmb);
2019 if (from == ILLEGAL_CHAR_VALUE)
2021 lr_error (ldfile, "invalid character name");
2022 newp = NULL;
2024 else
2026 newp = (struct translit_ignore_t *)
2027 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
2028 newp->from = from;
2029 newp->to = from;
2030 newp->step = 1;
2032 newp->next = ctype->translit_ignore;
2033 ctype->translit_ignore = newp;
2036 /* Now we expect either a semicolon, an ellipsis, or the end of the
2037 line. */
2038 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2040 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
2042 /* XXX Should we bother implementing `....'? `...' certainly
2043 will not be implemented. */
2044 uint32_t to;
2045 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2047 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2049 if (now->tok == tok_eol || now->tok == tok_eof)
2051 lr_error (ldfile,
2052 _("premature end of `translit_ignore' definition"));
2053 return;
2056 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2058 lr_error (ldfile, _("syntax error"));
2059 lr_ignore_rest (ldfile, 0);
2060 return;
2063 if (now->tok == tok_ucs4)
2064 to = now->val.ucs4;
2065 else
2066 /* Try to get the value. */
2067 to = repertoire_find_value (repertoire, now->val.str.startmb,
2068 now->val.str.lenmb);
2070 if (to == ILLEGAL_CHAR_VALUE)
2071 lr_error (ldfile, "invalid character name");
2072 else
2074 /* Make sure the `to'-value is larger. */
2075 if (to >= from)
2077 newp->to = to;
2078 newp->step = step;
2080 else
2081 lr_error (ldfile, _("\
2082 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2083 (to | from) < 65536 ? 4 : 8, to,
2084 (to | from) < 65536 ? 4 : 8, from);
2087 /* And the next token. */
2088 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2091 if (now->tok == tok_eol || now->tok == tok_eof)
2092 /* We are done. */
2093 return;
2095 if (now->tok == tok_semicolon)
2096 /* Next round. */
2097 continue;
2099 /* If we come here something is wrong. */
2100 lr_error (ldfile, _("syntax error"));
2101 lr_ignore_rest (ldfile, 0);
2102 return;
2107 /* The parser for the LC_CTYPE section of the locale definition. */
2108 void
2109 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2110 const struct charmap_t *charmap, const char *repertoire_name,
2111 int ignore_content)
2113 struct repertoire_t *repertoire = NULL;
2114 struct locale_ctype_t *ctype;
2115 struct token *now;
2116 enum token_t nowtok;
2117 size_t cnt;
2118 uint32_t last_wch = 0;
2119 enum token_t last_token;
2120 enum token_t ellipsis_token;
2121 int step;
2122 char last_charcode[16];
2123 size_t last_charcode_len = 0;
2124 const char *last_str = NULL;
2125 int mapidx;
2126 struct localedef_t *copy_locale = NULL;
2128 /* Get the repertoire we have to use. */
2129 if (repertoire_name != NULL)
2130 repertoire = repertoire_read (repertoire_name);
2132 /* The rest of the line containing `LC_CTYPE' must be free. */
2133 lr_ignore_rest (ldfile, 1);
2138 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2139 nowtok = now->tok;
2141 while (nowtok == tok_eol);
2143 /* If we see `copy' now we are almost done. */
2144 if (nowtok == tok_copy)
2146 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2147 if (now->tok != tok_string)
2149 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2151 skip_category:
2153 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2154 while (now->tok != tok_eof && now->tok != tok_end);
2156 if (now->tok != tok_eof
2157 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2158 now->tok == tok_eof))
2159 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2160 else if (now->tok != tok_lc_ctype)
2162 lr_error (ldfile, _("\
2163 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2164 lr_ignore_rest (ldfile, 0);
2166 else
2167 lr_ignore_rest (ldfile, 1);
2169 return;
2172 if (! ignore_content)
2174 /* Get the locale definition. */
2175 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2176 repertoire_name, charmap, NULL);
2177 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2179 /* Not yet loaded. So do it now. */
2180 if (locfile_read (copy_locale, charmap) != 0)
2181 goto skip_category;
2184 if (copy_locale->categories[LC_CTYPE].ctype == NULL)
2185 return;
2188 lr_ignore_rest (ldfile, 1);
2190 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2191 nowtok = now->tok;
2194 /* Prepare the data structures. */
2195 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2196 ctype = result->categories[LC_CTYPE].ctype;
2198 /* Remember the repertoire we use. */
2199 if (!ignore_content)
2200 ctype->repertoire = repertoire;
2202 while (1)
2204 unsigned long int class_bit = 0;
2205 unsigned long int class256_bit = 0;
2206 int handle_digits = 0;
2208 /* Of course we don't proceed beyond the end of file. */
2209 if (nowtok == tok_eof)
2210 break;
2212 /* Ignore empty lines. */
2213 if (nowtok == tok_eol)
2215 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2216 nowtok = now->tok;
2217 continue;
2220 switch (nowtok)
2222 case tok_charclass:
2223 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2224 while (now->tok == tok_ident || now->tok == tok_string)
2226 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2227 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2228 if (now->tok != tok_semicolon)
2229 break;
2230 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2232 if (now->tok != tok_eol)
2233 SYNTAX_ERROR (_("\
2234 %s: syntax error in definition of new character class"), "LC_CTYPE");
2235 break;
2237 case tok_charconv:
2238 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2239 while (now->tok == tok_ident || now->tok == tok_string)
2241 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2242 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2243 if (now->tok != tok_semicolon)
2244 break;
2245 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2247 if (now->tok != tok_eol)
2248 SYNTAX_ERROR (_("\
2249 %s: syntax error in definition of new character map"), "LC_CTYPE");
2250 break;
2252 case tok_class:
2253 /* Ignore the rest of the line if we don't need the input of
2254 this line. */
2255 if (ignore_content)
2257 lr_ignore_rest (ldfile, 0);
2258 break;
2261 /* We simply forget the `class' keyword and use the following
2262 operand to determine the bit. */
2263 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2264 if (now->tok == tok_ident || now->tok == tok_string)
2266 /* Must be one of the predefined class names. */
2267 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2268 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2269 break;
2270 if (cnt >= ctype->nr_charclass)
2272 #ifdef PREDEFINED_CLASSES
2273 if (now->val.str.lenmb == 8
2274 && memcmp ("special1", now->val.str.startmb, 8) == 0)
2275 class_bit = _ISwspecial1;
2276 else if (now->val.str.lenmb == 8
2277 && memcmp ("special2", now->val.str.startmb, 8) == 0)
2278 class_bit = _ISwspecial2;
2279 else if (now->val.str.lenmb == 8
2280 && memcmp ("special3", now->val.str.startmb, 8) == 0)
2281 class_bit = _ISwspecial3;
2282 else
2283 #endif
2285 /* OK, it's a new class. */
2286 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2288 class_bit = _ISwbit (ctype->nr_charclass - 1);
2291 else
2293 class_bit = _ISwbit (cnt);
2295 free (now->val.str.startmb);
2298 else if (now->tok == tok_digit)
2299 goto handle_tok_digit;
2300 else if (now->tok < tok_upper || now->tok > tok_blank)
2301 goto err_label;
2302 else
2304 class_bit = BITw (now->tok);
2305 class256_bit = BIT (now->tok);
2308 /* The next character must be a semicolon. */
2309 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2310 if (now->tok != tok_semicolon)
2311 goto err_label;
2312 goto read_charclass;
2314 case tok_upper:
2315 case tok_lower:
2316 case tok_alpha:
2317 case tok_alnum:
2318 case tok_space:
2319 case tok_cntrl:
2320 case tok_punct:
2321 case tok_graph:
2322 case tok_print:
2323 case tok_xdigit:
2324 case tok_blank:
2325 /* Ignore the rest of the line if we don't need the input of
2326 this line. */
2327 if (ignore_content)
2329 lr_ignore_rest (ldfile, 0);
2330 break;
2333 class_bit = BITw (now->tok);
2334 class256_bit = BIT (now->tok);
2335 handle_digits = 0;
2336 read_charclass:
2337 ctype->class_done |= class_bit;
2338 last_token = tok_none;
2339 ellipsis_token = tok_none;
2340 step = 1;
2341 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2342 while (now->tok != tok_eol && now->tok != tok_eof)
2344 uint32_t wch;
2345 struct charseq *seq;
2347 if (ellipsis_token == tok_none)
2349 if (get_character (now, charmap, repertoire, &seq, &wch))
2350 goto err_label;
2352 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2353 /* Yep, we can store information about this byte
2354 sequence. */
2355 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2357 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2358 && class_bit != 0)
2359 /* We have the UCS4 position. */
2360 *find_idx (ctype, &ctype->class_collection,
2361 &ctype->class_collection_max,
2362 &ctype->class_collection_act, wch) |= class_bit;
2364 last_token = now->tok;
2365 /* Terminate the string. */
2366 if (last_token == tok_bsymbol)
2368 now->val.str.startmb[now->val.str.lenmb] = '\0';
2369 last_str = now->val.str.startmb;
2371 else
2372 last_str = NULL;
2373 last_wch = wch;
2374 memcpy (last_charcode, now->val.charcode.bytes, 16);
2375 last_charcode_len = now->val.charcode.nbytes;
2377 if (!ignore_content && handle_digits == 1)
2379 /* We must store the digit values. */
2380 if (ctype->mbdigits_act == ctype->mbdigits_max)
2382 ctype->mbdigits_max += 10;
2383 ctype->mbdigits = xrealloc (ctype->mbdigits,
2384 (ctype->mbdigits_max
2385 * sizeof (char *)));
2386 ctype->wcdigits_max += 10;
2387 ctype->wcdigits = xrealloc (ctype->wcdigits,
2388 (ctype->wcdigits_max
2389 * sizeof (uint32_t)));
2392 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2393 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2395 else if (!ignore_content && handle_digits == 2)
2397 /* We must store the digit values. */
2398 if (ctype->outdigits_act >= 10)
2400 lr_error (ldfile, _("\
2401 %s: field `%s' does not contain exactly ten entries"),
2402 "LC_CTYPE", "outdigit");
2403 lr_ignore_rest (ldfile, 0);
2404 break;
2407 ctype->mboutdigits[ctype->outdigits_act] = seq;
2408 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2409 ++ctype->outdigits_act;
2412 else
2414 /* Now it gets complicated. We have to resolve the
2415 ellipsis problem. First we must distinguish between
2416 the different kind of ellipsis and this must match the
2417 tokens we have seen. */
2418 assert (last_token != tok_none);
2420 if (last_token != now->tok)
2422 lr_error (ldfile, _("\
2423 ellipsis range must be marked by two operands of same type"));
2424 lr_ignore_rest (ldfile, 0);
2425 break;
2428 if (last_token == tok_bsymbol)
2430 if (ellipsis_token == tok_ellipsis3)
2431 lr_error (ldfile, _("with symbolic name range values \
2432 the absolute ellipsis `...' must not be used"));
2434 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2435 repertoire, now, last_str,
2436 class256_bit, class_bit,
2437 (ellipsis_token
2438 == tok_ellipsis4
2439 ? 10 : 16),
2440 ignore_content,
2441 handle_digits, step);
2443 else if (last_token == tok_ucs4)
2445 if (ellipsis_token != tok_ellipsis2)
2446 lr_error (ldfile, _("\
2447 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2449 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2450 repertoire, now, last_wch,
2451 class256_bit, class_bit,
2452 ignore_content, handle_digits,
2453 step);
2455 else
2457 assert (last_token == tok_charcode);
2459 if (ellipsis_token != tok_ellipsis3)
2460 lr_error (ldfile, _("\
2461 with character code range values one must use the absolute ellipsis `...'"));
2463 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2464 repertoire, now,
2465 last_charcode,
2466 last_charcode_len,
2467 class256_bit, class_bit,
2468 ignore_content,
2469 handle_digits);
2472 /* Now we have used the last value. */
2473 last_token = tok_none;
2476 /* Next we expect a semicolon or the end of the line. */
2477 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2478 if (now->tok == tok_eol || now->tok == tok_eof)
2479 break;
2481 if (last_token != tok_none
2482 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2484 if (now->tok == tok_ellipsis2_2)
2486 now->tok = tok_ellipsis2;
2487 step = 2;
2489 else if (now->tok == tok_ellipsis4_2)
2491 now->tok = tok_ellipsis4;
2492 step = 2;
2495 ellipsis_token = now->tok;
2497 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2498 continue;
2501 if (now->tok != tok_semicolon)
2502 goto err_label;
2504 /* And get the next character. */
2505 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2507 ellipsis_token = tok_none;
2508 step = 1;
2510 break;
2512 case tok_digit:
2513 /* Ignore the rest of the line if we don't need the input of
2514 this line. */
2515 if (ignore_content)
2517 lr_ignore_rest (ldfile, 0);
2518 break;
2521 handle_tok_digit:
2522 class_bit = _ISwdigit;
2523 class256_bit = _ISdigit;
2524 handle_digits = 1;
2525 goto read_charclass;
2527 case tok_outdigit:
2528 /* Ignore the rest of the line if we don't need the input of
2529 this line. */
2530 if (ignore_content)
2532 lr_ignore_rest (ldfile, 0);
2533 break;
2536 if (ctype->outdigits_act != 0)
2537 lr_error (ldfile, _("\
2538 %s: field `%s' declared more than once"),
2539 "LC_CTYPE", "outdigit");
2540 class_bit = 0;
2541 class256_bit = 0;
2542 handle_digits = 2;
2543 goto read_charclass;
2545 case tok_toupper:
2546 /* Ignore the rest of the line if we don't need the input of
2547 this line. */
2548 if (ignore_content)
2550 lr_ignore_rest (ldfile, 0);
2551 break;
2554 mapidx = 0;
2555 goto read_mapping;
2557 case tok_tolower:
2558 /* Ignore the rest of the line if we don't need the input of
2559 this line. */
2560 if (ignore_content)
2562 lr_ignore_rest (ldfile, 0);
2563 break;
2566 mapidx = 1;
2567 goto read_mapping;
2569 case tok_map:
2570 /* Ignore the rest of the line if we don't need the input of
2571 this line. */
2572 if (ignore_content)
2574 lr_ignore_rest (ldfile, 0);
2575 break;
2578 /* We simply forget the `map' keyword and use the following
2579 operand to determine the mapping. */
2580 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2581 if (now->tok == tok_ident || now->tok == tok_string)
2583 size_t cnt;
2585 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2586 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2587 break;
2589 if (cnt < ctype->map_collection_nr)
2590 free (now->val.str.startmb);
2591 else
2592 /* OK, it's a new map. */
2593 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2595 mapidx = cnt;
2597 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2598 goto err_label;
2599 else
2600 mapidx = now->tok - tok_toupper;
2602 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2603 /* This better should be a semicolon. */
2604 if (now->tok != tok_semicolon)
2605 goto err_label;
2607 read_mapping:
2608 /* Test whether this mapping was already defined. */
2609 if (ctype->tomap_done[mapidx])
2611 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2612 ctype->mapnames[mapidx]);
2613 lr_ignore_rest (ldfile, 0);
2614 break;
2616 ctype->tomap_done[mapidx] = 1;
2618 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2619 while (now->tok != tok_eol && now->tok != tok_eof)
2621 struct charseq *from_seq;
2622 uint32_t from_wch;
2623 struct charseq *to_seq;
2624 uint32_t to_wch;
2626 /* Every pair starts with an opening brace. */
2627 if (now->tok != tok_open_brace)
2628 goto err_label;
2630 /* Next comes the from-value. */
2631 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2632 if (get_character (now, charmap, repertoire, &from_seq,
2633 &from_wch) != 0)
2634 goto err_label;
2636 /* The next is a comma. */
2637 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2638 if (now->tok != tok_comma)
2639 goto err_label;
2641 /* And the other value. */
2642 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2643 if (get_character (now, charmap, repertoire, &to_seq,
2644 &to_wch) != 0)
2645 goto err_label;
2647 /* And the last thing is the closing brace. */
2648 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2649 if (now->tok != tok_close_brace)
2650 goto err_label;
2652 if (!ignore_content)
2654 /* Check whether the mapping converts from an ASCII value
2655 to a non-ASCII value. */
2656 if (from_seq != NULL && from_seq->nbytes == 1
2657 && isascii (from_seq->bytes[0])
2658 && to_seq != NULL && (to_seq->nbytes != 1
2659 || !isascii (to_seq->bytes[0])))
2660 ctype->to_nonascii = 1;
2662 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2663 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2664 /* We can use this value. */
2665 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2666 = to_seq->bytes[0];
2668 if (from_wch != ILLEGAL_CHAR_VALUE
2669 && to_wch != ILLEGAL_CHAR_VALUE)
2670 /* Both correct values. */
2671 *find_idx (ctype, &ctype->map_collection[mapidx],
2672 &ctype->map_collection_max[mapidx],
2673 &ctype->map_collection_act[mapidx],
2674 from_wch) = to_wch;
2677 /* Now comes a semicolon or the end of the line/file. */
2678 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2679 if (now->tok == tok_semicolon)
2680 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2682 break;
2684 case tok_translit_start:
2685 /* Ignore the entire translit section with its peculiar syntax
2686 if we don't need the input. */
2687 if (ignore_content)
2691 lr_ignore_rest (ldfile, 0);
2692 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2694 while (now->tok != tok_translit_end && now->tok != tok_eof);
2696 if (now->tok == tok_eof)
2697 lr_error (ldfile, _(\
2698 "%s: `translit_start' section does not end with `translit_end'"),
2699 "LC_CTYPE");
2701 break;
2704 /* The rest of the line better should be empty. */
2705 lr_ignore_rest (ldfile, 1);
2707 /* We count here the number of allocated entries in the `translit'
2708 array. */
2709 cnt = 0;
2711 ldfile->translate_strings = 1;
2712 ldfile->return_widestr = 1;
2714 /* We proceed until we see the `translit_end' token. */
2715 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2716 now->tok != tok_translit_end && now->tok != tok_eof)
2718 if (now->tok == tok_eol)
2719 /* Ignore empty lines. */
2720 continue;
2722 if (now->tok == tok_include)
2724 /* We have to include locale. */
2725 const char *locale_name;
2726 const char *repertoire_name;
2727 struct translit_include_t *include_stmt, **include_ptr;
2729 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2730 /* This should be a string or an identifier. In any
2731 case something to name a locale. */
2732 if (now->tok != tok_string && now->tok != tok_ident)
2734 translit_syntax:
2735 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2736 lr_ignore_rest (ldfile, 0);
2737 continue;
2739 locale_name = now->val.str.startmb;
2741 /* Next should be a semicolon. */
2742 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2743 if (now->tok != tok_semicolon)
2744 goto translit_syntax;
2746 /* Now the repertoire name. */
2747 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2748 if ((now->tok != tok_string && now->tok != tok_ident)
2749 || now->val.str.startmb == NULL)
2750 goto translit_syntax;
2751 repertoire_name = now->val.str.startmb;
2752 if (repertoire_name[0] == '\0')
2753 /* Ignore the empty string. */
2754 repertoire_name = NULL;
2756 /* Save the include statement for later processing. */
2757 include_stmt = (struct translit_include_t *)
2758 xmalloc (sizeof (struct translit_include_t));
2759 include_stmt->copy_locale = locale_name;
2760 include_stmt->copy_repertoire = repertoire_name;
2761 include_stmt->next = NULL;
2763 include_ptr = &ctype->translit_include;
2764 while (*include_ptr != NULL)
2765 include_ptr = &(*include_ptr)->next;
2766 *include_ptr = include_stmt;
2768 /* The rest of the line must be empty. */
2769 lr_ignore_rest (ldfile, 1);
2771 /* Make sure the locale is read. */
2772 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2773 1, NULL);
2774 continue;
2776 else if (now->tok == tok_default_missing)
2778 uint32_t *wstr;
2780 while (1)
2782 /* We expect a single character or string as the
2783 argument. */
2784 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2785 wstr = read_widestring (ldfile, now, charmap,
2786 repertoire);
2788 if (wstr != NULL)
2790 if (ctype->default_missing != NULL)
2792 lr_error (ldfile, _("\
2793 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2794 WITH_CUR_LOCALE (error_at_line (0, 0,
2795 ctype->default_missing_file,
2796 ctype->default_missing_lineno,
2797 _("\
2798 previous definition was here")));
2800 else
2802 ctype->default_missing = wstr;
2803 ctype->default_missing_file = ldfile->fname;
2804 ctype->default_missing_lineno = ldfile->lineno;
2806 /* We can have more entries, ignore them. */
2807 lr_ignore_rest (ldfile, 0);
2808 break;
2810 else if (wstr == (uint32_t *) -1l)
2811 /* This was an syntax error. */
2812 break;
2814 /* Maybe there is another replacement we can use. */
2815 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2816 if (now->tok == tok_eol || now->tok == tok_eof)
2818 /* Nothing found. We tell the user. */
2819 lr_error (ldfile, _("\
2820 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2821 break;
2823 if (now->tok != tok_semicolon)
2824 goto translit_syntax;
2827 continue;
2829 else if (now->tok == tok_translit_ignore)
2831 read_translit_ignore_entry (ldfile, ctype, charmap,
2832 repertoire);
2833 continue;
2836 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2838 ldfile->return_widestr = 0;
2840 if (now->tok == tok_eof)
2841 lr_error (ldfile, _(\
2842 "%s: `translit_start' section does not end with `translit_end'"),
2843 "LC_CTYPE");
2845 break;
2847 case tok_ident:
2848 /* Ignore the rest of the line if we don't need the input of
2849 this line. */
2850 if (ignore_content)
2852 lr_ignore_rest (ldfile, 0);
2853 break;
2856 /* This could mean one of several things. First test whether
2857 it's a character class name. */
2858 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2859 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2860 break;
2861 if (cnt < ctype->nr_charclass)
2863 class_bit = _ISwbit (cnt);
2864 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2865 free (now->val.str.startmb);
2866 goto read_charclass;
2868 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2869 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2870 break;
2871 if (cnt < ctype->map_collection_nr)
2873 mapidx = cnt;
2874 free (now->val.str.startmb);
2875 goto read_mapping;
2877 #ifdef PREDEFINED_CLASSES
2878 if (strcmp (now->val.str.startmb, "special1") == 0)
2880 class_bit = _ISwspecial1;
2881 free (now->val.str.startmb);
2882 goto read_charclass;
2884 if (strcmp (now->val.str.startmb, "special2") == 0)
2886 class_bit = _ISwspecial2;
2887 free (now->val.str.startmb);
2888 goto read_charclass;
2890 if (strcmp (now->val.str.startmb, "special3") == 0)
2892 class_bit = _ISwspecial3;
2893 free (now->val.str.startmb);
2894 goto read_charclass;
2896 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2898 mapidx = 2;
2899 goto read_mapping;
2901 #endif
2902 break;
2904 case tok_end:
2905 /* Next we assume `LC_CTYPE'. */
2906 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2907 if (now->tok == tok_eof)
2908 break;
2909 if (now->tok == tok_eol)
2910 lr_error (ldfile, _("%s: incomplete `END' line"),
2911 "LC_CTYPE");
2912 else if (now->tok != tok_lc_ctype)
2913 lr_error (ldfile, _("\
2914 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2915 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2916 return;
2918 default:
2919 err_label:
2920 if (now->tok != tok_eof)
2921 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2924 /* Prepare for the next round. */
2925 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2926 nowtok = now->tok;
2929 /* When we come here we reached the end of the file. */
2930 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2934 /* Subroutine of set_class_defaults, below. */
2935 static void
2936 set_one_default (struct locale_ctype_t *ctype,
2937 const struct charmap_t *charmap,
2938 int bitpos, int from, int to)
2940 char tmp[2];
2941 int ch;
2942 int bit = _ISbit (bitpos);
2943 int bitw = _ISwbit (bitpos);
2944 /* Define string. */
2945 strcpy (tmp, "?");
2947 for (ch = from; ch <= to; ++ch)
2949 struct charseq *seq;
2950 tmp[0] = ch;
2952 seq = charmap_find_value (charmap, tmp, 1);
2953 if (seq == NULL)
2955 char buf[10];
2956 sprintf (buf, "U%08X", ch);
2957 seq = charmap_find_value (charmap, buf, 9);
2959 if (seq == NULL)
2961 if (!be_quiet)
2962 WITH_CUR_LOCALE (error (0, 0, _("\
2963 %s: character `%s' not defined while needed as default value"),
2964 "LC_CTYPE", tmp));
2966 else if (seq->nbytes != 1)
2967 WITH_CUR_LOCALE (error (0, 0, _("\
2968 %s: character `%s' in charmap not representable with one byte"),
2969 "LC_CTYPE", tmp));
2970 else
2971 ctype->class256_collection[seq->bytes[0]] |= bit;
2973 /* No need to search here, the ASCII value is also the Unicode
2974 value. */
2975 ELEM (ctype, class_collection, , ch) |= bitw;
2979 static void
2980 set_class_defaults (struct locale_ctype_t *ctype,
2981 const struct charmap_t *charmap,
2982 struct repertoire_t *repertoire)
2984 #define set_default(bitpos, from, to) \
2985 set_one_default (ctype, charmap, bitpos, from, to)
2987 /* These function defines the default values for the classes and conversions
2988 according to POSIX.2 2.5.2.1.
2989 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2990 Don't move them unless you know what you do! */
2992 /* Set default values if keyword was not present. */
2993 if ((ctype->class_done & BITw (tok_upper)) == 0)
2994 /* "If this keyword [lower] is not specified, the lowercase letters
2995 `A' through `Z', ..., shall automatically belong to this class,
2996 with implementation defined character values." [P1003.2, 2.5.2.1] */
2997 set_default (BITPOS (tok_upper), 'A', 'Z');
2999 if ((ctype->class_done & BITw (tok_lower)) == 0)
3000 /* "If this keyword [lower] is not specified, the lowercase letters
3001 `a' through `z', ..., shall automatically belong to this class,
3002 with implementation defined character values." [P1003.2, 2.5.2.1] */
3003 set_default (BITPOS (tok_lower), 'a', 'z');
3005 if ((ctype->class_done & BITw (tok_alpha)) == 0)
3007 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3008 class `lower' *must* be in class `alpha'. */
3009 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
3010 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
3012 for (size_t cnt = 0; cnt < 256; ++cnt)
3013 if ((ctype->class256_collection[cnt] & mask) != 0)
3014 ctype->class256_collection[cnt] |= BIT (tok_alpha);
3016 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3017 if ((ctype->class_collection[cnt] & maskw) != 0)
3018 ctype->class_collection[cnt] |= BITw (tok_alpha);
3021 if ((ctype->class_done & BITw (tok_digit)) == 0)
3022 /* "If this keyword [digit] is not specified, the digits `0' through
3023 `9', ..., shall automatically belong to this class, with
3024 implementation-defined character values." [P1003.2, 2.5.2.1] */
3025 set_default (BITPOS (tok_digit), '0', '9');
3027 /* "Only characters specified for the `alpha' and `digit' keyword
3028 shall be specified. Characters specified for the keyword `alpha'
3029 and `digit' are automatically included in this class. */
3031 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
3032 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
3034 for (size_t cnt = 0; cnt < 256; ++cnt)
3035 if ((ctype->class256_collection[cnt] & mask) != 0)
3036 ctype->class256_collection[cnt] |= BIT (tok_alnum);
3038 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3039 if ((ctype->class_collection[cnt] & maskw) != 0)
3040 ctype->class_collection[cnt] |= BITw (tok_alnum);
3043 if ((ctype->class_done & BITw (tok_space)) == 0)
3044 /* "If this keyword [space] is not specified, the characters <space>,
3045 <form-feed>, <newline>, <carriage-return>, <tab>, and
3046 <vertical-tab>, ..., shall automatically belong to this class,
3047 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3049 struct charseq *seq;
3051 seq = charmap_find_value (charmap, "space", 5);
3052 if (seq == NULL)
3053 seq = charmap_find_value (charmap, "SP", 2);
3054 if (seq == NULL)
3055 seq = charmap_find_value (charmap, "U00000020", 9);
3056 if (seq == NULL)
3058 if (!be_quiet)
3059 WITH_CUR_LOCALE (error (0, 0, _("\
3060 %s: character `%s' not defined while needed as default value"),
3061 "LC_CTYPE", "<space>"));
3063 else if (seq->nbytes != 1)
3064 WITH_CUR_LOCALE (error (0, 0, _("\
3065 %s: character `%s' in charmap not representable with one byte"),
3066 "LC_CTYPE", "<space>"));
3067 else
3068 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3070 /* No need to search. */
3071 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
3073 seq = charmap_find_value (charmap, "form-feed", 9);
3074 if (seq == NULL)
3075 seq = charmap_find_value (charmap, "U0000000C", 9);
3076 if (seq == NULL)
3078 if (!be_quiet)
3079 WITH_CUR_LOCALE (error (0, 0, _("\
3080 %s: character `%s' not defined while needed as default value"),
3081 "LC_CTYPE", "<form-feed>"));
3083 else if (seq->nbytes != 1)
3084 WITH_CUR_LOCALE (error (0, 0, _("\
3085 %s: character `%s' in charmap not representable with one byte"),
3086 "LC_CTYPE", "<form-feed>"));
3087 else
3088 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3090 /* No need to search. */
3091 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3094 seq = charmap_find_value (charmap, "newline", 7);
3095 if (seq == NULL)
3096 seq = charmap_find_value (charmap, "U0000000A", 9);
3097 if (seq == NULL)
3099 if (!be_quiet)
3100 WITH_CUR_LOCALE (error (0, 0, _("\
3101 %s: character `%s' not defined while needed as default value"),
3102 "LC_CTYPE", "<newline>"));
3104 else if (seq->nbytes != 1)
3105 WITH_CUR_LOCALE (error (0, 0, _("\
3106 %s: character `%s' in charmap not representable with one byte"),
3107 "LC_CTYPE", "<newline>"));
3108 else
3109 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3111 /* No need to search. */
3112 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3115 seq = charmap_find_value (charmap, "carriage-return", 15);
3116 if (seq == NULL)
3117 seq = charmap_find_value (charmap, "U0000000D", 9);
3118 if (seq == NULL)
3120 if (!be_quiet)
3121 WITH_CUR_LOCALE (error (0, 0, _("\
3122 %s: character `%s' not defined while needed as default value"),
3123 "LC_CTYPE", "<carriage-return>"));
3125 else if (seq->nbytes != 1)
3126 WITH_CUR_LOCALE (error (0, 0, _("\
3127 %s: character `%s' in charmap not representable with one byte"),
3128 "LC_CTYPE", "<carriage-return>"));
3129 else
3130 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3132 /* No need to search. */
3133 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3136 seq = charmap_find_value (charmap, "tab", 3);
3137 if (seq == NULL)
3138 seq = charmap_find_value (charmap, "U00000009", 9);
3139 if (seq == NULL)
3141 if (!be_quiet)
3142 WITH_CUR_LOCALE (error (0, 0, _("\
3143 %s: character `%s' not defined while needed as default value"),
3144 "LC_CTYPE", "<tab>"));
3146 else if (seq->nbytes != 1)
3147 WITH_CUR_LOCALE (error (0, 0, _("\
3148 %s: character `%s' in charmap not representable with one byte"),
3149 "LC_CTYPE", "<tab>"));
3150 else
3151 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3153 /* No need to search. */
3154 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3157 seq = charmap_find_value (charmap, "vertical-tab", 12);
3158 if (seq == NULL)
3159 seq = charmap_find_value (charmap, "U0000000B", 9);
3160 if (seq == NULL)
3162 if (!be_quiet)
3163 WITH_CUR_LOCALE (error (0, 0, _("\
3164 %s: character `%s' not defined while needed as default value"),
3165 "LC_CTYPE", "<vertical-tab>"));
3167 else if (seq->nbytes != 1)
3168 WITH_CUR_LOCALE (error (0, 0, _("\
3169 %s: character `%s' in charmap not representable with one byte"),
3170 "LC_CTYPE", "<vertical-tab>"));
3171 else
3172 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3174 /* No need to search. */
3175 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3178 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3179 /* "If this keyword is not specified, the digits `0' to `9', the
3180 uppercase letters `A' through `F', and the lowercase letters `a'
3181 through `f', ..., shell automatically belong to this class, with
3182 implementation defined character values." [P1003.2, 2.5.2.1] */
3184 set_default (BITPOS (tok_xdigit), '0', '9');
3185 set_default (BITPOS (tok_xdigit), 'A', 'F');
3186 set_default (BITPOS (tok_xdigit), 'a', 'f');
3189 if ((ctype->class_done & BITw (tok_blank)) == 0)
3190 /* "If this keyword [blank] is unspecified, the characters <space> and
3191 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3193 struct charseq *seq;
3195 seq = charmap_find_value (charmap, "space", 5);
3196 if (seq == NULL)
3197 seq = charmap_find_value (charmap, "SP", 2);
3198 if (seq == NULL)
3199 seq = charmap_find_value (charmap, "U00000020", 9);
3200 if (seq == NULL)
3202 if (!be_quiet)
3203 WITH_CUR_LOCALE (error (0, 0, _("\
3204 %s: character `%s' not defined while needed as default value"),
3205 "LC_CTYPE", "<space>"));
3207 else if (seq->nbytes != 1)
3208 WITH_CUR_LOCALE (error (0, 0, _("\
3209 %s: character `%s' in charmap not representable with one byte"),
3210 "LC_CTYPE", "<space>"));
3211 else
3212 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3214 /* No need to search. */
3215 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3218 seq = charmap_find_value (charmap, "tab", 3);
3219 if (seq == NULL)
3220 seq = charmap_find_value (charmap, "U00000009", 9);
3221 if (seq == NULL)
3223 if (!be_quiet)
3224 WITH_CUR_LOCALE (error (0, 0, _("\
3225 %s: character `%s' not defined while needed as default value"),
3226 "LC_CTYPE", "<tab>"));
3228 else if (seq->nbytes != 1)
3229 WITH_CUR_LOCALE (error (0, 0, _("\
3230 %s: character `%s' in charmap not representable with one byte"),
3231 "LC_CTYPE", "<tab>"));
3232 else
3233 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3235 /* No need to search. */
3236 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3239 if ((ctype->class_done & BITw (tok_graph)) == 0)
3240 /* "If this keyword [graph] is not specified, characters specified for
3241 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3242 shall belong to this character class." [P1003.2, 2.5.2.1] */
3244 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3245 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3246 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3247 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3248 BITw (tok_punct);
3250 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3251 if ((ctype->class_collection[cnt] & maskw) != 0)
3252 ctype->class_collection[cnt] |= BITw (tok_graph);
3254 for (size_t cnt = 0; cnt < 256; ++cnt)
3255 if ((ctype->class256_collection[cnt] & mask) != 0)
3256 ctype->class256_collection[cnt] |= BIT (tok_graph);
3259 if ((ctype->class_done & BITw (tok_print)) == 0)
3260 /* "If this keyword [print] is not provided, characters specified for
3261 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3262 and the <space> character shall belong to this character class."
3263 [P1003.2, 2.5.2.1] */
3265 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3266 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3267 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3268 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3269 BITw (tok_punct);
3270 struct charseq *seq;
3272 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3273 if ((ctype->class_collection[cnt] & maskw) != 0)
3274 ctype->class_collection[cnt] |= BITw (tok_print);
3276 for (size_t cnt = 0; cnt < 256; ++cnt)
3277 if ((ctype->class256_collection[cnt] & mask) != 0)
3278 ctype->class256_collection[cnt] |= BIT (tok_print);
3281 seq = charmap_find_value (charmap, "space", 5);
3282 if (seq == NULL)
3283 seq = charmap_find_value (charmap, "SP", 2);
3284 if (seq == NULL)
3285 seq = charmap_find_value (charmap, "U00000020", 9);
3286 if (seq == NULL)
3288 if (!be_quiet)
3289 WITH_CUR_LOCALE (error (0, 0, _("\
3290 %s: character `%s' not defined while needed as default value"),
3291 "LC_CTYPE", "<space>"));
3293 else if (seq->nbytes != 1)
3294 WITH_CUR_LOCALE (error (0, 0, _("\
3295 %s: character `%s' in charmap not representable with one byte"),
3296 "LC_CTYPE", "<space>"));
3297 else
3298 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3300 /* No need to search. */
3301 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3304 if (ctype->tomap_done[0] == 0)
3305 /* "If this keyword [toupper] is not specified, the lowercase letters
3306 `a' through `z', and their corresponding uppercase letters `A' to
3307 `Z', ..., shall automatically be included, with implementation-
3308 defined character values." [P1003.2, 2.5.2.1] */
3310 char tmp[4];
3311 int ch;
3313 strcpy (tmp, "<?>");
3315 for (ch = 'a'; ch <= 'z'; ++ch)
3317 struct charseq *seq_from, *seq_to;
3319 tmp[1] = (char) ch;
3321 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3322 if (seq_from == NULL)
3324 char buf[10];
3325 sprintf (buf, "U%08X", ch);
3326 seq_from = charmap_find_value (charmap, buf, 9);
3328 if (seq_from == NULL)
3330 if (!be_quiet)
3331 WITH_CUR_LOCALE (error (0, 0, _("\
3332 %s: character `%s' not defined while needed as default value"),
3333 "LC_CTYPE", tmp));
3335 else if (seq_from->nbytes != 1)
3337 if (!be_quiet)
3338 WITH_CUR_LOCALE (error (0, 0, _("\
3339 %s: character `%s' needed as default value not representable with one byte"),
3340 "LC_CTYPE", tmp));
3342 else
3344 /* This conversion is implementation defined. */
3345 tmp[1] = (char) (ch + ('A' - 'a'));
3346 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3347 if (seq_to == NULL)
3349 char buf[10];
3350 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3351 seq_to = charmap_find_value (charmap, buf, 9);
3353 if (seq_to == NULL)
3355 if (!be_quiet)
3356 WITH_CUR_LOCALE (error (0, 0, _("\
3357 %s: character `%s' not defined while needed as default value"),
3358 "LC_CTYPE", tmp));
3360 else if (seq_to->nbytes != 1)
3362 if (!be_quiet)
3363 WITH_CUR_LOCALE (error (0, 0, _("\
3364 %s: character `%s' needed as default value not representable with one byte"),
3365 "LC_CTYPE", tmp));
3367 else
3368 /* The index [0] is determined by the order of the
3369 `ctype_map_newP' calls in `ctype_startup'. */
3370 ctype->map256_collection[0][seq_from->bytes[0]]
3371 = seq_to->bytes[0];
3374 /* No need to search. */
3375 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3379 if (ctype->tomap_done[1] == 0)
3380 /* "If this keyword [tolower] is not specified, the mapping shall be
3381 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3383 for (size_t cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3384 if (ctype->map_collection[0][cnt] != 0)
3385 ELEM (ctype, map_collection, [1],
3386 ctype->map_collection[0][cnt])
3387 = ctype->charnames[cnt];
3389 for (size_t cnt = 0; cnt < 256; ++cnt)
3390 if (ctype->map256_collection[0][cnt] != 0)
3391 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3394 if (ctype->outdigits_act != 10)
3396 if (ctype->outdigits_act != 0)
3397 WITH_CUR_LOCALE (error (0, 0, _("\
3398 %s: field `%s' does not contain exactly ten entries"),
3399 "LC_CTYPE", "outdigit"));
3401 for (size_t cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3403 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3404 (char *) digits + cnt,
3407 if (ctype->mboutdigits[cnt] == NULL)
3408 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3409 longnames[cnt],
3410 strlen (longnames[cnt]));
3412 if (ctype->mboutdigits[cnt] == NULL)
3413 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3414 uninames[cnt], 9);
3416 if (ctype->mboutdigits[cnt] == NULL)
3418 /* Provide a replacement. */
3419 WITH_CUR_LOCALE (error (0, 0, _("\
3420 no output digits defined and none of the standard names in the charmap")));
3422 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3423 sizeof (struct charseq)
3424 + 1);
3426 /* This is better than nothing. */
3427 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3428 ctype->mboutdigits[cnt]->nbytes = 1;
3431 ctype->wcoutdigits[cnt] = L'0' + cnt;
3434 ctype->outdigits_act = 10;
3437 #undef set_default
3441 /* Initialize. Assumes t->p and t->q have already been set. */
3442 static inline void
3443 wctype_table_init (struct wctype_table *t)
3445 t->level1 = NULL;
3446 t->level1_alloc = t->level1_size = 0;
3447 t->level2 = NULL;
3448 t->level2_alloc = t->level2_size = 0;
3449 t->level3 = NULL;
3450 t->level3_alloc = t->level3_size = 0;
3453 /* Retrieve an entry. */
3454 static inline int
3455 wctype_table_get (struct wctype_table *t, uint32_t wc)
3457 uint32_t index1 = wc >> (t->q + t->p + 5);
3458 if (index1 < t->level1_size)
3460 uint32_t lookup1 = t->level1[index1];
3461 if (lookup1 != EMPTY)
3463 uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1))
3464 + (lookup1 << t->q);
3465 uint32_t lookup2 = t->level2[index2];
3466 if (lookup2 != EMPTY)
3468 uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1))
3469 + (lookup2 << t->p);
3470 uint32_t lookup3 = t->level3[index3];
3471 uint32_t index4 = wc & 0x1f;
3473 return (lookup3 >> index4) & 1;
3477 return 0;
3480 /* Add one entry. */
3481 static void
3482 wctype_table_add (struct wctype_table *t, uint32_t wc)
3484 uint32_t index1 = wc >> (t->q + t->p + 5);
3485 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3486 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3487 uint32_t index4 = wc & 0x1f;
3488 size_t i, i1, i2;
3490 if (index1 >= t->level1_size)
3492 if (index1 >= t->level1_alloc)
3494 size_t alloc = 2 * t->level1_alloc;
3495 if (alloc <= index1)
3496 alloc = index1 + 1;
3497 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3498 alloc * sizeof (uint32_t));
3499 t->level1_alloc = alloc;
3501 while (index1 >= t->level1_size)
3502 t->level1[t->level1_size++] = EMPTY;
3505 if (t->level1[index1] == EMPTY)
3507 if (t->level2_size == t->level2_alloc)
3509 size_t alloc = 2 * t->level2_alloc + 1;
3510 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3511 (alloc << t->q) * sizeof (uint32_t));
3512 t->level2_alloc = alloc;
3514 i1 = t->level2_size << t->q;
3515 i2 = (t->level2_size + 1) << t->q;
3516 for (i = i1; i < i2; i++)
3517 t->level2[i] = EMPTY;
3518 t->level1[index1] = t->level2_size++;
3521 index2 += t->level1[index1] << t->q;
3523 if (t->level2[index2] == EMPTY)
3525 if (t->level3_size == t->level3_alloc)
3527 size_t alloc = 2 * t->level3_alloc + 1;
3528 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3529 (alloc << t->p) * sizeof (uint32_t));
3530 t->level3_alloc = alloc;
3532 i1 = t->level3_size << t->p;
3533 i2 = (t->level3_size + 1) << t->p;
3534 for (i = i1; i < i2; i++)
3535 t->level3[i] = 0;
3536 t->level2[index2] = t->level3_size++;
3539 index3 += t->level2[index2] << t->p;
3541 t->level3[index3] |= (uint32_t)1 << index4;
3544 /* Finalize and shrink. */
3545 static void
3546 add_locale_wctype_table (struct locale_file *file, struct wctype_table *t)
3548 size_t i, j, k;
3549 uint32_t reorder3[t->level3_size];
3550 uint32_t reorder2[t->level2_size];
3551 uint32_t level2_offset, level3_offset;
3553 /* Uniquify level3 blocks. */
3554 k = 0;
3555 for (j = 0; j < t->level3_size; j++)
3557 for (i = 0; i < k; i++)
3558 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3559 (1 << t->p) * sizeof (uint32_t)) == 0)
3560 break;
3561 /* Relocate block j to block i. */
3562 reorder3[j] = i;
3563 if (i == k)
3565 if (i != j)
3566 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3567 (1 << t->p) * sizeof (uint32_t));
3568 k++;
3571 t->level3_size = k;
3573 for (i = 0; i < (t->level2_size << t->q); i++)
3574 if (t->level2[i] != EMPTY)
3575 t->level2[i] = reorder3[t->level2[i]];
3577 /* Uniquify level2 blocks. */
3578 k = 0;
3579 for (j = 0; j < t->level2_size; j++)
3581 for (i = 0; i < k; i++)
3582 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3583 (1 << t->q) * sizeof (uint32_t)) == 0)
3584 break;
3585 /* Relocate block j to block i. */
3586 reorder2[j] = i;
3587 if (i == k)
3589 if (i != j)
3590 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3591 (1 << t->q) * sizeof (uint32_t));
3592 k++;
3595 t->level2_size = k;
3597 for (i = 0; i < t->level1_size; i++)
3598 if (t->level1[i] != EMPTY)
3599 t->level1[i] = reorder2[t->level1[i]];
3601 t->result_size =
3602 5 * sizeof (uint32_t)
3603 + t->level1_size * sizeof (uint32_t)
3604 + (t->level2_size << t->q) * sizeof (uint32_t)
3605 + (t->level3_size << t->p) * sizeof (uint32_t);
3607 level2_offset =
3608 5 * sizeof (uint32_t)
3609 + t->level1_size * sizeof (uint32_t);
3610 level3_offset =
3611 5 * sizeof (uint32_t)
3612 + t->level1_size * sizeof (uint32_t)
3613 + (t->level2_size << t->q) * sizeof (uint32_t);
3615 start_locale_structure (file);
3616 add_locale_uint32 (file, t->q + t->p + 5);
3617 add_locale_uint32 (file, t->level1_size);
3618 add_locale_uint32 (file, t->p + 5);
3619 add_locale_uint32 (file, (1 << t->q) - 1);
3620 add_locale_uint32 (file, (1 << t->p) - 1);
3622 for (i = 0; i < t->level1_size; i++)
3623 add_locale_uint32
3624 (file,
3625 t->level1[i] == EMPTY
3627 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3629 for (i = 0; i < (t->level2_size << t->q); i++)
3630 add_locale_uint32
3631 (file,
3632 t->level2[i] == EMPTY
3634 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3636 add_locale_uint32_array (file, t->level3, t->level3_size << t->p);
3637 end_locale_structure (file);
3639 if (t->level1_alloc > 0)
3640 free (t->level1);
3641 if (t->level2_alloc > 0)
3642 free (t->level2);
3643 if (t->level3_alloc > 0)
3644 free (t->level3);
3647 /* Flattens the included transliterations into a translit list.
3648 Inserts them in the list at `cursor', and returns the new cursor. */
3649 static struct translit_t **
3650 translit_flatten (struct locale_ctype_t *ctype,
3651 const struct charmap_t *charmap,
3652 struct translit_t **cursor)
3654 while (ctype->translit_include != NULL)
3656 const char *copy_locale = ctype->translit_include->copy_locale;
3657 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3658 struct localedef_t *other;
3660 /* Unchain the include statement. During the depth-first traversal
3661 we don't want to visit any locale more than once. */
3662 ctype->translit_include = ctype->translit_include->next;
3664 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3666 if (other == NULL || other->categories[LC_CTYPE].ctype == NULL)
3668 WITH_CUR_LOCALE (error (0, 0, _("\
3669 %s: transliteration data from locale `%s' not available"),
3670 "LC_CTYPE", copy_locale));
3672 else
3674 struct locale_ctype_t *other_ctype =
3675 other->categories[LC_CTYPE].ctype;
3677 cursor = translit_flatten (other_ctype, charmap, cursor);
3678 assert (other_ctype->translit_include == NULL);
3680 if (other_ctype->translit != NULL)
3682 /* Insert the other_ctype->translit list at *cursor. */
3683 struct translit_t *endp = other_ctype->translit;
3684 while (endp->next != NULL)
3685 endp = endp->next;
3687 endp->next = *cursor;
3688 *cursor = other_ctype->translit;
3690 /* Avoid any risk of circular lists. */
3691 other_ctype->translit = NULL;
3693 cursor = &endp->next;
3696 if (ctype->default_missing == NULL)
3697 ctype->default_missing = other_ctype->default_missing;
3701 return cursor;
/* Build the output tables of the LC_CTYPE category in CTYPE from the
   data collected so far: the byte-indexed class and map arrays, one
   three-level table per character class and per mapping, the character
   width table, and the sorted transliteration index and string arrays.
   CHARMAP supplies the character table, the width rules, the default
   width and mb_cur_max.
   NOTE(review): the REPERTOIRE parameter is not referenced in the body;
   the lookups below use ctype->repertoire instead.  */
3704 static void
3705 allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3706 struct repertoire_t *repertoire)
3708 size_t idx, nr;
3709 const void *key;
3710 size_t len;
3711 void *vdata;
3712 void *curs;
3714 /* You wonder about this amount of memory? This is only because some
3715 users do not manage to address the array with unsigned values or
3716 data types with range >= 256. '\200' would result in the array
3717 index -128. To help these poor people we duplicate the entries for
3718 128 up to 255 below the entry for \0. */
3719 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3720 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
3721 ctype->class_b = (uint32_t **)
3722 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3723 ctype->class_3level = (struct wctype_table *)
3724 xmalloc (ctype->nr_charclass * sizeof (struct wctype_table));
3726 /* This is the array accessed using the multibyte string elements. */
3727 for (idx = 0; idx < 256; ++idx)
3728 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3730 /* Mirror first 127 entries. We must take care that entry -1 is not
3731 mirrored because EOF == -1. */
3732 for (idx = 0; idx < 127; ++idx)
3733 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3735 /* The 32 bit array contains all characters < 0x100. */
3736 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3737 if (ctype->charnames[idx] < 0x100)
3738 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
/* One 256-bit bitmap (eight 32-bit words) per class, indexed by byte
   value: bit IDX is set when class256_collection[IDX] carries the bit
   of class NR.  */
3740 for (nr = 0; nr < ctype->nr_charclass; nr++)
3742 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3744 /* We only set CLASS_B for the bits in the ISO C classes, not
3745 the user defined classes. The number should not change but
3746 who knows. */
3747 #define LAST_ISO_C_BIT 11
3748 if (nr <= LAST_ISO_C_BIT)
3749 for (idx = 0; idx < 256; ++idx)
3750 if (ctype->class256_collection[idx] & _ISbit (nr))
3751 ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f);
/* Build the three-level table of every class from the wide character
   collection: each character whose class word has the bit of class NR
   is added to table NR.  */
3754 for (nr = 0; nr < ctype->nr_charclass; nr++)
3756 struct wctype_table *t;
3758 t = &ctype->class_3level[nr];
3759 t->p = 4; /* or: 5 */
3760 t->q = 7; /* or: 6 */
3761 wctype_table_init (t);
3763 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3764 if (ctype->class_collection[idx] & _ISwbit (nr))
3765 wctype_table_add (t, ctype->charnames[idx]);
/* NOTE(review): neither wctype_table_init nor wctype_table_add assigns
   t->result_size, and nothing in this loop does; in this file it is set
   by add_locale_wctype_table.  The size printed below may therefore not
   be meaningful at this point -- confirm.  */
3767 if (verbose)
3768 WITH_CUR_LOCALE (fprintf (stderr, _("\
3769 %s: table for class \"%s\": %lu bytes\n"),
3770 "LC_CTYPE", ctype->classnames[nr],
3771 (unsigned long int) t->result_size));
3774 /* Room for table of mappings. */
3775 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3776 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3777 * sizeof (uint32_t *));
3778 ctype->map_3level = (struct wctrans_table *)
3779 xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table));
3781 /* Fill in all mappings. */
3782 for (idx = 0; idx < 2; ++idx)
3784 unsigned int idx2;
3786 /* Allocate table. */
3787 ctype->map_b[idx] = (uint32_t *)
3788 xmalloc ((256 + 128) * sizeof (uint32_t));
3790 /* Copy values from collection. */
3791 for (idx2 = 0; idx2 < 256; ++idx2)
3792 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3794 /* Mirror first 127 entries. We must take care not to map entry
3795 -1 because EOF == -1. */
3796 for (idx2 = 0; idx2 < 127; ++idx2)
3797 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3799 /* EOF must map to EOF. */
3800 ctype->map_b[idx][127] = EOF;
/* NOTE(review): the loop below indexes map_collection[idx] by table
   position IDX2, not by character value; presumably the first 256
   positions of the collection hold the characters 0..255 -- confirm
   against the code that sets up the collection.  */
3803 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3805 unsigned int idx2;
3807 /* Allocate table. */
3808 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3810 /* Copy values from collection. Default is identity mapping. */
3811 for (idx2 = 0; idx2 < 256; ++idx2)
3812 ctype->map32_b[idx][idx2] =
3813 (ctype->map_collection[idx][idx2] != 0
3814 ? ctype->map_collection[idx][idx2]
3815 : idx2);
/* Build the three-level table of every mapping: each collection entry
   with a non-zero target is added as a pair of the character
   charnames[IDX] and its mapped value.  */
3818 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3820 struct wctrans_table *t;
3822 t = &ctype->map_3level[nr];
3823 t->p = 7;
3824 t->q = 9;
3825 wctrans_table_init (t);
3827 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3828 if (ctype->map_collection[nr][idx] != 0)
3829 wctrans_table_add (t, ctype->charnames[idx],
3830 ctype->map_collection[nr][idx]);
3832 if (verbose)
3833 WITH_CUR_LOCALE (fprintf (stderr, _("\
3834 %s: table for map \"%s\": %lu bytes\n"),
3835 "LC_CTYPE", ctype->mapnames[nr],
3836 (unsigned long int) t->result_size));
3839 /* Extra array for class and map names. */
3840 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3841 * sizeof (uint32_t));
3842 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3843 * sizeof (uint32_t));
3845 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3846 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3848 /* Array for width information. Because the expected widths are very
3849 small (never larger than 2) we use only one single byte. This
3850 saves space.
3851 We put only printable characters in the table. wcwidth is specified
3852 to return -1 for non-printable characters. Doing the check here
3853 saves a run-time check.
3854 But we put L'\0' in the table. This again saves a run-time check. */
3856 struct wcwidth_table *t;
3858 t = &ctype->width;
3859 t->p = 7;
3860 t->q = 9;
3861 wcwidth_table_init (t);
3863 /* First set all the printable characters of the character set to
3864 the default width. */
3865 curs = NULL;
3866 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3868 struct charseq *data = (struct charseq *) vdata;
3870 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3871 data->ucs4 = repertoire_find_value (ctype->repertoire,
3872 data->name, len);
3874 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3876 uint32_t *class_bits =
3877 find_idx (ctype, &ctype->class_collection, NULL,
3878 &ctype->class_collection_act, data->ucs4);
3880 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3881 wcwidth_table_add (t, data->ucs4, charmap->width_default);
3885 /* Now add the explicitly specified widths. */
3886 if (charmap->width_rules != NULL)
3887 for (size_t cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3889 unsigned char bytes[charmap->mb_cur_max];
3890 int nbytes = charmap->width_rules[cnt].from->nbytes;
3892 /* We have the range of character for which the width is
3893 specified described using byte sequences of the multibyte
3894 charset. We have to convert this to UCS4 now. And we
3895 cannot simply convert the beginning and the end of the
3896 sequence, we have to iterate over the byte sequence and
3897 convert it for every single character. */
3898 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3900 while (nbytes < charmap->width_rules[cnt].to->nbytes
3901 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3902 nbytes) <= 0)
3904 /* Find the UCS value for `bytes'. */
3905 int inner;
3906 uint32_t wch;
3907 struct charseq *seq =
3908 charmap_find_symbol (charmap, (char *) bytes, nbytes);
3910 if (seq == NULL)
3911 wch = ILLEGAL_CHAR_VALUE;
3912 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3913 wch = seq->ucs4;
3914 else
3915 wch = repertoire_find_value (ctype->repertoire, seq->name,
3916 strlen (seq->name));
3918 if (wch != ILLEGAL_CHAR_VALUE)
3920 /* Store the value. */
3921 uint32_t *class_bits =
3922 find_idx (ctype, &ctype->class_collection, NULL,
3923 &ctype->class_collection_act, wch);
3925 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3926 wcwidth_table_add (t, wch,
3927 charmap->width_rules[cnt].width);
3930 /* "Increment" the bytes sequence. */
3931 inner = nbytes - 1;
3932 while (inner >= 0 && bytes[inner] == 0xff)
3933 --inner;
3935 if (inner < 0)
3937 /* We have to extend the byte sequence. */
3938 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3939 break;
3941 bytes[0] = 1;
3942 memset (&bytes[1], 0, nbytes);
3943 ++nbytes;
3945 else
3947 ++bytes[inner];
3948 while (++inner < nbytes)
3949 bytes[inner] = 0;
3954 /* Set the width of L'\0' to 0. */
3955 wcwidth_table_add (t, 0, 0);
3957 if (verbose)
3958 WITH_CUR_LOCALE (fprintf (stderr, _("%s: table for width: %lu bytes\n"),
3959 "LC_CTYPE", (unsigned long int) t->result_size));
3962 /* Set MB_CUR_MAX. */
3963 ctype->mb_cur_max = charmap->mb_cur_max;
3965 /* Now determine the table for the transliteration information.
3967 XXX It is not yet clear to me whether it is worth implementing a
3968 complicated algorithm which uses a hash table to locate the entries.
3969 For now I'll use a simple array which can be searching using binary
3970 search. */
3971 if (ctype->translit_include != NULL)
3972 /* Traverse the locales mentioned in the `include' statements in a
3973 depth-first way and fold in their transliteration information. */
3974 translit_flatten (ctype, charmap, &ctype->translit);
3976 if (ctype->translit != NULL)
3978 /* First count how many entries we have. This is the upper limit
3979 since some entries from the included files might be overwritten. */
3980 size_t number = 0;
3981 struct translit_t *runp = ctype->translit;
3982 struct translit_t **sorted;
3983 size_t from_len, to_len;
3985 while (runp != NULL)
3987 ++number;
3988 runp = runp->next;
3991 /* Next we allocate an array large enough and fill in the values. */
/* NOTE(review): the element size is written as
   sizeof (struct translit_t **) although the elements of SORTED are
   struct translit_t *; the memmove below uses the latter.  The array is
   taken from the stack with alloca and its size grows with the number
   of list entries.  */
3992 sorted = (struct translit_t **) alloca (number
3993 * sizeof (struct translit_t **));
3994 runp = ctype->translit;
3995 number = 0;
/* Insert every list entry into SORTED, which is kept ordered by wcscmp
   on the `from' strings.  An entry whose `from' string equals that of
   an entry already present replaces it and does not add a new slot, so
   NUMBER ends up as the count of distinct `from' strings.  */
3998 /* Search for the place where to insert this string.
3999 XXX Better use a real sorting algorithm later. */
4000 size_t idx = 0;
4001 int replace = 0;
4003 while (idx < number)
4005 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
4006 (const wchar_t *) runp->from);
4007 if (res == 0)
4009 replace = 1;
4010 break;
4012 if (res > 0)
4013 break;
4014 ++idx;
4017 if (replace)
4018 sorted[idx] = runp;
4019 else
4021 memmove (&sorted[idx + 1], &sorted[idx],
4022 (number - idx) * sizeof (struct translit_t *));
4023 sorted[idx] = runp;
4024 ++number;
4027 runp = runp->next;
4029 while (runp != NULL);
4031 /* The next step is putting all the possible transliteration
4032 strings in one memory block so that we can write it out.
4033 We need several different blocks:
4034 - index to the from-string array
4035 - from-string array
4036 - index to the to-string array
4037 - to-string array.
4039 from_len = to_len = 0;
4040 for (size_t cnt = 0; cnt < number; ++cnt)
4042 struct translit_to_t *srunp;
4043 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4044 srunp = sorted[cnt]->to;
4045 while (srunp != NULL)
4047 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
4048 srunp = srunp->next;
4050 /* Plus one for the extra NUL character marking the end of
4051 the list for the current entry. */
4052 ++to_len;
4055 /* We can allocate the arrays for the results. */
4056 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
4057 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
4058 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
4059 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
4061 from_len = 0;
4062 to_len = 0;
4063 for (size_t cnt = 0; cnt < number; ++cnt)
4065 size_t len;
4066 struct translit_to_t *srunp;
4068 ctype->translit_from_idx[cnt] = from_len;
4069 ctype->translit_to_idx[cnt] = to_len;
4071 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4072 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
4073 (const wchar_t *) sorted[cnt]->from, len);
4074 from_len += len;
/* NOTE(review): this stores the same value as the assignment a few
   lines up; only from_len has changed in between, to_len has not.  */
4076 ctype->translit_to_idx[cnt] = to_len;
4077 srunp = sorted[cnt]->to;
4078 while (srunp != NULL)
4080 len = wcslen ((const wchar_t *) srunp->str) + 1;
4081 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
4082 (const wchar_t *) srunp->str, len);
4083 to_len += len;
4084 srunp = srunp->next;
4086 ctype->translit_to_tbl[to_len++] = L'\0';
4089 /* Store the information about the length. */
4090 ctype->translit_idx_size = number;
4091 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
4092 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
4094 else
/* No transliteration entries: point the tables at no_str and record
   zero sizes.  NOTE(review): translit_to_idx is not assigned in the
   lines visible here, unlike the other three pointers -- confirm that
   this is intended.  */
4096 ctype->translit_from_idx = no_str;
4097 ctype->translit_from_tbl = no_str;
4098 ctype->translit_to_tbl = no_str;
4099 ctype->translit_idx_size = 0;
4100 ctype->translit_from_tbl_size = 0;
4101 ctype->translit_to_tbl_size = 0;