Update.
[glibc.git] / locale / programs / ld-ctype.c
blobc1a92d861d699fec09bcd803f292c8f2b9ef4b27
1 /* Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <alloca.h>
25 #include <byteswap.h>
26 #include <endian.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <obstack.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <wctype.h>
34 #include <sys/uio.h>
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
42 #include "localedef.h"
44 #include <assert.h>
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  The constants are unsigned: shifting a signed 1 into bit 31
   would overflow `int', which is undefined behaviour.  */
# define _ISwspecial1   (1U << 29)
# define _ISwspecial2   (1U << 30)
# define _ISwspecial3   (1U << 31)
#endif
/* The bit used for representing a special class.  BITPOS maps a class
   token to its zero-based position relative to tok_upper; BIT and BITw
   turn that position into the mask used in the 8-bit and the wide
   character class tables respectively.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Yield an lvalue for the entry belonging to character VALUE in the
   table CTYPE->COLLECTION.  IDX is pasted verbatim after the member
   names, so it is either empty or an array subscript such as `[n]'.
   The lookup itself is done by find_idx.  */
#define ELEM(ctype, collection, idx, value)                                  \
  *find_idx (ctype, &(ctype)->collection idx,                                \
             &(ctype)->collection##_max idx,                                 \
             &(ctype)->collection##_act idx, value)
/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Type to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */
struct translit_to_t
{
  /* One replacement string, as 32-bit characters.  */
  uint32_t *str;

  /* Next alternative replacement, or NULL at the end of the list.  */
  struct translit_to_t *next;
};
/* One transliteration rule: a from-string together with the list of
   its to-strings.  */
struct translit_t
{
  /* String to be replaced, as 32-bit characters.  */
  uint32_t *from;

  /* File name and line number recorded for this rule.  */
  const char *fname;
  size_t lineno;

  /* List of replacement strings.  */
  struct translit_to_t *to;

  /* Next rule, or NULL at the end of the list.  */
  struct translit_t *next;
};
/* One entry of the translit_ignore list.  The list is ordered by the
   FROM member in ctype_finish.  */
struct translit_ignore_t
{
  /* Character range description.  NOTE(review): presumably the
     characters FROM, FROM + STEP, ... up to TO -- confirm against the
     parser that fills these in.  */
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* File name and line number recorded for this entry.  */
  const char *fname;
  size_t lineno;

  /* Next entry, or NULL at the end of the list.  */
  struct translit_ignore_t *next;
};
/* Type to describe a transliteration include statement.  */
struct translit_include_t
{
  /* Names of the locale and of the repertoire given in the statement.  */
  const char *copy_locale;
  const char *copy_repertoire;

  /* Next include statement, or NULL at the end of the list.  */
  struct translit_include_t *next;
};
119 /* Sparse table of uint32_t. */
120 #define TABLE idx_table
121 #define ELEMENT uint32_t
122 #define DEFAULT ((uint32_t) ~0)
123 #define NO_FINALIZE
124 #include "3level.h"
127 /* The real definition of the struct for the LC_CTYPE locale. */
128 struct locale_ctype_t
130 uint32_t *charnames;
131 size_t charnames_max;
132 size_t charnames_act;
133 /* An index lookup table, to speedup find_idx. */
134 struct idx_table charnames_idx;
136 struct repertoire_t *repertoire;
138 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
139 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
140 size_t nr_charclass;
141 const char *classnames[MAX_NR_CHARCLASS];
142 uint32_t last_class_char;
143 uint32_t class256_collection[256];
144 uint32_t *class_collection;
145 size_t class_collection_max;
146 size_t class_collection_act;
147 uint32_t class_done;
148 uint32_t class_offset;
150 struct charseq **mbdigits;
151 size_t mbdigits_act;
152 size_t mbdigits_max;
153 uint32_t *wcdigits;
154 size_t wcdigits_act;
155 size_t wcdigits_max;
157 struct charseq *mboutdigits[10];
158 uint32_t wcoutdigits[10];
159 size_t outdigits_act;
161 /* If the following number ever turns out to be too small simply
162 increase it. But I doubt it will. --drepper@gnu */
163 #define MAX_NR_CHARMAP 16
164 const char *mapnames[MAX_NR_CHARMAP];
165 uint32_t *map_collection[MAX_NR_CHARMAP];
166 uint32_t map256_collection[2][256];
167 size_t map_collection_max[MAX_NR_CHARMAP];
168 size_t map_collection_act[MAX_NR_CHARMAP];
169 size_t map_collection_nr;
170 size_t last_map_idx;
171 int tomap_done[MAX_NR_CHARMAP];
172 uint32_t map_offset;
174 /* Transliteration information. */
175 struct translit_include_t *translit_include;
176 struct translit_t *translit;
177 struct translit_ignore_t *translit_ignore;
178 uint32_t ntranslit_ignore;
180 uint32_t *default_missing;
181 const char *default_missing_file;
182 size_t default_missing_lineno;
184 /* The arrays for the binary representation. */
185 char_class_t *ctype_b;
186 char_class32_t *ctype32_b;
187 uint32_t **map_b;
188 uint32_t **map32_b;
189 uint32_t **class_b;
190 struct iovec *class_3level;
191 struct iovec *map_3level;
192 uint32_t *class_name_ptr;
193 uint32_t *map_name_ptr;
194 struct iovec width;
195 uint32_t mb_cur_max;
196 const char *codeset_name;
197 uint32_t *translit_from_idx;
198 uint32_t *translit_from_tbl;
199 uint32_t *translit_to_idx;
200 uint32_t *translit_to_tbl;
201 uint32_t translit_idx_size;
202 size_t translit_from_tbl_size;
203 size_t translit_to_tbl_size;
205 struct obstack mempool;
/* Marker for an empty slot.  This has the value 0xFFFFFFFF, regardless
   whether 'int' is 16 bit, 32 bit, or 64 bit.  */
#define EMPTY ((uint32_t) ~0)


/* Allocation hooks expected by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           const struct charmap_t *charmap,
                           struct localedef_t *copy_locale,
                           int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, const struct charmap_t *charmap);
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, unsigned int idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                const struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             const struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Symbolic long names of the ten decimal digits.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The same ten digits written as UCS4 symbol names (U+0030..U+0039).  */
static const char *uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
/* The digit characters themselves; indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
251 static void
252 ctype_startup (struct linereader *lr, struct localedef_t *locale,
253 const struct charmap_t *charmap,
254 struct localedef_t *copy_locale, int ignore_content)
256 unsigned int cnt;
257 struct locale_ctype_t *ctype;
259 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
261 if (copy_locale == NULL)
263 /* Allocate the needed room. */
264 locale->categories[LC_CTYPE].ctype = ctype =
265 (struct locale_ctype_t *) xcalloc (1,
266 sizeof (struct locale_ctype_t));
268 /* We have seen no names yet. */
269 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
270 ctype->charnames =
271 (unsigned int *) xmalloc (ctype->charnames_max
272 * sizeof (unsigned int));
273 for (cnt = 0; cnt < 256; ++cnt)
274 ctype->charnames[cnt] = cnt;
275 ctype->charnames_act = 256;
276 idx_table_init (&ctype->charnames_idx);
278 /* Fill character class information. */
279 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
280 /* The order of the following instructions determines the bit
281 positions! */
282 ctype_class_new (lr, ctype, "upper");
283 ctype_class_new (lr, ctype, "lower");
284 ctype_class_new (lr, ctype, "alpha");
285 ctype_class_new (lr, ctype, "digit");
286 ctype_class_new (lr, ctype, "xdigit");
287 ctype_class_new (lr, ctype, "space");
288 ctype_class_new (lr, ctype, "print");
289 ctype_class_new (lr, ctype, "graph");
290 ctype_class_new (lr, ctype, "blank");
291 ctype_class_new (lr, ctype, "cntrl");
292 ctype_class_new (lr, ctype, "punct");
293 ctype_class_new (lr, ctype, "alnum");
294 #ifdef PREDEFINED_CLASSES
295 /* The following are extensions from ISO 14652. */
296 ctype_class_new (lr, ctype, "left_to_right");
297 ctype_class_new (lr, ctype, "right_to_left");
298 ctype_class_new (lr, ctype, "num_terminator");
299 ctype_class_new (lr, ctype, "num_separator");
300 ctype_class_new (lr, ctype, "segment_separator");
301 ctype_class_new (lr, ctype, "block_separator");
302 ctype_class_new (lr, ctype, "direction_control");
303 ctype_class_new (lr, ctype, "sym_swap_layout");
304 ctype_class_new (lr, ctype, "char_shape_selector");
305 ctype_class_new (lr, ctype, "num_shape_selector");
306 ctype_class_new (lr, ctype, "non_spacing");
307 ctype_class_new (lr, ctype, "non_spacing_level3");
308 ctype_class_new (lr, ctype, "normal_connect");
309 ctype_class_new (lr, ctype, "r_connect");
310 ctype_class_new (lr, ctype, "no_connect");
311 ctype_class_new (lr, ctype, "no_connect-space");
312 ctype_class_new (lr, ctype, "vowel_connect");
313 #endif
315 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
316 ctype->class_collection
317 = (uint32_t *) xcalloc (sizeof (unsigned long int),
318 ctype->class_collection_max);
319 ctype->class_collection_act = 256;
321 /* Fill character map information. */
322 ctype->last_map_idx = MAX_NR_CHARMAP;
323 ctype_map_new (lr, ctype, "toupper", charmap);
324 ctype_map_new (lr, ctype, "tolower", charmap);
325 #ifdef PREDEFINED_CLASSES
326 ctype_map_new (lr, ctype, "tosymmetric", charmap);
327 #endif
329 /* Fill first 256 entries in `toXXX' arrays. */
330 for (cnt = 0; cnt < 256; ++cnt)
332 ctype->map_collection[0][cnt] = cnt;
333 ctype->map_collection[1][cnt] = cnt;
334 #ifdef PREDEFINED_CLASSES
335 ctype->map_collection[2][cnt] = cnt;
336 #endif
337 ctype->map256_collection[0][cnt] = cnt;
338 ctype->map256_collection[1][cnt] = cnt;
341 obstack_init (&ctype->mempool);
343 else
344 ctype = locale->categories[LC_CTYPE].ctype =
345 copy_locale->categories[LC_CTYPE].ctype;
350 void
351 ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
353 /* See POSIX.2, table 2-6 for the meaning of the following table. */
354 #define NCLASS 12
355 static const struct
357 const char *name;
358 const char allow[NCLASS];
360 valid_table[NCLASS] =
362 /* The order is important. See token.h for more information.
363 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
364 { "upper", "--MX-XDDXXX-" },
365 { "lower", "--MX-XDDXXX-" },
366 { "alpha", "---X-XDDXXX-" },
367 { "digit", "XXX--XDDXXX-" },
368 { "xdigit", "-----XDDXXX-" },
369 { "space", "XXXXX------X" },
370 { "print", "---------X--" },
371 { "graph", "---------X--" },
372 { "blank", "XXXXXM-----X" },
373 { "cntrl", "XXXXX-XX--XX" },
374 { "punct", "XXXXX-DD-X-X" },
375 { "alnum", "-----XDDXXX-" }
377 size_t cnt;
378 int cls1, cls2;
379 uint32_t space_value;
380 struct charseq *space_seq;
381 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
382 int warned;
383 const void *key;
384 size_t len;
385 void *vdata;
386 void *curs;
388 /* Now resolve copying and also handle completely missing definitions. */
389 if (ctype == NULL)
391 const char *repertoire_name;
393 /* First see whether we were supposed to copy. If yes, find the
394 actual definition. */
395 if (locale->copy_name[LC_CTYPE] != NULL)
397 /* Find the copying locale. This has to happen transitively since
398 the locale we are copying from might also copying another one. */
399 struct localedef_t *from = locale;
402 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
403 from->repertoire_name, charmap);
404 while (from->categories[LC_CTYPE].ctype == NULL
405 && from->copy_name[LC_CTYPE] != NULL);
407 ctype = locale->categories[LC_CTYPE].ctype
408 = from->categories[LC_CTYPE].ctype;
411 /* If there is still no definition issue an warning and create an
412 empty one. */
413 if (ctype == NULL)
415 if (! be_quiet)
416 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
417 ctype_startup (NULL, locale, charmap, NULL, 0);
418 ctype = locale->categories[LC_CTYPE].ctype;
421 /* Get the repertoire we have to use. */
422 repertoire_name = locale->repertoire_name ?: repertoire_global;
423 if (repertoire_name != NULL)
424 ctype->repertoire = repertoire_read (repertoire_name);
427 /* We need the name of the currently used 8-bit character set to
428 make correct conversion between this 8-bit representation and the
429 ISO 10646 character set used internally for wide characters. */
430 ctype->codeset_name = charmap->code_set_name;
431 if (ctype->codeset_name == NULL)
433 if (! be_quiet)
434 error (0, 0, _("No character set name specified in charmap"));
435 ctype->codeset_name = "//UNKNOWN//";
438 /* Set default value for classes not specified. */
439 set_class_defaults (ctype, charmap, ctype->repertoire);
441 /* Check according to table. */
442 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
444 uint32_t tmp = ctype->class_collection[cnt];
446 if (tmp != 0)
448 for (cls1 = 0; cls1 < NCLASS; ++cls1)
449 if ((tmp & _ISwbit (cls1)) != 0)
450 for (cls2 = 0; cls2 < NCLASS; ++cls2)
451 if (valid_table[cls1].allow[cls2] != '-')
453 int eq = (tmp & _ISwbit (cls2)) != 0;
454 switch (valid_table[cls1].allow[cls2])
456 case 'M':
457 if (!eq)
459 uint32_t value = ctype->charnames[cnt];
461 if (!be_quiet)
462 error (0, 0, _("\
463 character L'\\u%0*x' in class `%s' must be in class `%s'"),
464 value > 0xffff ? 8 : 4, value,
465 valid_table[cls1].name,
466 valid_table[cls2].name);
468 break;
470 case 'X':
471 if (eq)
473 uint32_t value = ctype->charnames[cnt];
475 if (!be_quiet)
476 error (0, 0, _("\
477 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
478 value > 0xffff ? 8 : 4, value,
479 valid_table[cls1].name,
480 valid_table[cls2].name);
482 break;
484 case 'D':
485 ctype->class_collection[cnt] |= _ISwbit (cls2);
486 break;
488 default:
489 error (5, 0, _("internal error in %s, line %u"),
490 __FUNCTION__, __LINE__);
496 for (cnt = 0; cnt < 256; ++cnt)
498 uint32_t tmp = ctype->class256_collection[cnt];
500 if (tmp != 0)
502 for (cls1 = 0; cls1 < NCLASS; ++cls1)
503 if ((tmp & _ISbit (cls1)) != 0)
504 for (cls2 = 0; cls2 < NCLASS; ++cls2)
505 if (valid_table[cls1].allow[cls2] != '-')
507 int eq = (tmp & _ISbit (cls2)) != 0;
508 switch (valid_table[cls1].allow[cls2])
510 case 'M':
511 if (!eq)
513 char buf[17];
515 snprintf (buf, sizeof buf, "\\%Zo", cnt);
517 if (!be_quiet)
518 error (0, 0, _("\
519 character '%s' in class `%s' must be in class `%s'"),
520 buf, valid_table[cls1].name,
521 valid_table[cls2].name);
523 break;
525 case 'X':
526 if (eq)
528 char buf[17];
530 snprintf (buf, sizeof buf, "\\%Zo", cnt);
532 if (!be_quiet)
533 error (0, 0, _("\
534 character '%s' in class `%s' must not be in class `%s'"),
535 buf, valid_table[cls1].name,
536 valid_table[cls2].name);
538 break;
540 case 'D':
541 ctype->class256_collection[cnt] |= _ISbit (cls2);
542 break;
544 default:
545 error (5, 0, _("internal error in %s, line %u"),
546 __FUNCTION__, __LINE__);
552 /* ... and now test <SP> as a special case. */
553 space_value = 32;
554 if (((cnt = BITPOS (tok_space),
555 (ELEM (ctype, class_collection, , space_value)
556 & BITw (tok_space)) == 0)
557 || (cnt = BITPOS (tok_blank),
558 (ELEM (ctype, class_collection, , space_value)
559 & BITw (tok_blank)) == 0)))
561 if (!be_quiet)
562 error (0, 0, _("<SP> character not in class `%s'"),
563 valid_table[cnt].name);
565 else if (((cnt = BITPOS (tok_punct),
566 (ELEM (ctype, class_collection, , space_value)
567 & BITw (tok_punct)) != 0)
568 || (cnt = BITPOS (tok_graph),
569 (ELEM (ctype, class_collection, , space_value)
570 & BITw (tok_graph))
571 != 0)))
573 if (!be_quiet)
574 error (0, 0, _("<SP> character must not be in class `%s'"),
575 valid_table[cnt].name);
577 else
578 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
580 space_seq = charmap_find_value (charmap, "SP", 2);
581 if (space_seq == NULL)
582 space_seq = charmap_find_value (charmap, "space", 5);
583 if (space_seq == NULL)
584 space_seq = charmap_find_value (charmap, "U00000020", 9);
585 if (space_seq == NULL || space_seq->nbytes != 1)
587 if (!be_quiet)
588 error (0, 0, _("character <SP> not defined in character map"));
590 else if (((cnt = BITPOS (tok_space),
591 (ctype->class256_collection[space_seq->bytes[0]]
592 & BIT (tok_space)) == 0)
593 || (cnt = BITPOS (tok_blank),
594 (ctype->class256_collection[space_seq->bytes[0]]
595 & BIT (tok_blank)) == 0)))
597 if (!be_quiet)
598 error (0, 0, _("<SP> character not in class `%s'"),
599 valid_table[cnt].name);
601 else if (((cnt = BITPOS (tok_punct),
602 (ctype->class256_collection[space_seq->bytes[0]]
603 & BIT (tok_punct)) != 0)
604 || (cnt = BITPOS (tok_graph),
605 (ctype->class256_collection[space_seq->bytes[0]]
606 & BIT (tok_graph)) != 0)))
608 if (!be_quiet)
609 error (0, 0, _("<SP> character must not be in class `%s'"),
610 valid_table[cnt].name);
612 else
613 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
615 /* Now that the tests are done make sure the name array contains all
616 characters which are handled in the WIDTH section of the
617 character set definition file. */
618 if (charmap->width_rules != NULL)
619 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
621 unsigned char bytes[charmap->mb_cur_max];
622 int nbytes = charmap->width_rules[cnt].from->nbytes;
624 /* We have the range of character for which the width is
625 specified described using byte sequences of the multibyte
626 charset. We have to convert this to UCS4 now. And we
627 cannot simply convert the beginning and the end of the
628 sequence, we have to iterate over the byte sequence and
629 convert it for every single character. */
630 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
632 while (nbytes < charmap->width_rules[cnt].to->nbytes
633 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
634 nbytes) <= 0)
636 /* Find the UCS value for `bytes'. */
637 int inner;
638 uint32_t wch;
639 struct charseq *seq = charmap_find_symbol (charmap, bytes, nbytes);
641 if (seq == NULL)
642 wch = ILLEGAL_CHAR_VALUE;
643 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
644 wch = seq->ucs4;
645 else
646 wch = repertoire_find_value (ctype->repertoire, seq->name,
647 strlen (seq->name));
649 if (wch != ILLEGAL_CHAR_VALUE)
650 /* We are only interested in the side-effects of the
651 `find_idx' call. It will add appropriate entries in
652 the name array if this is necessary. */
653 (void) find_idx (ctype, NULL, NULL, NULL, wch);
655 /* "Increment" the bytes sequence. */
656 inner = nbytes - 1;
657 while (inner >= 0 && bytes[inner] == 0xff)
658 --inner;
660 if (inner < 0)
662 /* We have to extend the byte sequence. */
663 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
664 break;
666 bytes[0] = 1;
667 memset (&bytes[1], 0, nbytes);
668 ++nbytes;
670 else
672 ++bytes[inner];
673 while (++inner < nbytes)
674 bytes[inner] = 0;
679 /* Now set all the other characters of the character set to the
680 default width. */
681 curs = NULL;
682 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
684 struct charseq *data = (struct charseq *) vdata;
686 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
687 data->ucs4 = repertoire_find_value (ctype->repertoire,
688 data->name, len);
690 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
691 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
694 /* There must be a multiple of 10 digits. */
695 if (ctype->mbdigits_act % 10 != 0)
697 assert (ctype->mbdigits_act == ctype->wcdigits_act);
698 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
699 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
700 error (0, 0, _("`digit' category has not entries in groups of ten"));
703 /* Check the input digits. There must be a multiple of ten available.
704 In each group it could be that one or the other character is missing.
705 In this case the whole group must be removed. */
706 cnt = 0;
707 while (cnt < ctype->mbdigits_act)
709 size_t inner;
710 for (inner = 0; inner < 10; ++inner)
711 if (ctype->mbdigits[cnt + inner] == NULL)
712 break;
714 if (inner == 10)
715 cnt += 10;
716 else
718 /* Remove the group. */
719 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
720 ((ctype->wcdigits_act - cnt - 10)
721 * sizeof (ctype->mbdigits[0])));
722 ctype->mbdigits_act -= 10;
726 /* If no input digits are given use the default. */
727 if (ctype->mbdigits_act == 0)
729 if (ctype->mbdigits_max == 0)
731 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
732 10 * sizeof (struct charseq *));
733 ctype->mbdigits_max = 10;
736 for (cnt = 0; cnt < 10; ++cnt)
738 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
739 digits + cnt, 1);
740 if (ctype->mbdigits[cnt] == NULL)
742 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
743 longnames[cnt],
744 strlen (longnames[cnt]));
745 if (ctype->mbdigits[cnt] == NULL)
747 /* Hum, this ain't good. */
748 error (0, 0, _("\
749 no input digits defined and none of the standard names in the charmap"));
751 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
752 sizeof (struct charseq) + 1);
754 /* This is better than nothing. */
755 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
756 ctype->mbdigits[cnt]->nbytes = 1;
761 ctype->mbdigits_act = 10;
764 /* Check the wide character input digits. There must be a multiple
765 of ten available. In each group it could be that one or the other
766 character is missing. In this case the whole group must be
767 removed. */
768 cnt = 0;
769 while (cnt < ctype->wcdigits_act)
771 size_t inner;
772 for (inner = 0; inner < 10; ++inner)
773 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
774 break;
776 if (inner == 10)
777 cnt += 10;
778 else
780 /* Remove the group. */
781 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
782 ((ctype->wcdigits_act - cnt - 10)
783 * sizeof (ctype->wcdigits[0])));
784 ctype->wcdigits_act -= 10;
788 /* If no input digits are given use the default. */
789 if (ctype->wcdigits_act == 0)
791 if (ctype->wcdigits_max == 0)
793 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
794 10 * sizeof (uint32_t));
795 ctype->wcdigits_max = 10;
798 for (cnt = 0; cnt < 10; ++cnt)
799 ctype->wcdigits[cnt] = L'0' + cnt;
801 ctype->mbdigits_act = 10;
804 /* Check the outdigits. */
805 warned = 0;
806 for (cnt = 0; cnt < 10; ++cnt)
807 if (ctype->mboutdigits[cnt] == NULL)
809 static struct charseq replace[2];
811 if (!warned)
813 error (0, 0, _("\
814 not all characters used in `outdigit' are available in the charmap"));
815 warned = 1;
818 replace[0].nbytes = 1;
819 replace[0].bytes[0] = '?';
820 replace[0].bytes[1] = '\0';
821 ctype->mboutdigits[cnt] = &replace[0];
824 warned = 0;
825 for (cnt = 0; cnt < 10; ++cnt)
826 if (ctype->wcoutdigits[cnt] == 0)
828 if (!warned)
830 error (0, 0, _("\
831 not all characters used in `outdigit' are available in the repertoire"));
832 warned = 1;
835 ctype->wcoutdigits[cnt] = L'?';
838 /* Sort the entries in the translit_ignore list. */
839 if (ctype->translit_ignore != NULL)
841 struct translit_ignore_t *firstp = ctype->translit_ignore;
842 struct translit_ignore_t *runp;
844 ctype->ntranslit_ignore = 1;
846 for (runp = firstp->next; runp != NULL; runp = runp->next)
848 struct translit_ignore_t *lastp = NULL;
849 struct translit_ignore_t *cmpp;
851 ++ctype->ntranslit_ignore;
853 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
854 if (runp->from < cmpp->from)
855 break;
857 runp->next = lastp;
858 if (lastp == NULL)
859 firstp = runp;
862 ctype->translit_ignore = firstp;
867 void
868 ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
869 const char *output_path)
871 static const char nulbytes[4] = { 0, 0, 0, 0 };
872 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
873 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
874 + ctype->nr_charclass + ctype->map_collection_nr);
875 struct iovec iov[2 + nelems + 2 * ctype->nr_charclass
876 + ctype->map_collection_nr + 4];
877 struct locale_file data;
878 uint32_t idx[nelems + 1];
879 uint32_t default_missing_len;
880 size_t elem, cnt, offset, total;
881 char *cp;
883 /* Now prepare the output: Find the sizes of the table we can use. */
884 allocate_arrays (ctype, charmap, ctype->repertoire);
886 data.magic = LIMAGIC (LC_CTYPE);
887 data.n = nelems;
888 iov[0].iov_base = (void *) &data;
889 iov[0].iov_len = sizeof (data);
891 iov[1].iov_base = (void *) idx;
892 iov[1].iov_len = nelems * sizeof (uint32_t);
894 idx[0] = iov[0].iov_len + iov[1].iov_len;
895 offset = 0;
897 for (elem = 0; elem < nelems; ++elem)
899 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
900 switch (elem)
902 #define CTYPE_EMPTY(name) \
903 case name: \
904 iov[2 + elem + offset].iov_base = NULL; \
905 iov[2 + elem + offset].iov_len = 0; \
906 idx[elem + 1] = idx[elem]; \
907 break
909 CTYPE_EMPTY(_NL_CTYPE_GAP1);
910 CTYPE_EMPTY(_NL_CTYPE_GAP2);
911 CTYPE_EMPTY(_NL_CTYPE_GAP3);
912 CTYPE_EMPTY(_NL_CTYPE_GAP4);
913 CTYPE_EMPTY(_NL_CTYPE_GAP5);
914 CTYPE_EMPTY(_NL_CTYPE_GAP6);
916 #define CTYPE_DATA(name, base, len) \
917 case _NL_ITEM_INDEX (name): \
918 iov[2 + elem + offset].iov_base = (base); \
919 iov[2 + elem + offset].iov_len = (len); \
920 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
921 break
923 CTYPE_DATA (_NL_CTYPE_CLASS,
924 ctype->ctype_b,
925 (256 + 128) * sizeof (char_class_t));
927 CTYPE_DATA (_NL_CTYPE_TOUPPER,
928 ctype->map_b[0],
929 (256 + 128) * sizeof (uint32_t));
930 CTYPE_DATA (_NL_CTYPE_TOLOWER,
931 ctype->map_b[1],
932 (256 + 128) * sizeof (uint32_t));
934 CTYPE_DATA (_NL_CTYPE_TOUPPER32,
935 ctype->map32_b[0],
936 256 * sizeof (uint32_t));
937 CTYPE_DATA (_NL_CTYPE_TOLOWER32,
938 ctype->map32_b[1],
939 256 * sizeof (uint32_t));
941 CTYPE_DATA (_NL_CTYPE_CLASS32,
942 ctype->ctype32_b,
943 256 * sizeof (char_class32_t));
945 CTYPE_DATA (_NL_CTYPE_CLASS_OFFSET,
946 &ctype->class_offset, sizeof (uint32_t));
948 CTYPE_DATA (_NL_CTYPE_MAP_OFFSET,
949 &ctype->map_offset, sizeof (uint32_t));
951 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TAB_SIZE,
952 &ctype->translit_idx_size, sizeof (uint32_t));
954 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
955 ctype->translit_from_idx,
956 ctype->translit_idx_size * sizeof (uint32_t));
958 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
959 ctype->translit_from_tbl,
960 ctype->translit_from_tbl_size);
962 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
963 ctype->translit_to_idx,
964 ctype->translit_idx_size * sizeof (uint32_t));
966 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
967 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
969 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
970 /* The class name array. */
971 total = 0;
972 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
974 iov[2 + elem + offset].iov_base
975 = (void *) ctype->classnames[cnt];
976 iov[2 + elem + offset].iov_len
977 = strlen (ctype->classnames[cnt]) + 1;
978 total += iov[2 + elem + offset].iov_len;
980 iov[2 + elem + offset].iov_base = (void *) nulbytes;
981 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
982 total += 1 + (4 - ((total + 1) % 4));
984 idx[elem + 1] = idx[elem] + total;
985 break;
987 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
988 /* The class name array. */
989 total = 0;
990 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
992 iov[2 + elem + offset].iov_base
993 = (void *) ctype->mapnames[cnt];
994 iov[2 + elem + offset].iov_len
995 = strlen (ctype->mapnames[cnt]) + 1;
996 total += iov[2 + elem + offset].iov_len;
998 iov[2 + elem + offset].iov_base = (void *) nulbytes;
999 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
1000 total += 1 + (4 - ((total + 1) % 4));
1002 idx[elem + 1] = idx[elem] + total;
1003 break;
1005 CTYPE_DATA (_NL_CTYPE_WIDTH,
1006 ctype->width.iov_base,
1007 ctype->width.iov_len);
1009 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
1010 &ctype->mb_cur_max, sizeof (uint32_t));
1012 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1013 total = strlen (ctype->codeset_name) + 1;
1014 if (total % 4 == 0)
1015 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
1016 else
1018 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
1019 memset (mempcpy (iov[2 + elem + offset].iov_base,
1020 ctype->codeset_name, total),
1021 '\0', 4 - (total & 3));
1022 total = (total + 3) & ~3;
1024 iov[2 + elem + offset].iov_len = total;
1025 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1026 break;
1028 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1029 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
1030 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1031 *(uint32_t *) iov[2 + elem + offset].iov_base =
1032 ctype->mbdigits_act / 10;
1033 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
1034 break;
1036 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1037 /* Align entries. */
1038 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1039 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1040 idx[elem] += iov[2 + elem + offset].iov_len;
1041 ++offset;
1043 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
1044 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1045 *(uint32_t *) iov[2 + elem + offset].iov_base =
1046 ctype->wcdigits_act / 10;
1047 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
1048 break;
1050 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1051 /* Compute the length of all possible characters. For INDIGITS
1052 there might be more than one. We simply concatenate all of
1053 them with a NUL byte following. The NUL byte wouldn't be
1054 necessary but it makes it easier for the user. */
1055 total = 0;
1057 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1058 cnt < ctype->mbdigits_act; cnt += 10)
1059 total += ctype->mbdigits[cnt]->nbytes + 1;
1060 iov[2 + elem + offset].iov_base = (char *) alloca (total);
1061 iov[2 + elem + offset].iov_len = total;
1063 cp = iov[2 + elem + offset].iov_base;
1064 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1065 cnt < ctype->mbdigits_act; cnt += 10)
1067 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
1068 ctype->mbdigits[cnt]->nbytes);
1069 *cp++ = '\0';
1071 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1072 break;
1074 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1075 /* Compute the length of all possible characters. For INDIGITS
1076 there might be more than one. We simply concatenate all of
1077 them with a NUL byte following. The NUL byte wouldn't be
1078 necessary but it makes it easier for the user. */
1079 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1080 total = ctype->mboutdigits[cnt]->nbytes + 1;
1081 iov[2 + elem + offset].iov_base = (char *) alloca (total);
1082 iov[2 + elem + offset].iov_len = total;
1084 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
1085 ctype->mboutdigits[cnt]->bytes,
1086 ctype->mboutdigits[cnt]->nbytes) = '\0';
1087 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1088 break;
1090 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1091 total = ctype->wcdigits_act / 10;
1093 iov[2 + elem + offset].iov_base =
1094 (uint32_t *) alloca (total * sizeof (uint32_t));
1095 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
1097 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1098 cnt < ctype->wcdigits_act; cnt += 10)
1099 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
1100 = ctype->wcdigits[cnt];
1101 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1102 break;
1104 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC):
1105 /* Align entries. */
1106 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1107 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1108 idx[elem] += iov[2 + elem + offset].iov_len;
1109 ++offset;
1110 /* FALLTHROUGH */
1112 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT1_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1113 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1114 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
1115 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1116 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1117 break;
1119 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1120 /* Align entries. */
1121 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1122 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1123 idx[elem] += iov[2 + elem + offset].iov_len;
1124 ++offset;
1126 default_missing_len = (ctype->default_missing
1127 ? wcslen ((wchar_t *)ctype->default_missing)
1128 : 0);
1129 iov[2 + elem + offset].iov_base = &default_missing_len;
1130 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1131 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1132 break;
1134 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1135 iov[2 + elem + offset].iov_base =
1136 ctype->default_missing ?: (uint32_t *) L"";
1137 iov[2 + elem + offset].iov_len =
1138 wcslen (iov[2 + elem + offset].iov_base);
1139 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1140 break;
1142 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1143 /* Align entries. */
1144 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1145 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1146 idx[elem] += iov[2 + elem + offset].iov_len;
1147 ++offset;
1149 iov[2 + elem + offset].iov_base = &ctype->ntranslit_ignore;
1150 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1151 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1152 break;
1154 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1156 uint32_t *ranges = (uint32_t *) alloca (ctype->ntranslit_ignore
1157 * 3 * sizeof (uint32_t));
1158 struct translit_ignore_t *runp;
1160 iov[2 + elem + offset].iov_base = ranges;
1161 iov[2 + elem + offset].iov_len = (ctype->ntranslit_ignore
1162 * 3 * sizeof (uint32_t));
1164 for (runp = ctype->translit_ignore; runp != NULL;
1165 runp = runp->next)
1167 *ranges++ = runp->from;
1168 *ranges++ = runp->to;
1169 *ranges++ = runp->step;
1172 /* Remove the following line in case a new entry is added
1173 after _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN. */
1174 if (elem < nelems)
1175 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1176 break;
1178 default:
1179 assert (! "unknown CTYPE element");
1181 else
1183 /* Handle extra maps. */
1184 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1185 if (nr < ctype->nr_charclass)
1187 iov[2 + elem + offset].iov_base = ctype->class_b[nr];
1188 iov[2 + elem + offset].iov_len = 256 / 32 * sizeof (uint32_t);
1189 idx[elem] += iov[2 + elem + offset].iov_len;
1190 ++offset;
1192 iov[2 + elem + offset] = ctype->class_3level[nr];
1194 else
1196 nr -= ctype->nr_charclass;
1197 assert (nr < ctype->map_collection_nr);
1198 iov[2 + elem + offset] = ctype->map_3level[nr];
1200 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1204 assert (2 + elem + offset == (nelems + 2 * ctype->nr_charclass
1205 + ctype->map_collection_nr + 4 + 2));
1207 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
1211 /* Local functions. */
1212 static void
1213 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1214 const char *name)
1216 size_t cnt;
1218 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1219 if (strcmp (ctype->classnames[cnt], name) == 0)
1220 break;
1222 if (cnt < ctype->nr_charclass)
1224 lr_error (lr, _("character class `%s' already defined"), name);
1225 return;
1228 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1229 /* Exit code 2 is prescribed in P1003.2b. */
1230 error (2, 0, _("\
1231 implementation limit: no more than %Zd character classes allowed"),
1232 MAX_NR_CHARCLASS);
1234 ctype->classnames[ctype->nr_charclass++] = name;
1238 static void
1239 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1240 const char *name, const struct charmap_t *charmap)
1242 size_t max_chars = 0;
1243 size_t cnt;
1245 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1247 if (strcmp (ctype->mapnames[cnt], name) == 0)
1248 break;
1250 if (max_chars < ctype->map_collection_max[cnt])
1251 max_chars = ctype->map_collection_max[cnt];
1254 if (cnt < ctype->map_collection_nr)
1256 lr_error (lr, _("character map `%s' already defined"), name);
1257 return;
1260 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1261 /* Exit code 2 is prescribed in P1003.2b. */
1262 error (2, 0, _("\
1263 implementation limit: no more than %d character maps allowed"),
1264 MAX_NR_CHARMAP);
1266 ctype->mapnames[cnt] = name;
1268 if (max_chars == 0)
1269 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1270 else
1271 ctype->map_collection_max[cnt] = max_chars;
1273 ctype->map_collection[cnt] = (uint32_t *)
1274 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1275 ctype->map_collection_act[cnt] = 256;
1277 ++ctype->map_collection_nr;
1281 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1282 is possible if we only want to extend the name array. */
1283 static uint32_t *
1284 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1285 size_t *act, uint32_t idx)
1287 size_t cnt;
1289 if (idx < 256)
1290 return table == NULL ? NULL : &(*table)[idx];
1292 /* Use the charnames_idx lookup table instead of the slow search loop. */
1293 #if 1
1294 cnt = idx_table_get (&ctype->charnames_idx, idx);
1295 if (cnt == EMPTY)
1296 /* Not found. */
1297 cnt = ctype->charnames_act;
1298 #else
1299 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1300 if (ctype->charnames[cnt] == idx)
1301 break;
1302 #endif
1304 /* We have to distinguish two cases: the name is found or not. */
1305 if (cnt == ctype->charnames_act)
1307 /* Extend the name array. */
1308 if (ctype->charnames_act == ctype->charnames_max)
1310 ctype->charnames_max *= 2;
1311 ctype->charnames = (uint32_t *)
1312 xrealloc (ctype->charnames,
1313 sizeof (uint32_t) * ctype->charnames_max);
1315 ctype->charnames[ctype->charnames_act++] = idx;
1316 idx_table_add (&ctype->charnames_idx, idx, cnt);
1319 if (table == NULL)
1320 /* We have done everything we are asked to do. */
1321 return NULL;
1323 if (max == NULL)
1324 /* The caller does not want to extend the table. */
1325 return (cnt >= *act ? NULL : &(*table)[cnt]);
1327 if (cnt >= *act)
1329 if (cnt >= *max)
1331 size_t old_max = *max;
1333 *max *= 2;
1334 while (*max <= cnt);
1336 *table =
1337 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1338 memset (&(*table)[old_max], '\0',
1339 (*max - old_max) * sizeof (uint32_t));
1342 *act = cnt + 1;
1345 return &(*table)[cnt];
1349 static int
1350 get_character (struct token *now, const struct charmap_t *charmap,
1351 struct repertoire_t *repertoire,
1352 struct charseq **seqp, uint32_t *wchp)
1354 if (now->tok == tok_bsymbol)
1356 /* This will hopefully be the normal case. */
1357 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1358 now->val.str.lenmb);
1359 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1360 now->val.str.lenmb);
1362 else if (now->tok == tok_ucs4)
1364 char utmp[10];
1366 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1367 *seqp = charmap_find_value (charmap, utmp, 9);
1369 if (*seqp == NULL)
1370 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1372 if (*seqp == NULL)
1374 /* Compute the value in the charmap from the UCS value. */
1375 const char *symbol = repertoire_find_symbol (repertoire,
1376 now->val.ucs4);
1378 if (symbol == NULL)
1379 *seqp = NULL;
1380 else
1381 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1383 if (*seqp == NULL)
1385 if (repertoire != NULL)
1387 /* Insert a negative entry. */
1388 static const struct charseq negative
1389 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1390 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1391 sizeof (uint32_t));
1392 *newp = now->val.ucs4;
1394 insert_entry (&repertoire->seq_table, newp,
1395 sizeof (uint32_t), (void *) &negative);
1398 else
1399 (*seqp)->ucs4 = now->val.ucs4;
1401 else if ((*seqp)->ucs4 != now->val.ucs4)
1402 *seqp = NULL;
1404 *wchp = now->val.ucs4;
1406 else if (now->tok == tok_charcode)
1408 /* We must map from the byte code to UCS4. */
1409 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1410 now->val.str.lenmb);
1412 if (*seqp == NULL)
1413 *wchp = ILLEGAL_CHAR_VALUE;
1414 else
1416 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1417 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1418 strlen ((*seqp)->name));
1419 *wchp = (*seqp)->ucs4;
1422 else
1423 return 1;
1425 return 0;
1429 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1430 the .(2). counterparts. */
1431 static void
1432 charclass_symbolic_ellipsis (struct linereader *ldfile,
1433 struct locale_ctype_t *ctype,
1434 const struct charmap_t *charmap,
1435 struct repertoire_t *repertoire,
1436 struct token *now,
1437 const char *last_str,
1438 unsigned long int class256_bit,
1439 unsigned long int class_bit, int base,
1440 int ignore_content, int handle_digits, int step)
1442 const char *nowstr = now->val.str.startmb;
1443 char tmp[now->val.str.lenmb + 1];
1444 const char *cp;
1445 char *endp;
1446 unsigned long int from;
1447 unsigned long int to;
1449 /* We have to compute the ellipsis values using the symbolic names. */
1450 assert (last_str != NULL);
1452 if (strlen (last_str) != now->val.str.lenmb)
1454 invalid_range:
1455 lr_error (ldfile,
1456 _("`%s' and `%.*s' are no valid names for symbolic range"),
1457 last_str, (int) now->val.str.lenmb, nowstr);
1458 return;
1461 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1462 /* Nothing to do, the names are the same. */
1463 return;
1465 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1468 errno = 0;
1469 from = strtoul (cp, &endp, base);
1470 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1471 goto invalid_range;
1473 to = strtoul (nowstr + (cp - last_str), &endp, base);
1474 if ((to == UINT_MAX && errno == ERANGE)
1475 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1476 goto invalid_range;
1478 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1479 if (!ignore_content)
1481 now->val.str.startmb = tmp;
1482 while ((from += step) <= to)
1484 struct charseq *seq;
1485 uint32_t wch;
1487 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1488 (int) (cp - last_str), last_str,
1489 (int) (now->val.str.lenmb - (cp - last_str)),
1490 from);
1492 get_character (now, charmap, repertoire, &seq, &wch);
1494 if (seq != NULL && seq->nbytes == 1)
1495 /* Yep, we can store information about this byte sequence. */
1496 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1498 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1499 /* We have the UCS4 position. */
1500 *find_idx (ctype, &ctype->class_collection,
1501 &ctype->class_collection_max,
1502 &ctype->class_collection_act, wch) |= class_bit;
1504 if (handle_digits == 1)
1506 /* We must store the digit values. */
1507 if (ctype->mbdigits_act == ctype->mbdigits_max)
1509 ctype->mbdigits_max *= 2;
1510 ctype->mbdigits = xrealloc (ctype->mbdigits,
1511 (ctype->mbdigits_max
1512 * sizeof (char *)));
1513 ctype->wcdigits_max *= 2;
1514 ctype->wcdigits = xrealloc (ctype->wcdigits,
1515 (ctype->wcdigits_max
1516 * sizeof (uint32_t)));
1519 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1520 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1522 else if (handle_digits == 2)
1524 /* We must store the digit values. */
1525 if (ctype->outdigits_act >= 10)
1527 lr_error (ldfile, _("\
1528 %s: field `%s' does not contain exactly ten entries"),
1529 "LC_CTYPE", "outdigit");
1530 return;
1533 ctype->mboutdigits[ctype->outdigits_act] = seq;
1534 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1535 ++ctype->outdigits_act;
1542 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1543 static void
1544 charclass_ucs4_ellipsis (struct linereader *ldfile,
1545 struct locale_ctype_t *ctype,
1546 const struct charmap_t *charmap,
1547 struct repertoire_t *repertoire,
1548 struct token *now, uint32_t last_wch,
1549 unsigned long int class256_bit,
1550 unsigned long int class_bit, int ignore_content,
1551 int handle_digits, int step)
1553 if (last_wch > now->val.ucs4)
1555 lr_error (ldfile, _("\
1556 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1557 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1558 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1559 return;
1562 if (!ignore_content)
1563 while ((last_wch += step) <= now->val.ucs4)
1565 /* We have to find out whether there is a byte sequence corresponding
1566 to this UCS4 value. */
1567 struct charseq *seq;
1568 char utmp[10];
1570 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1571 seq = charmap_find_value (charmap, utmp, 9);
1572 if (seq == NULL)
1574 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1575 seq = charmap_find_value (charmap, utmp, 5);
1578 if (seq == NULL)
1579 /* Try looking in the repertoire map. */
1580 seq = repertoire_find_seq (repertoire, last_wch);
1582 /* If this is the first time we look for this sequence create a new
1583 entry. */
1584 if (seq == NULL)
1586 static const struct charseq negative
1587 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1589 /* Find the symbolic name for this UCS4 value. */
1590 if (repertoire != NULL)
1592 const char *symbol = repertoire_find_symbol (repertoire,
1593 last_wch);
1594 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1595 sizeof (uint32_t));
1596 *newp = last_wch;
1598 if (symbol != NULL)
1599 /* We have a name, now search the multibyte value. */
1600 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1602 if (seq == NULL)
1603 /* We have to create a fake entry. */
1604 seq = (struct charseq *) &negative;
1605 else
1606 seq->ucs4 = last_wch;
1608 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1609 seq);
1611 else
1612 /* We have to create a fake entry. */
1613 seq = (struct charseq *) &negative;
1616 /* We have a name, now search the multibyte value. */
1617 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1618 /* Yep, we can store information about this byte sequence. */
1619 ctype->class256_collection[(size_t) seq->bytes[0]]
1620 |= class256_bit;
1622 /* And of course we have the UCS4 position. */
1623 if (class_bit != 0)
1624 *find_idx (ctype, &ctype->class_collection,
1625 &ctype->class_collection_max,
1626 &ctype->class_collection_act, last_wch) |= class_bit;
1628 if (handle_digits == 1)
1630 /* We must store the digit values. */
1631 if (ctype->mbdigits_act == ctype->mbdigits_max)
1633 ctype->mbdigits_max *= 2;
1634 ctype->mbdigits = xrealloc (ctype->mbdigits,
1635 (ctype->mbdigits_max
1636 * sizeof (char *)));
1637 ctype->wcdigits_max *= 2;
1638 ctype->wcdigits = xrealloc (ctype->wcdigits,
1639 (ctype->wcdigits_max
1640 * sizeof (uint32_t)));
1643 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1644 ? seq : NULL);
1645 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1647 else if (handle_digits == 2)
1649 /* We must store the digit values. */
1650 if (ctype->outdigits_act >= 10)
1652 lr_error (ldfile, _("\
1653 %s: field `%s' does not contain exactly ten entries"),
1654 "LC_CTYPE", "outdigit");
1655 return;
1658 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1659 ? seq : NULL);
1660 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1661 ++ctype->outdigits_act;
1667 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1668 static void
1669 charclass_charcode_ellipsis (struct linereader *ldfile,
1670 struct locale_ctype_t *ctype,
1671 const struct charmap_t *charmap,
1672 struct repertoire_t *repertoire,
1673 struct token *now, char *last_charcode,
1674 uint32_t last_charcode_len,
1675 unsigned long int class256_bit,
1676 unsigned long int class_bit, int ignore_content,
1677 int handle_digits)
1679 /* First check whether the to-value is larger. */
1680 if (now->val.charcode.nbytes != last_charcode_len)
1682 lr_error (ldfile, _("\
1683 start and end character sequence of range must have the same length"));
1684 return;
1687 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1689 lr_error (ldfile, _("\
1690 to-value character sequence is smaller than from-value sequence"));
1691 return;
1694 if (!ignore_content)
1698 /* Increment the byte sequence value. */
1699 struct charseq *seq;
1700 uint32_t wch;
1701 int i;
1703 for (i = last_charcode_len - 1; i >= 0; --i)
1704 if (++last_charcode[i] != 0)
1705 break;
1707 if (last_charcode_len == 1)
1708 /* Of course we have the charcode value. */
1709 ctype->class256_collection[(size_t) last_charcode[0]]
1710 |= class256_bit;
1712 /* Find the symbolic name. */
1713 seq = charmap_find_symbol (charmap, last_charcode,
1714 last_charcode_len);
1715 if (seq != NULL)
1717 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1718 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1719 strlen (seq->name));
1720 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1722 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1723 *find_idx (ctype, &ctype->class_collection,
1724 &ctype->class_collection_max,
1725 &ctype->class_collection_act, wch) |= class_bit;
1727 else
1728 wch = ILLEGAL_CHAR_VALUE;
1730 if (handle_digits == 1)
1732 /* We must store the digit values. */
1733 if (ctype->mbdigits_act == ctype->mbdigits_max)
1735 ctype->mbdigits_max *= 2;
1736 ctype->mbdigits = xrealloc (ctype->mbdigits,
1737 (ctype->mbdigits_max
1738 * sizeof (char *)));
1739 ctype->wcdigits_max *= 2;
1740 ctype->wcdigits = xrealloc (ctype->wcdigits,
1741 (ctype->wcdigits_max
1742 * sizeof (uint32_t)));
1745 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1746 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1747 seq->nbytes = last_charcode_len;
1749 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1750 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1752 else if (handle_digits == 2)
1754 struct charseq *seq;
1755 /* We must store the digit values. */
1756 if (ctype->outdigits_act >= 10)
1758 lr_error (ldfile, _("\
1759 %s: field `%s' does not contain exactly ten entries"),
1760 "LC_CTYPE", "outdigit");
1761 return;
1764 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1765 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1766 seq->nbytes = last_charcode_len;
1768 ctype->mboutdigits[ctype->outdigits_act] = seq;
1769 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1770 ++ctype->outdigits_act;
1773 while (memcmp (last_charcode, now->val.charcode.bytes,
1774 last_charcode_len) != 0);
1779 static uint32_t *
1780 find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1781 uint32_t wch)
1783 struct translit_t *trunp = ctype->translit;
1784 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1786 while (trunp != NULL)
1788 /* XXX We simplify things here. The transliterations we look
1789 for are only allowed to have one character. */
1790 if (trunp->from[0] == wch && trunp->from[1] == 0)
1792 /* Found it. Now look for a transliteration which can be
1793 represented with the character set. */
1794 struct translit_to_t *torunp = trunp->to;
1796 while (torunp != NULL)
1798 int i;
1800 for (i = 0; torunp->str[i] != 0; ++i)
1802 char utmp[10];
1804 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1805 if (charmap_find_value (charmap, utmp, 9) == NULL)
1806 /* This character cannot be represented. */
1807 break;
1810 if (torunp->str[i] == 0)
1811 return torunp->str;
1813 torunp = torunp->next;
1816 break;
1819 trunp = trunp->next;
1822 /* Check for ignored chars. */
1823 while (tirunp != NULL)
1825 if (tirunp->from <= wch && tirunp->to >= wch)
1827 uint32_t wi;
1829 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1830 if (wi == wch)
1831 return (uint32_t []) { 0 };
1835 /* Nothing found. */
1836 return NULL;
1840 uint32_t *
1841 find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1842 uint32_t wch)
1844 struct locale_ctype_t *ctype;
1845 uint32_t *result = NULL;
1847 assert (locale != NULL);
1848 ctype = locale->categories[LC_CTYPE].ctype;
1850 if (ctype->translit != NULL)
1851 result = find_translit2 (ctype, charmap, wch);
1853 if (result == NULL)
1855 struct translit_include_t *irunp = ctype->translit_include;
1857 while (irunp != NULL && result == NULL)
1859 result = find_translit (find_locale (CTYPE_LOCALE,
1860 irunp->copy_locale,
1861 irunp->copy_repertoire,
1862 charmap),
1863 charmap, wch);
1864 irunp = irunp->next;
1868 return result;
1872 /* Read one transliteration entry. */
1873 static uint32_t *
1874 read_widestring (struct linereader *ldfile, struct token *now,
1875 const struct charmap_t *charmap,
1876 struct repertoire_t *repertoire)
1878 uint32_t *wstr;
1880 if (now->tok == tok_default_missing)
1881 /* The special name "" will denote this case. */
1882 wstr = ((uint32_t *) { 0 });
1883 else if (now->tok == tok_bsymbol)
1885 /* Get the value from the repertoire. */
1886 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1887 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1888 now->val.str.lenmb);
1889 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1891 /* We cannot proceed, we don't know the UCS4 value. */
1892 free (wstr);
1893 return NULL;
1896 wstr[1] = 0;
1898 else if (now->tok == tok_ucs4)
1900 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1901 wstr[0] = now->val.ucs4;
1902 wstr[1] = 0;
1904 else if (now->tok == tok_charcode)
1906 /* Argh, we have to convert to the symbol name first and then to the
1907 UCS4 value. */
1908 struct charseq *seq = charmap_find_symbol (charmap,
1909 now->val.str.startmb,
1910 now->val.str.lenmb);
1911 if (seq == NULL)
1912 /* Cannot find the UCS4 value. */
1913 return NULL;
1915 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1916 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1917 strlen (seq->name));
1918 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1919 /* We cannot proceed, we don't know the UCS4 value. */
1920 return NULL;
1922 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1923 wstr[0] = seq->ucs4;
1924 wstr[1] = 0;
1926 else if (now->tok == tok_string)
1928 wstr = now->val.str.startwc;
1929 if (wstr == NULL || wstr[0] == 0)
1930 return NULL;
1932 else
1934 if (now->tok != tok_eol && now->tok != tok_eof)
1935 lr_ignore_rest (ldfile, 0);
1936 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1937 return (uint32_t *) -1l;
1940 return wstr;
1944 static void
1945 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1946 struct token *now, const struct charmap_t *charmap,
1947 struct repertoire_t *repertoire)
1949 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1950 struct translit_t *result;
1951 struct translit_to_t **top;
1952 struct obstack *ob = &ctype->mempool;
1953 int first;
1954 int ignore;
1956 if (from_wstr == NULL)
1957 /* There is no valid from string. */
1958 return;
1960 result = (struct translit_t *) obstack_alloc (ob,
1961 sizeof (struct translit_t));
1962 result->from = from_wstr;
1963 result->fname = ldfile->fname;
1964 result->lineno = ldfile->lineno;
1965 result->next = NULL;
1966 result->to = NULL;
1967 top = &result->to;
1968 first = 1;
1969 ignore = 0;
1971 while (1)
1973 uint32_t *to_wstr;
1975 /* Next we have one or more transliterations. They are
1976 separated by semicolons. */
1977 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1979 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1981 /* One string read. */
1982 const uint32_t zero = 0;
1984 if (!ignore)
1986 obstack_grow (ob, &zero, 4);
1987 to_wstr = obstack_finish (ob);
1989 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1990 (*top)->str = to_wstr;
1991 (*top)->next = NULL;
1994 if (now->tok == tok_eol)
1996 result->next = ctype->translit;
1997 ctype->translit = result;
1998 return;
2001 if (!ignore)
2002 top = &(*top)->next;
2003 ignore = 0;
2005 else
2007 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
2008 if (to_wstr == (uint32_t *) -1l)
2010 /* An error occurred. */
2011 obstack_free (ob, result);
2012 return;
2015 if (to_wstr == NULL)
2016 ignore = 1;
2017 else
2018 /* This value is usable. */
2019 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
2021 first = 0;
2027 static void
2028 read_translit_ignore_entry (struct linereader *ldfile,
2029 struct locale_ctype_t *ctype,
2030 const struct charmap_t *charmap,
2031 struct repertoire_t *repertoire)
2033 /* We expect a semicolon-separated list of characters we ignore. We are
2034 only interested in the wide character definitions. These must be
2035 single characters, possibly defining a range when an ellipsis is used. */
2036 while (1)
2038 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
2039 verbose);
2040 struct translit_ignore_t *newp;
2041 uint32_t from;
2043 if (now->tok == tok_eol || now->tok == tok_eof)
2045 lr_error (ldfile,
2046 _("premature end of `translit_ignore' definition"));
2047 return;
2050 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2052 lr_error (ldfile, _("syntax error"));
2053 lr_ignore_rest (ldfile, 0);
2054 return;
2057 if (now->tok == tok_ucs4)
2058 from = now->val.ucs4;
2059 else
2060 /* Try to get the value. */
2061 from = repertoire_find_value (repertoire, now->val.str.startmb,
2062 now->val.str.lenmb);
2064 if (from == ILLEGAL_CHAR_VALUE)
2066 lr_error (ldfile, "invalid character name");
2067 newp = NULL;
2069 else
2071 newp = (struct translit_ignore_t *)
2072 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
2073 newp->from = from;
2074 newp->to = from;
2075 newp->step = 1;
2077 newp->next = ctype->translit_ignore;
2078 ctype->translit_ignore = newp;
2081 /* Now we expect either a semicolon, an ellipsis, or the end of the
2082 line. */
2083 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2085 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
2087 /* XXX Should we bother implementing `....'? `...' certainly
2088 will not be implemented. */
2089 uint32_t to;
2090 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2092 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2094 if (now->tok == tok_eol || now->tok == tok_eof)
2096 lr_error (ldfile,
2097 _("premature end of `translit_ignore' definition"));
2098 return;
2101 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2103 lr_error (ldfile, _("syntax error"));
2104 lr_ignore_rest (ldfile, 0);
2105 return;
2108 if (now->tok == tok_ucs4)
2109 to = now->val.ucs4;
2110 else
2111 /* Try to get the value. */
2112 to = repertoire_find_value (repertoire, now->val.str.startmb,
2113 now->val.str.lenmb);
2115 if (to == ILLEGAL_CHAR_VALUE)
2116 lr_error (ldfile, "invalid character name");
2117 else
2119 /* Make sure the `to'-value is larger. */
2120 if (to >= from)
2122 newp->to = to;
2123 newp->step = step;
2125 else
2126 lr_error (ldfile, _("\
2127 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2128 (to | from) < 65536 ? 4 : 8, to,
2129 (to | from) < 65536 ? 4 : 8, from);
2132 /* And the next token. */
2133 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2136 if (now->tok == tok_eol || now->tok == tok_eof)
2137 /* We are done. */
2138 return;
2140 if (now->tok == tok_semicolon)
2141 /* Next round. */
2142 continue;
2144 /* If we come here something is wrong. */
2145 lr_error (ldfile, _("syntax error"));
2146 lr_ignore_rest (ldfile, 0);
2147 return;
2152 /* The parser for the LC_CTYPE section of the locale definition. */
2153 void
2154 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2155 const struct charmap_t *charmap, const char *repertoire_name,
2156 int ignore_content)
2158 struct repertoire_t *repertoire = NULL;
2159 struct locale_ctype_t *ctype;
2160 struct token *now;
2161 enum token_t nowtok;
2162 size_t cnt;
2163 struct charseq *last_seq;
2164 uint32_t last_wch = 0;
2165 enum token_t last_token;
2166 enum token_t ellipsis_token;
2167 int step;
2168 char last_charcode[16];
2169 size_t last_charcode_len = 0;
2170 const char *last_str = NULL;
2171 int mapidx;
2172 struct localedef_t *copy_locale = NULL;
2174 /* Get the repertoire we have to use. */
2175 if (repertoire_name != NULL)
2176 repertoire = repertoire_read (repertoire_name);
2178 /* The rest of the line containing `LC_CTYPE' must be free. */
2179 lr_ignore_rest (ldfile, 1);
2184 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2185 nowtok = now->tok;
2187 while (nowtok == tok_eol);
2189 /* If we see `copy' now we are almost done. */
2190 if (nowtok == tok_copy)
2192 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2193 if (now->tok != tok_string)
2195 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2197 skip_category:
2199 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2200 while (now->tok != tok_eof && now->tok != tok_end);
2202 if (now->tok != tok_eof
2203 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2204 now->tok == tok_eof))
2205 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2206 else if (now->tok != tok_lc_ctype)
2208 lr_error (ldfile, _("\
2209 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2210 lr_ignore_rest (ldfile, 0);
2212 else
2213 lr_ignore_rest (ldfile, 1);
2215 return;
2218 if (! ignore_content)
2220 /* Get the locale definition. */
2221 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2222 repertoire_name, charmap, NULL);
2223 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2225 /* Not yet loaded. So do it now. */
2226 if (locfile_read (copy_locale, charmap) != 0)
2227 goto skip_category;
2231 lr_ignore_rest (ldfile, 1);
2233 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2234 nowtok = now->tok;
2237 /* Prepare the data structures. */
2238 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2239 ctype = result->categories[LC_CTYPE].ctype;
2241 /* Remember the repertoire we use. */
2242 if (!ignore_content)
2243 ctype->repertoire = repertoire;
2245 while (1)
2247 unsigned long int class_bit = 0;
2248 unsigned long int class256_bit = 0;
2249 int handle_digits = 0;
2251 /* Of course we don't proceed beyond the end of file. */
2252 if (nowtok == tok_eof)
2253 break;
2255 /* Ignore empty lines. */
2256 if (nowtok == tok_eol)
2258 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2259 nowtok = now->tok;
2260 continue;
2263 switch (nowtok)
2265 case tok_charclass:
2266 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2267 while (now->tok == tok_ident || now->tok == tok_string)
2269 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2270 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2271 if (now->tok != tok_semicolon)
2272 break;
2273 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2275 if (now->tok != tok_eol)
2276 SYNTAX_ERROR (_("\
2277 %s: syntax error in definition of new character class"), "LC_CTYPE");
2278 break;
2280 case tok_charconv:
2281 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2282 while (now->tok == tok_ident || now->tok == tok_string)
2284 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2285 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2286 if (now->tok != tok_semicolon)
2287 break;
2288 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2290 if (now->tok != tok_eol)
2291 SYNTAX_ERROR (_("\
2292 %s: syntax error in definition of new character map"), "LC_CTYPE");
2293 break;
2295 case tok_class:
2296 /* Ignore the rest of the line if we don't need the input of
2297 this line. */
2298 if (ignore_content)
2300 lr_ignore_rest (ldfile, 0);
2301 break;
2304 /* We simply forget the `class' keyword and use the following
2305 operand to determine the bit. */
2306 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2307 if (now->tok == tok_ident || now->tok == tok_string)
2309 /* This can be one of the already known class names. */
2310 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2311 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2312 break;
2313 if (cnt >= ctype->nr_charclass)
2315 #ifdef PREDEFINED_CLASSES
2316 if (now->val.str.lenmb == 8
2317 && memcmp ("special1", now->val.str.startmb, 8) == 0)
2318 class_bit = _ISwspecial1;
2319 else if (now->val.str.lenmb == 8
2320 && memcmp ("special2", now->val.str.startmb, 8) == 0)
2321 class_bit = _ISwspecial2;
2322 else if (now->val.str.lenmb == 8
2323 && memcmp ("special3", now->val.str.startmb, 8) == 0)
2324 class_bit = _ISwspecial3;
2325 else
2326 #endif
2328 /* OK, it's a new class. */
2329 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2331 class_bit = _ISwbit (ctype->nr_charclass - 1);
2334 else
2336 class_bit = _ISwbit (cnt);
2338 free (now->val.str.startmb);
2341 else if (now->tok == tok_digit)
2342 goto handle_tok_digit;
2343 else if (now->tok < tok_upper || now->tok > tok_blank)
2344 goto err_label;
2345 else
2347 class_bit = BITw (now->tok);
2348 class256_bit = BIT (now->tok);
2351 /* The next character must be a semicolon. */
2352 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2353 if (now->tok != tok_semicolon)
2354 goto err_label;
2355 goto read_charclass;
2357 case tok_upper:
2358 case tok_lower:
2359 case tok_alpha:
2360 case tok_alnum:
2361 case tok_space:
2362 case tok_cntrl:
2363 case tok_punct:
2364 case tok_graph:
2365 case tok_print:
2366 case tok_xdigit:
2367 case tok_blank:
2368 /* Ignore the rest of the line if we don't need the input of
2369 this line. */
2370 if (ignore_content)
2372 lr_ignore_rest (ldfile, 0);
2373 break;
2376 class_bit = BITw (now->tok);
2377 class256_bit = BIT (now->tok);
2378 handle_digits = 0;
2379 read_charclass:
2380 ctype->class_done |= class_bit;
2381 last_token = tok_none;
2382 ellipsis_token = tok_none;
2383 step = 1;
2384 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2385 while (now->tok != tok_eol && now->tok != tok_eof)
2387 uint32_t wch;
2388 struct charseq *seq;
2390 if (ellipsis_token == tok_none)
2392 if (get_character (now, charmap, repertoire, &seq, &wch))
2393 goto err_label;
2395 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2396 /* Yep, we can store information about this byte
2397 sequence. */
2398 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2400 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2401 && class_bit != 0)
2402 /* We have the UCS4 position. */
2403 *find_idx (ctype, &ctype->class_collection,
2404 &ctype->class_collection_max,
2405 &ctype->class_collection_act, wch) |= class_bit;
2407 last_token = now->tok;
2408 /* Terminate the string. */
2409 if (last_token == tok_bsymbol)
2411 now->val.str.startmb[now->val.str.lenmb] = '\0';
2412 last_str = now->val.str.startmb;
2414 else
2415 last_str = NULL;
2416 last_seq = seq;
2417 last_wch = wch;
2418 memcpy (last_charcode, now->val.charcode.bytes, 16);
2419 last_charcode_len = now->val.charcode.nbytes;
2421 if (!ignore_content && handle_digits == 1)
2423 /* We must store the digit values. */
2424 if (ctype->mbdigits_act == ctype->mbdigits_max)
2426 ctype->mbdigits_max += 10;
2427 ctype->mbdigits = xrealloc (ctype->mbdigits,
2428 (ctype->mbdigits_max
2429 * sizeof (char *)));
2430 ctype->wcdigits_max += 10;
2431 ctype->wcdigits = xrealloc (ctype->wcdigits,
2432 (ctype->wcdigits_max
2433 * sizeof (uint32_t)));
2436 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2437 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2439 else if (!ignore_content && handle_digits == 2)
2441 /* We must store the digit values. */
2442 if (ctype->outdigits_act >= 10)
2444 lr_error (ldfile, _("\
2445 %s: field `%s' does not contain exactly ten entries"),
2446 "LC_CTYPE", "outdigit");
2447 lr_ignore_rest (ldfile, 0);
2448 break;
2451 ctype->mboutdigits[ctype->outdigits_act] = seq;
2452 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2453 ++ctype->outdigits_act;
2456 else
2458 /* Now it gets complicated. We have to resolve the
2459 ellipsis problem. First we must distinguish between
2460 the different kind of ellipsis and this must match the
2461 tokens we have seen. */
2462 assert (last_token != tok_none);
2464 if (last_token != now->tok)
2466 lr_error (ldfile, _("\
2467 ellipsis range must be marked by two operands of same type"));
2468 lr_ignore_rest (ldfile, 0);
2469 break;
2472 if (last_token == tok_bsymbol)
2474 if (ellipsis_token == tok_ellipsis3)
2475 lr_error (ldfile, _("with symbolic name range values \
2476 the absolute ellipsis `...' must not be used"));
2478 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2479 repertoire, now, last_str,
2480 class256_bit, class_bit,
2481 (ellipsis_token
2482 == tok_ellipsis4
2483 ? 10 : 16),
2484 ignore_content,
2485 handle_digits, step);
2487 else if (last_token == tok_ucs4)
2489 if (ellipsis_token != tok_ellipsis2)
2490 lr_error (ldfile, _("\
2491 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2493 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2494 repertoire, now, last_wch,
2495 class256_bit, class_bit,
2496 ignore_content, handle_digits,
2497 step);
2499 else
2501 assert (last_token == tok_charcode);
2503 if (ellipsis_token != tok_ellipsis3)
2504 lr_error (ldfile, _("\
2505 with character code range values one must use the absolute ellipsis `...'"));
2507 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2508 repertoire, now,
2509 last_charcode,
2510 last_charcode_len,
2511 class256_bit, class_bit,
2512 ignore_content,
2513 handle_digits);
2516 /* Now we have used the last value. */
2517 last_token = tok_none;
2520 /* Next we expect a semicolon or the end of the line. */
2521 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2522 if (now->tok == tok_eol || now->tok == tok_eof)
2523 break;
2525 if (last_token != tok_none
2526 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2528 if (now->tok == tok_ellipsis2_2)
2530 now->tok = tok_ellipsis2;
2531 step = 2;
2533 else if (now->tok == tok_ellipsis4_2)
2535 now->tok = tok_ellipsis4;
2536 step = 2;
2539 ellipsis_token = now->tok;
2541 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2542 continue;
2545 if (now->tok != tok_semicolon)
2546 goto err_label;
2548 /* And get the next character. */
2549 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2551 ellipsis_token = tok_none;
2552 step = 1;
2554 break;
2556 case tok_digit:
2557 /* Ignore the rest of the line if we don't need the input of
2558 this line. */
2559 if (ignore_content)
2561 lr_ignore_rest (ldfile, 0);
2562 break;
2565 handle_tok_digit:
2566 class_bit = _ISwdigit;
2567 class256_bit = _ISdigit;
2568 handle_digits = 1;
2569 goto read_charclass;
2571 case tok_outdigit:
2572 /* Ignore the rest of the line if we don't need the input of
2573 this line. */
2574 if (ignore_content)
2576 lr_ignore_rest (ldfile, 0);
2577 break;
2580 if (ctype->outdigits_act != 0)
2581 lr_error (ldfile, _("\
2582 %s: field `%s' declared more than once"),
2583 "LC_CTYPE", "outdigit");
2584 class_bit = 0;
2585 class256_bit = 0;
2586 handle_digits = 2;
2587 goto read_charclass;
2589 case tok_toupper:
2590 /* Ignore the rest of the line if we don't need the input of
2591 this line. */
2592 if (ignore_content)
2594 lr_ignore_rest (ldfile, 0);
2595 break;
2598 mapidx = 0;
2599 goto read_mapping;
2601 case tok_tolower:
2602 /* Ignore the rest of the line if we don't need the input of
2603 this line. */
2604 if (ignore_content)
2606 lr_ignore_rest (ldfile, 0);
2607 break;
2610 mapidx = 1;
2611 goto read_mapping;
2613 case tok_map:
2614 /* Ignore the rest of the line if we don't need the input of
2615 this line. */
2616 if (ignore_content)
2618 lr_ignore_rest (ldfile, 0);
2619 break;
2622 /* We simply forget the `map' keyword and use the following
2623 operand to determine the mapping. */
2624 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2625 if (now->tok == tok_ident || now->tok == tok_string)
2627 size_t cnt;
2629 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2630 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2631 break;
2633 if (cnt < ctype->map_collection_nr)
2634 free (now->val.str.startmb);
2635 else
2636 /* OK, it's a new map. */
2637 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2639 mapidx = cnt;
2641 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2642 goto err_label;
2643 else
2644 mapidx = now->tok - tok_toupper;
2646 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2647 /* This better should be a semicolon. */
2648 if (now->tok != tok_semicolon)
2649 goto err_label;
2651 read_mapping:
2652 /* Test whether this mapping was already defined. */
2653 if (ctype->tomap_done[mapidx])
2655 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2656 ctype->mapnames[mapidx]);
2657 lr_ignore_rest (ldfile, 0);
2658 break;
2660 ctype->tomap_done[mapidx] = 1;
2662 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2663 while (now->tok != tok_eol && now->tok != tok_eof)
2665 struct charseq *from_seq;
2666 uint32_t from_wch;
2667 struct charseq *to_seq;
2668 uint32_t to_wch;
2670 /* Every pair starts with an opening brace. */
2671 if (now->tok != tok_open_brace)
2672 goto err_label;
2674 /* Next comes the from-value. */
2675 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2676 if (get_character (now, charmap, repertoire, &from_seq,
2677 &from_wch) != 0)
2678 goto err_label;
2680 /* The next is a comma. */
2681 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2682 if (now->tok != tok_comma)
2683 goto err_label;
2685 /* And the other value. */
2686 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2687 if (get_character (now, charmap, repertoire, &to_seq,
2688 &to_wch) != 0)
2689 goto err_label;
2691 /* And the last thing is the closing brace. */
2692 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2693 if (now->tok != tok_close_brace)
2694 goto err_label;
2696 if (!ignore_content)
2698 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2699 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2700 /* We can use this value. */
2701 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2702 = to_seq->bytes[0];
2704 if (from_wch != ILLEGAL_CHAR_VALUE
2705 && to_wch != ILLEGAL_CHAR_VALUE)
2706 /* Both correct values. */
2707 *find_idx (ctype, &ctype->map_collection[mapidx],
2708 &ctype->map_collection_max[mapidx],
2709 &ctype->map_collection_act[mapidx],
2710 from_wch) = to_wch;
2713 /* Now comes a semicolon or the end of the line/file. */
2714 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2715 if (now->tok == tok_semicolon)
2716 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2718 break;
2720 case tok_translit_start:
2721 /* Ignore the entire translit section with its peculiar syntax
2722 if we don't need the input. */
2723 if (ignore_content)
2727 lr_ignore_rest (ldfile, 0);
2728 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2730 while (now->tok != tok_translit_end && now->tok != tok_eof);
2732 if (now->tok == tok_eof)
2733 lr_error (ldfile, _(\
2734 "%s: `translit_start' section does not end with `translit_end'"),
2735 "LC_CTYPE");
2737 break;
2740 /* The rest of the line better should be empty. */
2741 lr_ignore_rest (ldfile, 1);
2743 /* We count here the number of allocated entries in the `translit'
2744 array. */
2745 cnt = 0;
2747 ldfile->translate_strings = 1;
2748 ldfile->return_widestr = 1;
2750 /* We proceed until we see the `translit_end' token. */
2751 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2752 now->tok != tok_translit_end && now->tok != tok_eof)
2754 if (now->tok == tok_eol)
2755 /* Ignore empty lines. */
2756 continue;
2758 if (now->tok == tok_include)
2760 /* We have to include locale. */
2761 const char *locale_name;
2762 const char *repertoire_name;
2763 struct translit_include_t *include_stmt, **include_ptr;
2765 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2766 /* This should be a string or an identifier. In any
2767 case something to name a locale. */
2768 if (now->tok != tok_string && now->tok != tok_ident)
2770 translit_syntax:
2771 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2772 lr_ignore_rest (ldfile, 0);
2773 continue;
2775 locale_name = now->val.str.startmb;
2777 /* Next should be a semicolon. */
2778 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2779 if (now->tok != tok_semicolon)
2780 goto translit_syntax;
2782 /* Now the repertoire name. */
2783 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2784 if ((now->tok != tok_string && now->tok != tok_ident)
2785 || now->val.str.startmb == NULL)
2786 goto translit_syntax;
2787 repertoire_name = now->val.str.startmb;
2789 /* Save the include statement for later processing. */
2790 include_stmt = (struct translit_include_t *)
2791 xmalloc (sizeof (struct translit_include_t));
2792 include_stmt->copy_locale = locale_name;
2793 include_stmt->copy_repertoire = repertoire_name;
2794 include_stmt->next = NULL;
2796 include_ptr = &ctype->translit_include;
2797 while (*include_ptr != NULL)
2798 include_ptr = &(*include_ptr)->next;
2799 *include_ptr = include_stmt;
2801 /* The rest of the line must be empty. */
2802 lr_ignore_rest (ldfile, 1);
2804 /* Make sure the locale is read. */
2805 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2806 1, NULL);
2807 continue;
2809 else if (now->tok == tok_default_missing)
2811 uint32_t *wstr;
2813 while (1)
2815 /* We expect a single character or string as the
2816 argument. */
2817 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2818 wstr = read_widestring (ldfile, now, charmap,
2819 repertoire);
2821 if (wstr != NULL)
2823 if (ctype->default_missing != NULL)
2825 lr_error (ldfile, _("\
2826 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2827 error_at_line (0, 0, ctype->default_missing_file,
2828 ctype->default_missing_lineno,
2829 _("\
2830 previous definition was here"));
2832 else
2834 ctype->default_missing = wstr;
2835 ctype->default_missing_file = ldfile->fname;
2836 ctype->default_missing_lineno = ldfile->lineno;
2838 /* We can have more entries, ignore them. */
2839 lr_ignore_rest (ldfile, 0);
2840 break;
2842 else if (wstr == (uint32_t *) -1l)
2843 /* This was a syntax error. */
2844 break;
2846 /* Maybe there is another replacement we can use. */
2847 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2848 if (now->tok == tok_eol || now->tok == tok_eof)
2850 /* Nothing found. We tell the user. */
2851 lr_error (ldfile, _("\
2852 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2853 break;
2855 if (now->tok != tok_semicolon)
2856 goto translit_syntax;
2859 continue;
2861 else if (now->tok == tok_translit_ignore)
2863 read_translit_ignore_entry (ldfile, ctype, charmap,
2864 repertoire);
2865 continue;
2868 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2870 ldfile->return_widestr = 0;
2872 if (now->tok == tok_eof)
2873 lr_error (ldfile, _(\
2874 "%s: `translit_start' section does not end with `translit_end'"),
2875 "LC_CTYPE");
2877 break;
2879 case tok_ident:
2880 /* Ignore the rest of the line if we don't need the input of
2881 this line. */
2882 if (ignore_content)
2884 lr_ignore_rest (ldfile, 0);
2885 break;
2888 /* This could mean one of several things. First test whether
2889 it's a character class name. */
2890 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2891 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2892 break;
2893 if (cnt < ctype->nr_charclass)
2895 class_bit = _ISwbit (cnt);
2896 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2897 free (now->val.str.startmb);
2898 goto read_charclass;
2900 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2901 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2902 break;
2903 if (cnt < ctype->map_collection_nr)
2905 mapidx = cnt;
2906 free (now->val.str.startmb);
2907 goto read_mapping;
2909 #ifdef PREDEFINED_CLASSES
2910 if (strcmp (now->val.str.startmb, "special1") == 0)
2912 class_bit = _ISwspecial1;
2913 free (now->val.str.startmb);
2914 goto read_charclass;
2916 if (strcmp (now->val.str.startmb, "special2") == 0)
2918 class_bit = _ISwspecial2;
2919 free (now->val.str.startmb);
2920 goto read_charclass;
2922 if (strcmp (now->val.str.startmb, "special3") == 0)
2924 class_bit = _ISwspecial3;
2925 free (now->val.str.startmb);
2926 goto read_charclass;
2928 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2930 mapidx = 2;
2931 goto read_mapping;
2933 #endif
2934 break;
2936 case tok_end:
2937 /* Next we assume `LC_CTYPE'. */
2938 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2939 if (now->tok == tok_eof)
2940 break;
2941 if (now->tok == tok_eol)
2942 lr_error (ldfile, _("%s: incomplete `END' line"),
2943 "LC_CTYPE");
2944 else if (now->tok != tok_lc_ctype)
2945 lr_error (ldfile, _("\
2946 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2947 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2948 return;
2950 default:
2951 err_label:
2952 if (now->tok != tok_eof)
2953 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2956 /* Prepare for the next round. */
2957 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2958 nowtok = now->tok;
2961 /* When we come here we reached the end of the file. */
2962 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2966 static void
2967 set_class_defaults (struct locale_ctype_t *ctype,
2968 const struct charmap_t *charmap,
2969 struct repertoire_t *repertoire)
2971 size_t cnt;
2973 /* This function defines the default values for the classes and conversions
2974 according to POSIX.2 2.5.2.1.
2975 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2976 Don't move them unless you know what you do! */
2978 auto void set_default (int bitpos, int from, int to);
2980 void set_default (int bitpos, int from, int to)
2982 char tmp[2];
2983 int ch;
2984 int bit = _ISbit (bitpos);
2985 int bitw = _ISwbit (bitpos);
2986 /* Define string. */
2987 strcpy (tmp, "?");
2989 for (ch = from; ch <= to; ++ch)
2991 struct charseq *seq;
2992 tmp[0] = ch;
2994 seq = charmap_find_value (charmap, tmp, 1);
2995 if (seq == NULL)
2997 char buf[10];
2998 sprintf (buf, "U%08X", ch);
2999 seq = charmap_find_value (charmap, buf, 9);
3001 if (seq == NULL)
3003 if (!be_quiet)
3004 error (0, 0, _("\
3005 %s: character `%s' not defined in charmap while needed as default value"),
3006 "LC_CTYPE", tmp);
3008 else if (seq->nbytes != 1)
3009 error (0, 0, _("\
3010 %s: character `%s' in charmap not representable with one byte"),
3011 "LC_CTYPE", tmp);
3012 else
3013 ctype->class256_collection[seq->bytes[0]] |= bit;
3015 /* No need to search here, the ASCII value is also the Unicode
3016 value. */
3017 ELEM (ctype, class_collection, , ch) |= bitw;
3021 /* Set default values if keyword was not present. */
3022 if ((ctype->class_done & BITw (tok_upper)) == 0)
3023 /* "If this keyword [upper] is not specified, the uppercase letters
3024 `A' through `Z', ..., shall automatically belong to this class,
3025 with implementation defined character values." [P1003.2, 2.5.2.1] */
3026 set_default (BITPOS (tok_upper), 'A', 'Z');
3028 if ((ctype->class_done & BITw (tok_lower)) == 0)
3029 /* "If this keyword [lower] is not specified, the lowercase letters
3030 `a' through `z', ..., shall automatically belong to this class,
3031 with implementation defined character values." [P1003.2, 2.5.2.1] */
3032 set_default (BITPOS (tok_lower), 'a', 'z');
3034 if ((ctype->class_done & BITw (tok_alpha)) == 0)
3036 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3037 class `lower' *must* be in class `alpha'. */
3038 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
3039 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
3041 for (cnt = 0; cnt < 256; ++cnt)
3042 if ((ctype->class256_collection[cnt] & mask) != 0)
3043 ctype->class256_collection[cnt] |= BIT (tok_alpha);
3045 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3046 if ((ctype->class_collection[cnt] & maskw) != 0)
3047 ctype->class_collection[cnt] |= BITw (tok_alpha);
3050 if ((ctype->class_done & BITw (tok_digit)) == 0)
3051 /* "If this keyword [digit] is not specified, the digits `0' through
3052 `9', ..., shall automatically belong to this class, with
3053 implementation-defined character values." [P1003.2, 2.5.2.1] */
3054 set_default (BITPOS (tok_digit), '0', '9');
3056 /* "Only characters specified for the `alpha' and `digit' keyword
3057 shall be specified. Characters specified for the keyword `alpha'
3058 and `digit' are automatically included in this class. */
3060 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
3061 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
3063 for (cnt = 0; cnt < 256; ++cnt)
3064 if ((ctype->class256_collection[cnt] & mask) != 0)
3065 ctype->class256_collection[cnt] |= BIT (tok_alnum);
3067 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3068 if ((ctype->class_collection[cnt] & maskw) != 0)
3069 ctype->class_collection[cnt] |= BITw (tok_alnum);
3072 if ((ctype->class_done & BITw (tok_space)) == 0)
3073 /* "If this keyword [space] is not specified, the characters <space>,
3074 <form-feed>, <newline>, <carriage-return>, <tab>, and
3075 <vertical-tab>, ..., shall automatically belong to this class,
3076 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3078 struct charseq *seq;
3080 seq = charmap_find_value (charmap, "space", 5);
3081 if (seq == NULL)
3082 seq = charmap_find_value (charmap, "SP", 2);
3083 if (seq == NULL)
3084 seq = charmap_find_value (charmap, "U00000020", 9);
3085 if (seq == NULL)
3087 if (!be_quiet)
3088 error (0, 0, _("\
3089 %s: character `%s' not defined while needed as default value"),
3090 "LC_CTYPE", "<space>");
3092 else if (seq->nbytes != 1)
3093 error (0, 0, _("\
3094 %s: character `%s' in charmap not representable with one byte"),
3095 "LC_CTYPE", "<space>");
3096 else
3097 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3099 /* No need to search. */
3100 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
3102 seq = charmap_find_value (charmap, "form-feed", 9);
3103 if (seq == NULL)
3104 seq = charmap_find_value (charmap, "U0000000C", 9);
3105 if (seq == NULL)
3107 if (!be_quiet)
3108 error (0, 0, _("\
3109 %s: character `%s' not defined while needed as default value"),
3110 "LC_CTYPE", "<form-feed>");
3112 else if (seq->nbytes != 1)
3113 error (0, 0, _("\
3114 %s: character `%s' in charmap not representable with one byte"),
3115 "LC_CTYPE", "<form-feed>");
3116 else
3117 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3119 /* No need to search. */
3120 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3123 seq = charmap_find_value (charmap, "newline", 7);
3124 if (seq == NULL)
3125 seq = charmap_find_value (charmap, "U0000000A", 9);
3126 if (seq == NULL)
3128 if (!be_quiet)
3129 error (0, 0, _("\
3130 character `%s' not defined while needed as default value"),
3131 "<newline>");
3133 else if (seq->nbytes != 1)
3134 error (0, 0, _("\
3135 %s: character `%s' in charmap not representable with one byte"),
3136 "LC_CTYPE", "<newline>");
3137 else
3138 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3140 /* No need to search. */
3141 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3144 seq = charmap_find_value (charmap, "carriage-return", 15);
3145 if (seq == NULL)
3146 seq = charmap_find_value (charmap, "U0000000D", 9);
3147 if (seq == NULL)
3149 if (!be_quiet)
3150 error (0, 0, _("\
3151 %s: character `%s' not defined while needed as default value"),
3152 "LC_CTYPE", "<carriage-return>");
3154 else if (seq->nbytes != 1)
3155 error (0, 0, _("\
3156 %s: character `%s' in charmap not representable with one byte"),
3157 "LC_CTYPE", "<carriage-return>");
3158 else
3159 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3161 /* No need to search. */
3162 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3165 seq = charmap_find_value (charmap, "tab", 3);
3166 if (seq == NULL)
3167 seq = charmap_find_value (charmap, "U00000009", 9);
3168 if (seq == NULL)
3170 if (!be_quiet)
3171 error (0, 0, _("\
3172 %s: character `%s' not defined while needed as default value"),
3173 "LC_CTYPE", "<tab>");
3175 else if (seq->nbytes != 1)
3176 error (0, 0, _("\
3177 %s: character `%s' in charmap not representable with one byte"),
3178 "LC_CTYPE", "<tab>");
3179 else
3180 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3182 /* No need to search. */
3183 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3186 seq = charmap_find_value (charmap, "vertical-tab", 12);
3187 if (seq == NULL)
3188 seq = charmap_find_value (charmap, "U0000000B", 9);
3189 if (seq == NULL)
3191 if (!be_quiet)
3192 error (0, 0, _("\
3193 %s: character `%s' not defined while needed as default value"),
3194 "LC_CTYPE", "<vertical-tab>");
3196 else if (seq->nbytes != 1)
3197 error (0, 0, _("\
3198 %s: character `%s' in charmap not representable with one byte"),
3199 "LC_CTYPE", "<vertical-tab>");
3200 else
3201 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3203 /* No need to search. */
3204 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3207 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3208 /* "If this keyword is not specified, the digits `0' to `9', the
3209 uppercase letters `A' through `F', and the lowercase letters `a'
3210 through `f', ..., shall automatically belong to this class, with
3211 implementation defined character values." [P1003.2, 2.5.2.1] */
3213 set_default (BITPOS (tok_xdigit), '0', '9');
3214 set_default (BITPOS (tok_xdigit), 'A', 'F');
3215 set_default (BITPOS (tok_xdigit), 'a', 'f');
3218 if ((ctype->class_done & BITw (tok_blank)) == 0)
3219 /* "If this keyword [blank] is unspecified, the characters <space> and
3220 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3222 struct charseq *seq;
3224 seq = charmap_find_value (charmap, "space", 5);
3225 if (seq == NULL)
3226 seq = charmap_find_value (charmap, "SP", 2);
3227 if (seq == NULL)
3228 seq = charmap_find_value (charmap, "U00000020", 9);
3229 if (seq == NULL)
3231 if (!be_quiet)
3232 error (0, 0, _("\
3233 %s: character `%s' not defined while needed as default value"),
3234 "LC_CTYPE", "<space>");
3236 else if (seq->nbytes != 1)
3237 error (0, 0, _("\
3238 %s: character `%s' in charmap not representable with one byte"),
3239 "LC_CTYPE", "<space>");
3240 else
3241 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3243 /* No need to search. */
3244 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3247 seq = charmap_find_value (charmap, "tab", 3);
3248 if (seq == NULL)
3249 seq = charmap_find_value (charmap, "U00000009", 9);
3250 if (seq == NULL)
3252 if (!be_quiet)
3253 error (0, 0, _("\
3254 %s: character `%s' not defined while needed as default value"),
3255 "LC_CTYPE", "<tab>");
3257 else if (seq->nbytes != 1)
3258 error (0, 0, _("\
3259 %s: character `%s' in charmap not representable with one byte"),
3260 "LC_CTYPE", "<tab>");
3261 else
3262 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3264 /* No need to search. */
3265 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3268 if ((ctype->class_done & BITw (tok_graph)) == 0)
3269 /* "If this keyword [graph] is not specified, characters specified for
3270 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3271 shall belong to this character class." [P1003.2, 2.5.2.1] */
3273 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3274 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3275 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3276 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3277 BITw (tok_punct);
3278 size_t cnt;
3280 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3281 if ((ctype->class_collection[cnt] & maskw) != 0)
3282 ctype->class_collection[cnt] |= BITw (tok_graph);
3284 for (cnt = 0; cnt < 256; ++cnt)
3285 if ((ctype->class256_collection[cnt] & mask) != 0)
3286 ctype->class256_collection[cnt] |= BIT (tok_graph);
3289 if ((ctype->class_done & BITw (tok_print)) == 0)
3290 /* "If this keyword [print] is not provided, characters specified for
3291 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3292 and the <space> character shall belong to this character class."
3293 [P1003.2, 2.5.2.1] */
3295 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3296 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3297 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3298 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3299 BITw (tok_punct);
3300 size_t cnt;
3301 struct charseq *seq;
3303 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3304 if ((ctype->class_collection[cnt] & maskw) != 0)
3305 ctype->class_collection[cnt] |= BITw (tok_print);
3307 for (cnt = 0; cnt < 256; ++cnt)
3308 if ((ctype->class256_collection[cnt] & mask) != 0)
3309 ctype->class256_collection[cnt] |= BIT (tok_print);
3312 seq = charmap_find_value (charmap, "space", 5);
3313 if (seq == NULL)
3314 seq = charmap_find_value (charmap, "SP", 2);
3315 if (seq == NULL)
3316 seq = charmap_find_value (charmap, "U00000020", 9);
3317 if (seq == NULL)
3319 if (!be_quiet)
3320 error (0, 0, _("\
3321 %s: character `%s' not defined while needed as default value"),
3322 "LC_CTYPE", "<space>");
3324 else if (seq->nbytes != 1)
3325 error (0, 0, _("\
3326 %s: character `%s' in charmap not representable with one byte"),
3327 "LC_CTYPE", "<space>");
3328 else
3329 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3331 /* No need to search. */
3332 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3335 if (ctype->tomap_done[0] == 0)
3336 /* "If this keyword [toupper] is not specified, the lowercase letters
3337 `a' through `z', and their corresponding uppercase letters `A' to
3338 `Z', ..., shall automatically be included, with implementation-
3339 defined character values." [P1003.2, 2.5.2.1] */
3341 char tmp[4];
3342 int ch;
3344 strcpy (tmp, "<?>");
3346 for (ch = 'a'; ch <= 'z'; ++ch)
3348 struct charseq *seq_from, *seq_to;
3350 tmp[1] = (char) ch;
3352 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3353 if (seq_from == NULL)
3355 char buf[10];
3356 sprintf (buf, "U%08X", ch);
3357 seq_from = charmap_find_value (charmap, buf, 9);
3359 if (seq_from == NULL)
3361 if (!be_quiet)
3362 error (0, 0, _("\
3363 %s: character `%s' not defined while needed as default value"),
3364 "LC_CTYPE", tmp);
3366 else if (seq_from->nbytes != 1)
3368 if (!be_quiet)
3369 error (0, 0, _("\
3370 %s: character `%s' needed as default value not representable with one byte"),
3371 "LC_CTYPE", tmp);
3373 else
3375 /* This conversion is implementation defined. */
3376 tmp[1] = (char) (ch + ('A' - 'a'));
3377 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3378 if (seq_to == NULL)
3380 char buf[10];
3381 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3382 seq_to = charmap_find_value (charmap, buf, 9);
3384 if (seq_to == NULL)
3386 if (!be_quiet)
3387 error (0, 0, _("\
3388 %s: character `%s' not defined while needed as default value"),
3389 "LC_CTYPE", tmp);
3391 else if (seq_to->nbytes != 1)
3393 if (!be_quiet)
3394 error (0, 0, _("\
3395 %s: character `%s' needed as default value not representable with one byte"),
3396 "LC_CTYPE", tmp);
3398 else
3399 /* The index [0] is determined by the order of the
3400 `ctype_map_newP' calls in `ctype_startup'. */
3401 ctype->map256_collection[0][seq_from->bytes[0]]
3402 = seq_to->bytes[0];
3405 /* No need to search. */
3406 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3410 if (ctype->tomap_done[1] == 0)
3411 /* "If this keyword [tolower] is not specified, the mapping shall be
3412 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3414 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3415 if (ctype->map_collection[0][cnt] != 0)
3416 ELEM (ctype, map_collection, [1],
3417 ctype->map_collection[0][cnt])
3418 = ctype->charnames[cnt];
3420 for (cnt = 0; cnt < 256; ++cnt)
3421 if (ctype->map256_collection[0][cnt] != 0)
3422 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3425 if (ctype->outdigits_act != 10)
3427 if (ctype->outdigits_act != 0)
3428 error (0,0, _("%s: field `%s' does not contain exactly ten entries"),
3429 "LC_CTYPE", "outdigit");
3431 for (cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3433 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3434 digits + cnt, 1);
3436 if (ctype->mboutdigits[cnt] == NULL)
3437 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3438 longnames[cnt],
3439 strlen (longnames[cnt]));
3441 if (ctype->mboutdigits[cnt] == NULL)
3442 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3443 uninames[cnt], 9);
3445 if (ctype->mboutdigits[cnt] == NULL)
3447 /* Provide a replacement. */
3448 error (0, 0, _("\
3449 no output digits defined and none of the standard names in the charmap"));
3451 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3452 sizeof (struct charseq)
3453 + 1);
3455 /* This is better than nothing. */
3456 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3457 ctype->mboutdigits[cnt]->nbytes = 1;
3460 ctype->wcoutdigits[cnt] = L'0' + cnt;
3463 ctype->outdigits_act = 10;
3468 /* Construction of sparse 3-level tables.
3469 See wchar-lookup.h for their structure and the meaning of p and q. */
/* Builder state for one sparse 3-level bit table.  A character code is
   split into four fields: the low 5 bits pick a bit inside a 32-bit
   word, the next p bits pick a word inside a level-3 block, the next q
   bits pick an entry inside a level-2 block, and the remaining high
   bits index level 1.  */
struct wctype_table
{
  /* Field widths; must be stored before wctype_table_init is called.  */
  unsigned int p;
  unsigned int q;

  /* Level 1 while building: one entry per high-bits value, holding a
     level-2 block number or EMPTY.  `alloc' and `size' count entries.  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;

  /* Level 2 while building: blocks of (1 << q) entries, each holding a
     level-3 block number or EMPTY.  `alloc' and `size' count blocks.  */
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;

  /* Level 3 while building: blocks of (1 << p) bitmap words.  `alloc'
     and `size' count blocks.  */
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;

  /* Compressed image produced by wctype_table_finalize and its length
     in bytes.  */
  size_t result_size;
  char *result;
};
3491 /* Initialize. Assumes t->p and t->q have already been set. */
3492 static inline void
3493 wctype_table_init (struct wctype_table *t)
3495 t->level1 = NULL;
3496 t->level1_alloc = t->level1_size = 0;
3497 t->level2 = NULL;
3498 t->level2_alloc = t->level2_size = 0;
3499 t->level3 = NULL;
3500 t->level3_alloc = t->level3_size = 0;
3503 /* Retrieve an entry. */
3504 static inline int
3505 wctype_table_get (struct wctype_table *t, uint32_t wc)
3507 uint32_t index1 = wc >> (t->q + t->p + 5);
3508 if (index1 < t->level1_size)
3510 uint32_t lookup1 = t->level1[index1];
3511 if (lookup1 != EMPTY)
3513 uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1))
3514 + (lookup1 << t->q);
3515 uint32_t lookup2 = t->level2[index2];
3516 if (lookup2 != EMPTY)
3518 uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1))
3519 + (lookup2 << t->p);
3520 uint32_t lookup3 = t->level3[index3];
3521 uint32_t index4 = wc & 0x1f;
3523 return (lookup3 >> index4) & 1;
3527 return 0;
3530 /* Add one entry. */
3531 static void
3532 wctype_table_add (struct wctype_table *t, uint32_t wc)
3534 uint32_t index1 = wc >> (t->q + t->p + 5);
3535 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3536 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3537 uint32_t index4 = wc & 0x1f;
3538 size_t i, i1, i2;
3540 if (index1 >= t->level1_size)
3542 if (index1 >= t->level1_alloc)
3544 size_t alloc = 2 * t->level1_alloc;
3545 if (alloc <= index1)
3546 alloc = index1 + 1;
3547 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3548 alloc * sizeof (uint32_t));
3549 t->level1_alloc = alloc;
3551 while (index1 >= t->level1_size)
3552 t->level1[t->level1_size++] = EMPTY;
3555 if (t->level1[index1] == EMPTY)
3557 if (t->level2_size == t->level2_alloc)
3559 size_t alloc = 2 * t->level2_alloc + 1;
3560 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3561 (alloc << t->q) * sizeof (uint32_t));
3562 t->level2_alloc = alloc;
3564 i1 = t->level2_size << t->q;
3565 i2 = (t->level2_size + 1) << t->q;
3566 for (i = i1; i < i2; i++)
3567 t->level2[i] = EMPTY;
3568 t->level1[index1] = t->level2_size++;
3571 index2 += t->level1[index1] << t->q;
3573 if (t->level2[index2] == EMPTY)
3575 if (t->level3_size == t->level3_alloc)
3577 size_t alloc = 2 * t->level3_alloc + 1;
3578 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3579 (alloc << t->p) * sizeof (uint32_t));
3580 t->level3_alloc = alloc;
3582 i1 = t->level3_size << t->p;
3583 i2 = (t->level3_size + 1) << t->p;
3584 for (i = i1; i < i2; i++)
3585 t->level3[i] = 0;
3586 t->level2[index2] = t->level3_size++;
3589 index3 += t->level2[index2] << t->p;
3591 t->level3[index3] |= (uint32_t)1 << index4;
3594 /* Finalize and shrink. */
3595 static void
3596 wctype_table_finalize (struct wctype_table *t)
3598 size_t i, j, k;
3599 uint32_t reorder3[t->level3_size];
3600 uint32_t reorder2[t->level2_size];
3601 uint32_t level1_offset, level2_offset, level3_offset;
3603 /* Uniquify level3 blocks. */
3604 k = 0;
3605 for (j = 0; j < t->level3_size; j++)
3607 for (i = 0; i < k; i++)
3608 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3609 (1 << t->p) * sizeof (uint32_t)) == 0)
3610 break;
3611 /* Relocate block j to block i. */
3612 reorder3[j] = i;
3613 if (i == k)
3615 if (i != j)
3616 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3617 (1 << t->p) * sizeof (uint32_t));
3618 k++;
3621 t->level3_size = k;
3623 for (i = 0; i < (t->level2_size << t->q); i++)
3624 if (t->level2[i] != EMPTY)
3625 t->level2[i] = reorder3[t->level2[i]];
3627 /* Uniquify level2 blocks. */
3628 k = 0;
3629 for (j = 0; j < t->level2_size; j++)
3631 for (i = 0; i < k; i++)
3632 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3633 (1 << t->q) * sizeof (uint32_t)) == 0)
3634 break;
3635 /* Relocate block j to block i. */
3636 reorder2[j] = i;
3637 if (i == k)
3639 if (i != j)
3640 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3641 (1 << t->q) * sizeof (uint32_t));
3642 k++;
3645 t->level2_size = k;
3647 for (i = 0; i < t->level1_size; i++)
3648 if (t->level1[i] != EMPTY)
3649 t->level1[i] = reorder2[t->level1[i]];
3651 /* Create and fill the resulting compressed representation. */
3652 t->result_size =
3653 5 * sizeof (uint32_t)
3654 + t->level1_size * sizeof (uint32_t)
3655 + (t->level2_size << t->q) * sizeof (uint32_t)
3656 + (t->level3_size << t->p) * sizeof (uint32_t);
3657 t->result = (char *) xmalloc (t->result_size);
3659 level1_offset =
3660 5 * sizeof (uint32_t);
3661 level2_offset =
3662 5 * sizeof (uint32_t)
3663 + t->level1_size * sizeof (uint32_t);
3664 level3_offset =
3665 5 * sizeof (uint32_t)
3666 + t->level1_size * sizeof (uint32_t)
3667 + (t->level2_size << t->q) * sizeof (uint32_t);
3669 ((uint32_t *) t->result)[0] = t->q + t->p + 5;
3670 ((uint32_t *) t->result)[1] = t->level1_size;
3671 ((uint32_t *) t->result)[2] = t->p + 5;
3672 ((uint32_t *) t->result)[3] = (1 << t->q) - 1;
3673 ((uint32_t *) t->result)[4] = (1 << t->p) - 1;
3675 for (i = 0; i < t->level1_size; i++)
3676 ((uint32_t *) (t->result + level1_offset))[i] =
3677 (t->level1[i] == EMPTY
3679 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3681 for (i = 0; i < (t->level2_size << t->q); i++)
3682 ((uint32_t *) (t->result + level2_offset))[i] =
3683 (t->level2[i] == EMPTY
3685 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3687 for (i = 0; i < (t->level3_size << t->p); i++)
3688 ((uint32_t *) (t->result + level3_offset))[i] = t->level3[i];
3690 if (t->level1_alloc > 0)
3691 free (t->level1);
3692 if (t->level2_alloc > 0)
3693 free (t->level2);
3694 if (t->level3_alloc > 0)
3695 free (t->level3);
/* Parameterize "3level.h" for the character width table: the table is
   named wcwidth_table, its elements are uint8_t, and 0xff is passed as
   DEFAULT.
   NOTE(review): 3level.h is not visible here.  Judging by the uses in
   allocate_arrays it presumably provides struct wcwidth_table together
   with wcwidth_table_init, wcwidth_table_add and
   wcwidth_table_finalize -- confirm against that header.  */
3698 #define TABLE wcwidth_table
3699 #define ELEMENT uint8_t
3700 #define DEFAULT 0xff
3701 #include "3level.h"
/* Parameterize "3level.h" for the character mapping tables: the table
   is named wctrans_table, its elements are int32_t, and 0 is passed as
   DEFAULT.  While the header is included, the name wctrans_table_add
   is redirected to wctrans_table_add_internal; the macro is removed
   again right afterwards so that the wrapper defined next can use the
   plain name.
   NOTE(review): 3level.h is not visible here; it presumably defines an
   add function under the TABLE-derived name -- confirm against that
   header.  */
3703 #define TABLE wctrans_table
3704 #define ELEMENT int32_t
3705 #define DEFAULT 0
3706 #define wctrans_table_add wctrans_table_add_internal
3707 #include "3level.h"
3708 #undef wctrans_table_add
/* A wctrans_table does not hold the mapped character itself but its
   distance from the argument, so record MAPPED_WC - WC as the entry
   for WC.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  uint32_t delta = mapped_wc - wc;

  wctrans_table_add_internal (t, wc, delta);
}
3718 /* Flattens the included transliterations into a translit list.
3719 Inserts them in the list at `cursor', and returns the new cursor. */
3720 static struct translit_t **
3721 translit_flatten (struct locale_ctype_t *ctype,
3722 const struct charmap_t *charmap,
3723 struct translit_t **cursor)
3725 while (ctype->translit_include != NULL)
3727 const char *copy_locale = ctype->translit_include->copy_locale;
3728 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3729 struct localedef_t *other;
3731 /* Unchain the include statement. During the depth-first traversal
3732 we don't want to visit any locale more than once. */
3733 ctype->translit_include = ctype->translit_include->next;
3735 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3737 if (other == NULL)
3739 error (0, 0, _("\
3740 %s: transliteration data from locale `%s' not available"),
3741 "LC_CTYPE", copy_locale);
3743 else
3745 struct locale_ctype_t *other_ctype =
3746 other->categories[LC_CTYPE].ctype;
3748 cursor = translit_flatten (other_ctype, charmap, cursor);
3749 assert (other_ctype->translit_include == NULL);
3751 if (other_ctype->translit != NULL)
3753 /* Insert the other_ctype->translit list at *cursor. */
3754 struct translit_t *endp = other_ctype->translit;
3755 while (endp->next != NULL)
3756 endp = endp->next;
3758 endp->next = *cursor;
3759 *cursor = other_ctype->translit;
3761 /* Avoid any risk of circular lists. */
3762 other_ctype->translit = NULL;
3764 cursor = &endp->next;
3767 if (ctype->default_missing == NULL)
3768 ctype->default_missing = other_ctype->default_missing;
3772 return cursor;
/* Build the output tables of CTYPE from the data collected so far:
   - the byte-indexed class arrays ctype_b / ctype32_b and the per-class
     bitmaps class_b,
   - the byte-indexed mapping arrays map_b / map32_b,
   - the compressed 3-level tables for every class (class_3level),
     every map (map_3level) and the character widths (width),
   - the name-pointer arrays and offsets for classes and maps,
   - mb_cur_max, copied from CHARMAP,
   - the transliteration tables, after folding in all `include'd
     locales and ordering the entries by their `from' string.
   CHARMAP supplies byte sequences, the default width, the width rules
   and mb_cur_max.  The REPERTOIRE parameter is not referenced in the
   lines visible here; lookups use ctype->repertoire instead.  */
3775 static void
3776 allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3777 struct repertoire_t *repertoire)
3779 size_t idx, nr;
3780 const void *key;
3781 size_t len;
3782 void *vdata;
3783 void *curs;
3785 /* You wonder about this amount of memory? This is only because some
3786 users do not manage to address the array with unsigned values or
3787 data types with range >= 256. '\200' would result in the array
3788 index -128. To help these poor people we duplicate the entries for
3789 128 up to 255 below the entry for \0. */
3790 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3791 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
/* One bitmap pointer and one 3-level table descriptor per class.  */
3792 ctype->class_b = (uint32_t **)
3793 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3794 ctype->class_3level = (struct iovec *)
3795 xmalloc (ctype->nr_charclass * sizeof (struct iovec));
3797 /* This is the array accessed using the multibyte string elements. */
3798 for (idx = 0; idx < 256; ++idx)
3799 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3801 /* Mirror first 127 entries. We must take care that entry -1 is not
3802 mirrored because EOF == -1. */
3803 for (idx = 0; idx < 127; ++idx)
3804 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3806 /* The 32 bit array contains all characters < 0x100. */
3807 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3808 if (ctype->charnames[idx] < 0x100)
3809 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
/* Per class NR: a 256-bit bitmap (eight 32-bit words) in which bit IDX
   is set when byte IDX carries the class bit.  */
3811 for (nr = 0; nr < ctype->nr_charclass; nr++)
3813 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3815 for (idx = 0; idx < 256; ++idx)
3816 if (ctype->class256_collection[idx] & _ISbit (nr))
3817 ctype->class_b[nr][idx >> 5] |= (uint32_t)1 << (idx & 0x1f);
/* Per class NR: put every wide character carrying the class bit into a
   3-level table and keep the compressed image and its size.  */
3820 for (nr = 0; nr < ctype->nr_charclass; nr++)
3822 struct wctype_table t;
3824 t.p = 4; /* or: 5 */
3825 t.q = 7; /* or: 6 */
3826 wctype_table_init (&t);
3828 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3829 if (ctype->class_collection[idx] & _ISwbit (nr))
3830 wctype_table_add (&t, ctype->charnames[idx]);
3832 wctype_table_finalize (&t);
3834 if (verbose)
3835 fprintf (stderr, _("%s: table for class \"%s\": %lu bytes\n"),
3836 "LC_CTYPE", ctype->classnames[nr],
3837 (unsigned long int) t.result_size);
3839 ctype->class_3level[nr].iov_base = t.result;
3840 ctype->class_3level[nr].iov_len = t.result_size;
3843 /* Room for table of mappings. */
3844 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3845 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3846 * sizeof (uint32_t *));
3847 ctype->map_3level = (struct iovec *)
3848 xmalloc (ctype->map_collection_nr * sizeof (struct iovec));
3850 /* Fill in all mappings. */
/* Only maps 0 and 1 get a byte-indexed table with the mirrored lower
   part; every one of its 256 + 128 slots is written below.  */
3851 for (idx = 0; idx < 2; ++idx)
3853 unsigned int idx2;
3855 /* Allocate table. */
3856 ctype->map_b[idx] = (uint32_t *)
3857 xmalloc ((256 + 128) * sizeof (uint32_t));
3859 /* Copy values from collection. */
3860 for (idx2 = 0; idx2 < 256; ++idx2)
3861 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3863 /* Mirror first 127 entries. We must take care not to map entry
3864 -1 because EOF == -1. */
3865 for (idx2 = 0; idx2 < 127; ++idx2)
3866 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3868 /* EOF must map to EOF. */
3869 ctype->map_b[idx][127] = EOF;
/* NOTE(review): the loop below reads map_collection[idx][0..255] by
   position; this assumes each collection has at least 256 slots and
   that slot N stands for character N -- confirm against find_idx.  */
3872 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3874 unsigned int idx2;
3876 /* Allocate table. */
3877 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3879 /* Copy values from collection. Default is identity mapping. */
3880 for (idx2 = 0; idx2 < 256; ++idx2)
3881 ctype->map32_b[idx][idx2] =
3882 (ctype->map_collection[idx][idx2] != 0
3883 ? ctype->map_collection[idx][idx2]
3884 : idx2);
/* Per map NR: enter every character with a non-zero mapping into a
   3-level table and keep the compressed image and its size.  */
3887 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3889 struct wctrans_table t;
3891 t.p = 7;
3892 t.q = 9;
3893 wctrans_table_init (&t);
3895 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3896 if (ctype->map_collection[nr][idx] != 0)
3897 wctrans_table_add (&t, ctype->charnames[idx],
3898 ctype->map_collection[nr][idx]);
3900 wctrans_table_finalize (&t);
3902 if (verbose)
3903 fprintf (stderr, _("%s: table for map \"%s\": %lu bytes\n"),
3904 "LC_CTYPE", ctype->mapnames[nr],
3905 (unsigned long int) t.result_size);
3907 ctype->map_3level[nr].iov_base = t.result;
3908 ctype->map_3level[nr].iov_len = t.result_size;
3911 /* Extra array for class and map names. */
3912 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3913 * sizeof (uint32_t));
3914 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3915 * sizeof (uint32_t));
/* The map entries are numbered directly after the class entries.  */
3917 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3918 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3920 /* Array for width information. Because the expected widths are very
3921 small (never larger than 2) we use only one single byte. This
3922 saves space.
3923 We put only printable characters in the table. wcwidth is specified
3924 to return -1 for non-printable characters. Doing the check here
3925 saves a run-time check.
3926 But we put L'\0' in the table. This again saves a run-time check. */
3928 struct wcwidth_table t;
3930 t.p = 7;
3931 t.q = 9;
3932 wcwidth_table_init (&t);
3934 /* First set all the printable characters of the character set to
3935 the default width. */
3936 curs = NULL;
3937 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3939 struct charseq *data = (struct charseq *) vdata;
/* Resolve the UCS4 value on demand and cache it in the charmap entry.  */
3941 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3942 data->ucs4 = repertoire_find_value (ctype->repertoire,
3943 data->name, len);
3945 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3947 uint32_t *class_bits =
3948 find_idx (ctype, &ctype->class_collection, NULL,
3949 &ctype->class_collection_act, data->ucs4);
3951 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3952 wcwidth_table_add (&t, data->ucs4, charmap->width_default);
3956 /* Now add the explicitly specified widths. */
3957 if (charmap->width_rules != NULL)
3959 size_t cnt;
3961 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3963 unsigned char bytes[charmap->mb_cur_max];
3964 int nbytes = charmap->width_rules[cnt].from->nbytes;
3966 /* We have the range of character for which the width is
3967 specified described using byte sequences of the multibyte
3968 charset. We have to convert this to UCS4 now. And we
3969 cannot simply convert the beginning and the end of the
3970 sequence, we have to iterate over the byte sequence and
3971 convert it for every single character. */
3972 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
/* Continue while the current sequence is shorter than the `to'
   sequence, or compares less than or equal to it over NBYTES bytes.  */
3974 while (nbytes < charmap->width_rules[cnt].to->nbytes
3975 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3976 nbytes) <= 0)
3978 /* Find the UCS value for `bytes'. */
3979 int inner;
3980 uint32_t wch;
3981 struct charseq *seq =
3982 charmap_find_symbol (charmap, bytes, nbytes);
3984 if (seq == NULL)
3985 wch = ILLEGAL_CHAR_VALUE;
3986 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3987 wch = seq->ucs4;
3988 else
3989 wch = repertoire_find_value (ctype->repertoire, seq->name,
3990 strlen (seq->name));
3992 if (wch != ILLEGAL_CHAR_VALUE)
3994 /* Store the value. */
3995 uint32_t *class_bits =
3996 find_idx (ctype, &ctype->class_collection, NULL,
3997 &ctype->class_collection_act, wch);
3999 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
4000 wcwidth_table_add (&t, wch,
4001 charmap->width_rules[cnt].width);
4004 /* "Increment" the bytes sequence. */
/* Find the rightmost byte that is not yet 0xff; INNER becomes negative
   when every byte is 0xff.  */
4005 inner = nbytes - 1;
4006 while (inner >= 0 && bytes[inner] == 0xff)
4007 --inner;
4009 if (inner < 0)
4011 /* We have to extend the byte sequence. */
4012 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
4013 break;
/* The longer sequence starts over at 0x01 followed by zero bytes.  */
4015 bytes[0] = 1;
4016 memset (&bytes[1], 0, nbytes);
4017 ++nbytes;
4019 else
4021 ++bytes[inner];
4022 while (++inner < nbytes)
4023 bytes[inner] = 0;
4029 /* Set the width of L'\0' to 0. */
4030 wcwidth_table_add (&t, 0, 0);
4032 wcwidth_table_finalize (&t);
4034 if (verbose)
4035 fprintf (stderr, _("%s: table for width: %lu bytes\n"),
4036 "LC_CTYPE", (unsigned long int) t.result_size);
4038 ctype->width.iov_base = t.result;
4039 ctype->width.iov_len = t.result_size;
4042 /* Set MB_CUR_MAX. */
4043 ctype->mb_cur_max = charmap->mb_cur_max;
4045 /* Now determine the table for the transliteration information.
4047 XXX It is not yet clear to me whether it is worth implementing a
4048 complicated algorithm which uses a hash table to locate the entries.
4049 For now I'll use a simple array which can be searching using binary
4050 search. */
4051 if (ctype->translit_include != NULL)
4052 /* Traverse the locales mentioned in the `include' statements in a
4053 depth-first way and fold in their transliteration information. */
4054 translit_flatten (ctype, charmap, &ctype->translit);
4056 if (ctype->translit != NULL)
4058 /* First count how many entries we have. This is the upper limit
4059 since some entries from the included files might be overwritten. */
4060 size_t number = 0;
4061 size_t cnt;
4062 struct translit_t *runp = ctype->translit;
4063 struct translit_t **sorted;
4064 size_t from_len, to_len;
4066 while (runp != NULL)
4068 ++number;
4069 runp = runp->next;
4072 /* Next we allocate an array large enough and fill in the values. */
/* The pointer array is taken from the stack via alloca.  NUMBER is
   reset below and then counts the distinct `from' strings inserted.  */
4073 sorted = (struct translit_t **) alloca (number
4074 * sizeof (struct translit_t **));
4075 runp = ctype->translit;
4076 number = 0;
4079 /* Search for the place where to insert this string.
4080 XXX Better use a real sorting algorithm later. */
4081 size_t idx = 0;
4082 int replace = 0;
4084 while (idx < number)
4086 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
4087 (const wchar_t *) runp->from);
4088 if (res == 0)
4090 replace = 1;
4091 break;
4093 if (res > 0)
4094 break;
4095 ++idx;
/* An entry with an equal `from' string is overwritten by the later
   list element; otherwise the new element is inserted in order.  */
4098 if (replace)
4099 sorted[idx] = runp;
4100 else
4102 memmove (&sorted[idx + 1], &sorted[idx],
4103 (number - idx) * sizeof (struct translit_t *));
4104 sorted[idx] = runp;
4105 ++number;
4108 runp = runp->next;
4110 while (runp != NULL);
4112 /* The next step is putting all the possible transliteration
4113 strings in one memory block so that we can write it out.
4114 We need several different blocks:
4115 - index to the from-string array
4116 - from-string array
4117 - index to the to-string array
4118 - to-string array.
4120 from_len = to_len = 0;
4121 for (cnt = 0; cnt < number; ++cnt)
4123 struct translit_to_t *srunp;
4124 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4125 srunp = sorted[cnt]->to;
4126 while (srunp != NULL)
4128 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
4129 srunp = srunp->next;
4131 /* Plus one for the extra NUL character marking the end of
4132 the list for the current entry. */
4133 ++to_len;
4136 /* We can allocate the arrays for the results. */
4137 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
4138 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
4139 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
4140 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
/* Second pass: copy the strings and record, per entry, the element
   offset at which its `from' string and its `to' list begin.  */
4142 from_len = 0;
4143 to_len = 0;
4144 for (cnt = 0; cnt < number; ++cnt)
4146 size_t len;
4147 struct translit_to_t *srunp;
4149 ctype->translit_from_idx[cnt] = from_len;
4150 ctype->translit_to_idx[cnt] = to_len;
4152 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4153 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
4154 (const wchar_t *) sorted[cnt]->from, len);
4155 from_len += len;
/* NOTE(review): this repeats the assignment made a few lines above;
   to_len has not changed in between, so the value is the same.  */
4157 ctype->translit_to_idx[cnt] = to_len;
4158 srunp = sorted[cnt]->to;
4159 while (srunp != NULL)
4161 len = wcslen ((const wchar_t *) srunp->str) + 1;
4162 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
4163 (const wchar_t *) srunp->str, len);
4164 to_len += len;
4165 srunp = srunp->next;
4167 ctype->translit_to_tbl[to_len++] = L'\0';
4170 /* Store the information about the length. */
/* The index size is a count of entries; the two table sizes are in
   bytes.  */
4171 ctype->translit_idx_size = number;
4172 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
4173 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
4175 else
4177 /* Provide some dummy pointers since we have nothing to write out. */
4178 static uint32_t no_str = { 0 };
/* NOTE(review): translit_to_idx is not assigned in this branch, unlike
   the other three pointers -- confirm that it is never read when
   translit_idx_size is 0.  */
4180 ctype->translit_from_idx = &no_str;
4181 ctype->translit_from_tbl = &no_str;
4182 ctype->translit_to_tbl = &no_str;
4183 ctype->translit_idx_size = 0;
4184 ctype->translit_from_tbl_size = 0;
4185 ctype->translit_to_tbl_size = 0;