x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
[glibc.git] / locale / programs / ld-ctype.c
blobdf266c20d6bad5259b1b18deebb82de8ceba7211
/* Copyright (C) 1995-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <alloca.h>
23 #include <byteswap.h>
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <obstack.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <wchar.h>
31 #include <wctype.h>
32 #include <stdint.h>
33 #include <sys/uio.h>
35 #include "localedef.h"
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
43 #include <assert.h>
/* The bit used for representing a special class.  BITPOS turns a class
   token into its bit index relative to tok_upper; BIT and BITw wrap that
   index with _ISbit and _ISwbit respectively.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Expand to the dereferenced result of find_idx for VALUE in the table
   COLLECTION of CTYPE, so the expansion can be read or assigned to.  IDX
   is pasted textually after the member names; it is either empty or an
   array subscript selecting one of several parallel tables.  */
#define ELEM(ctype, collection, idx, value)				      \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,    \
	     &ctype->collection##_act idx, value)


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Type to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */
struct translit_to_t
{
  /* One to-string.  */
  uint32_t *str;

  /* Next alternative to-string, chained through this pointer.  */
  struct translit_to_t *next;
};
/* One transliteration rule: a from-string together with the chain of
   to-strings it may be replaced with.  */
struct translit_t
{
  uint32_t *from;

  /* Presumably the file and line where the rule was read -- the code
     filling these in is not visible here.  */
  const char *fname;
  size_t lineno;

  struct translit_to_t *to;

  struct translit_t *next;
};
/* One entry of the translit_ignore list.  ctype_output writes each entry
   as the three values FROM, TO and STEP, and ctype_finish orders the list
   by FROM.  */
struct translit_ignore_t
{
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* Presumably the file and line where the entry was read -- the code
     filling these in is not visible here.  */
  const char *fname;
  size_t lineno;

  struct translit_ignore_t *next;
};
/* Type to describe a transliteration include statement.  */
struct translit_include_t
{
  const char *copy_locale;
  const char *copy_repertoire;

  struct translit_include_t *next;
};
/* Provide some dummy pointer for empty string.  */
static uint32_t no_str[] = { 0 };
112 /* Sparse table of uint32_t. */
113 #define TABLE idx_table
114 #define ELEMENT uint32_t
115 #define DEFAULT ((uint32_t) ~0)
116 #define NO_ADD_LOCALE
117 #include "3level.h"
119 #define TABLE wcwidth_table
120 #define ELEMENT uint8_t
121 #define DEFAULT 0xff
122 #include "3level.h"
124 #define TABLE wctrans_table
125 #define ELEMENT int32_t
126 #define DEFAULT 0
127 #define wctrans_table_add wctrans_table_add_internal
128 #include "3level.h"
129 #undef wctrans_table_add
/* The wctrans_table must actually store the difference between the
   desired result and the argument.  Record in T that WC maps to
   MAPPED_WC by storing MAPPED_WC - WC.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  wctrans_table_add_internal (t, wc, mapped_wc - wc);
}
/* Construction of sparse 3-level tables.
   See wchar-lookup.h for their structure and the meaning of p and q.  */

struct wctype_table
{
  /* Parameters.  */
  unsigned int p;
  unsigned int q;
  /* Working representation.  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;
  size_t result_size;
};

static void add_locale_wctype_table (struct locale_file *file,
				     struct wctype_table *t);
162 /* The real definition of the struct for the LC_CTYPE locale. */
163 struct locale_ctype_t
165 uint32_t *charnames;
166 size_t charnames_max;
167 size_t charnames_act;
168 /* An index lookup table, to speedup find_idx. */
169 struct idx_table charnames_idx;
171 struct repertoire_t *repertoire;
173 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
174 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
175 size_t nr_charclass;
176 const char *classnames[MAX_NR_CHARCLASS];
177 uint32_t last_class_char;
178 uint32_t class256_collection[256];
179 uint32_t *class_collection;
180 size_t class_collection_max;
181 size_t class_collection_act;
182 uint32_t class_done;
183 uint32_t class_offset;
185 struct charseq **mbdigits;
186 size_t mbdigits_act;
187 size_t mbdigits_max;
188 uint32_t *wcdigits;
189 size_t wcdigits_act;
190 size_t wcdigits_max;
192 struct charseq *mboutdigits[10];
193 uint32_t wcoutdigits[10];
194 size_t outdigits_act;
196 /* If the following number ever turns out to be too small simply
197 increase it. But I doubt it will. --drepper@gnu */
198 #define MAX_NR_CHARMAP 16
199 const char *mapnames[MAX_NR_CHARMAP];
200 uint32_t *map_collection[MAX_NR_CHARMAP];
201 uint32_t map256_collection[2][256];
202 size_t map_collection_max[MAX_NR_CHARMAP];
203 size_t map_collection_act[MAX_NR_CHARMAP];
204 size_t map_collection_nr;
205 size_t last_map_idx;
206 int tomap_done[MAX_NR_CHARMAP];
207 uint32_t map_offset;
209 /* Transliteration information. */
210 struct translit_include_t *translit_include;
211 struct translit_t *translit;
212 struct translit_ignore_t *translit_ignore;
213 uint32_t ntranslit_ignore;
215 uint32_t *default_missing;
216 const char *default_missing_file;
217 size_t default_missing_lineno;
219 uint32_t to_nonascii;
220 uint32_t nonascii_case;
222 /* The arrays for the binary representation. */
223 char_class_t *ctype_b;
224 char_class32_t *ctype32_b;
225 uint32_t **map_b;
226 uint32_t **map32_b;
227 uint32_t **class_b;
228 struct wctype_table *class_3level;
229 struct wctrans_table *map_3level;
230 uint32_t *class_name_ptr;
231 uint32_t *map_name_ptr;
232 struct wcwidth_table width;
233 uint32_t mb_cur_max;
234 const char *codeset_name;
235 uint32_t *translit_from_idx;
236 uint32_t *translit_from_tbl;
237 uint32_t *translit_to_idx;
238 uint32_t *translit_to_tbl;
239 uint32_t translit_idx_size;
240 size_t translit_from_tbl_size;
241 size_t translit_to_tbl_size;
243 struct obstack mempool;
/* Marker for an empty slot.  This has the value 0xFFFFFFFF, regardless
   whether 'int' is 16 bit, 32 bit, or 64 bit.  */
#define EMPTY ((uint32_t) ~0)


/* Allocation hooks used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
			   const struct charmap_t *charmap,
			   struct localedef_t *copy_locale,
			   int ignore_content);
static void ctype_class_new (struct linereader *lr,
			     struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
			   struct locale_ctype_t *ctype,
			   const char *name, const struct charmap_t *charmap);
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
			   size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
				const struct charmap_t *charmap,
				struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
			     const struct charmap_t *charmap,
			     struct repertoire_t *repertoire);
/* Symbolic names of the ten decimal digits, indexed by digit value.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The same ten digits spelled as UCS names.  */
static const char *uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
/* The digits as single bytes, indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
289 static void
290 ctype_startup (struct linereader *lr, struct localedef_t *locale,
291 const struct charmap_t *charmap,
292 struct localedef_t *copy_locale, int ignore_content)
294 unsigned int cnt;
295 struct locale_ctype_t *ctype;
297 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
299 if (copy_locale == NULL)
301 /* Allocate the needed room. */
302 locale->categories[LC_CTYPE].ctype = ctype =
303 (struct locale_ctype_t *) xcalloc (1,
304 sizeof (struct locale_ctype_t));
306 /* We have seen no names yet. */
307 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
308 ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max
309 * sizeof (uint32_t));
310 for (cnt = 0; cnt < 256; ++cnt)
311 ctype->charnames[cnt] = cnt;
312 ctype->charnames_act = 256;
313 idx_table_init (&ctype->charnames_idx);
315 /* Fill character class information. */
316 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
317 /* The order of the following instructions determines the bit
318 positions! */
319 ctype_class_new (lr, ctype, "upper");
320 ctype_class_new (lr, ctype, "lower");
321 ctype_class_new (lr, ctype, "alpha");
322 ctype_class_new (lr, ctype, "digit");
323 ctype_class_new (lr, ctype, "xdigit");
324 ctype_class_new (lr, ctype, "space");
325 ctype_class_new (lr, ctype, "print");
326 ctype_class_new (lr, ctype, "graph");
327 ctype_class_new (lr, ctype, "blank");
328 ctype_class_new (lr, ctype, "cntrl");
329 ctype_class_new (lr, ctype, "punct");
330 ctype_class_new (lr, ctype, "alnum");
332 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
333 ctype->class_collection
334 = (uint32_t *) xcalloc (sizeof (unsigned long int),
335 ctype->class_collection_max);
336 ctype->class_collection_act = 256;
338 /* Fill character map information. */
339 ctype->last_map_idx = MAX_NR_CHARMAP;
340 ctype_map_new (lr, ctype, "toupper", charmap);
341 ctype_map_new (lr, ctype, "tolower", charmap);
343 /* Fill first 256 entries in `toXXX' arrays. */
344 for (cnt = 0; cnt < 256; ++cnt)
346 ctype->map_collection[0][cnt] = cnt;
347 ctype->map_collection[1][cnt] = cnt;
349 ctype->map256_collection[0][cnt] = cnt;
350 ctype->map256_collection[1][cnt] = cnt;
353 if (enc_not_ascii_compatible)
354 ctype->to_nonascii = 1;
356 obstack_init (&ctype->mempool);
358 else
359 ctype = locale->categories[LC_CTYPE].ctype =
360 copy_locale->categories[LC_CTYPE].ctype;
365 void
366 ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
368 /* See POSIX.2, table 2-6 for the meaning of the following table. */
369 #define NCLASS 12
370 static const struct
372 const char *name;
373 const char allow[NCLASS];
375 valid_table[NCLASS] =
377 /* The order is important. See token.h for more information.
378 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
379 { "upper", "--MX-XDDXXX-" },
380 { "lower", "--MX-XDDXXX-" },
381 { "alpha", "---X-XDDXXX-" },
382 { "digit", "XXX--XDDXXX-" },
383 { "xdigit", "-----XDDXXX-" },
384 { "space", "XXXXX------X" },
385 { "print", "---------X--" },
386 { "graph", "---------X--" },
387 { "blank", "XXXXXM-----X" },
388 { "cntrl", "XXXXX-XX--XX" },
389 { "punct", "XXXXX-DD-X-X" },
390 { "alnum", "-----XDDXXX-" }
392 size_t cnt;
393 int cls1, cls2;
394 uint32_t space_value;
395 struct charseq *space_seq;
396 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
397 int warned;
398 const void *key;
399 size_t len;
400 void *vdata;
401 void *curs;
403 /* Now resolve copying and also handle completely missing definitions. */
404 if (ctype == NULL)
406 const char *repertoire_name;
408 /* First see whether we were supposed to copy. If yes, find the
409 actual definition. */
410 if (locale->copy_name[LC_CTYPE] != NULL)
412 /* Find the copying locale. This has to happen transitively since
413 the locale we are copying from might also copying another one. */
414 struct localedef_t *from = locale;
417 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
418 from->repertoire_name, charmap);
419 while (from->categories[LC_CTYPE].ctype == NULL
420 && from->copy_name[LC_CTYPE] != NULL);
422 ctype = locale->categories[LC_CTYPE].ctype
423 = from->categories[LC_CTYPE].ctype;
426 /* If there is still no definition issue an warning and create an
427 empty one. */
428 if (ctype == NULL)
430 if (! be_quiet)
431 WITH_CUR_LOCALE (error (0, 0, _("\
432 No definition for %s category found"), "LC_CTYPE"));
433 ctype_startup (NULL, locale, charmap, NULL, 0);
434 ctype = locale->categories[LC_CTYPE].ctype;
437 /* Get the repertoire we have to use. */
438 repertoire_name = locale->repertoire_name ?: repertoire_global;
439 if (repertoire_name != NULL)
440 ctype->repertoire = repertoire_read (repertoire_name);
443 /* We need the name of the currently used 8-bit character set to
444 make correct conversion between this 8-bit representation and the
445 ISO 10646 character set used internally for wide characters. */
446 ctype->codeset_name = charmap->code_set_name;
447 if (ctype->codeset_name == NULL)
449 if (! be_quiet)
450 WITH_CUR_LOCALE (error (0, 0, _("\
451 No character set name specified in charmap")));
452 ctype->codeset_name = "//UNKNOWN//";
455 /* Set default value for classes not specified. */
456 set_class_defaults (ctype, charmap, ctype->repertoire);
458 /* Check according to table. */
459 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
461 uint32_t tmp = ctype->class_collection[cnt];
463 if (tmp != 0)
465 for (cls1 = 0; cls1 < NCLASS; ++cls1)
466 if ((tmp & _ISwbit (cls1)) != 0)
467 for (cls2 = 0; cls2 < NCLASS; ++cls2)
468 if (valid_table[cls1].allow[cls2] != '-')
470 int eq = (tmp & _ISwbit (cls2)) != 0;
471 switch (valid_table[cls1].allow[cls2])
473 case 'M':
474 if (!eq)
476 uint32_t value = ctype->charnames[cnt];
478 if (!be_quiet)
479 WITH_CUR_LOCALE (error (0, 0, _("\
480 character L'\\u%0*x' in class `%s' must be in class `%s'"),
481 value > 0xffff ? 8 : 4,
482 value,
483 valid_table[cls1].name,
484 valid_table[cls2].name));
486 break;
488 case 'X':
489 if (eq)
491 uint32_t value = ctype->charnames[cnt];
493 if (!be_quiet)
494 WITH_CUR_LOCALE (error (0, 0, _("\
495 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
496 value > 0xffff ? 8 : 4,
497 value,
498 valid_table[cls1].name,
499 valid_table[cls2].name));
501 break;
503 case 'D':
504 ctype->class_collection[cnt] |= _ISwbit (cls2);
505 break;
507 default:
508 WITH_CUR_LOCALE (error (5, 0, _("\
509 internal error in %s, line %u"), __FUNCTION__, __LINE__));
515 for (cnt = 0; cnt < 256; ++cnt)
517 uint32_t tmp = ctype->class256_collection[cnt];
519 if (tmp != 0)
521 for (cls1 = 0; cls1 < NCLASS; ++cls1)
522 if ((tmp & _ISbit (cls1)) != 0)
523 for (cls2 = 0; cls2 < NCLASS; ++cls2)
524 if (valid_table[cls1].allow[cls2] != '-')
526 int eq = (tmp & _ISbit (cls2)) != 0;
527 switch (valid_table[cls1].allow[cls2])
529 case 'M':
530 if (!eq)
532 char buf[17];
534 snprintf (buf, sizeof buf, "\\%Zo", cnt);
536 if (!be_quiet)
537 WITH_CUR_LOCALE (error (0, 0, _("\
538 character '%s' in class `%s' must be in class `%s'"),
539 buf,
540 valid_table[cls1].name,
541 valid_table[cls2].name));
543 break;
545 case 'X':
546 if (eq)
548 char buf[17];
550 snprintf (buf, sizeof buf, "\\%Zo", cnt);
552 if (!be_quiet)
553 WITH_CUR_LOCALE (error (0, 0, _("\
554 character '%s' in class `%s' must not be in class `%s'"),
555 buf,
556 valid_table[cls1].name,
557 valid_table[cls2].name));
559 break;
561 case 'D':
562 ctype->class256_collection[cnt] |= _ISbit (cls2);
563 break;
565 default:
566 WITH_CUR_LOCALE (error (5, 0, _("\
567 internal error in %s, line %u"), __FUNCTION__, __LINE__));
573 /* ... and now test <SP> as a special case. */
574 space_value = 32;
575 if (((cnt = BITPOS (tok_space),
576 (ELEM (ctype, class_collection, , space_value)
577 & BITw (tok_space)) == 0)
578 || (cnt = BITPOS (tok_blank),
579 (ELEM (ctype, class_collection, , space_value)
580 & BITw (tok_blank)) == 0)))
582 if (!be_quiet)
583 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
584 valid_table[cnt].name));
586 else if (((cnt = BITPOS (tok_punct),
587 (ELEM (ctype, class_collection, , space_value)
588 & BITw (tok_punct)) != 0)
589 || (cnt = BITPOS (tok_graph),
590 (ELEM (ctype, class_collection, , space_value)
591 & BITw (tok_graph))
592 != 0)))
594 if (!be_quiet)
595 WITH_CUR_LOCALE (error (0, 0, _("\
596 <SP> character must not be in class `%s'"),
597 valid_table[cnt].name));
599 else
600 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
602 space_seq = charmap_find_value (charmap, "SP", 2);
603 if (space_seq == NULL)
604 space_seq = charmap_find_value (charmap, "space", 5);
605 if (space_seq == NULL)
606 space_seq = charmap_find_value (charmap, "U00000020", 9);
607 if (space_seq == NULL || space_seq->nbytes != 1)
609 if (!be_quiet)
610 WITH_CUR_LOCALE (error (0, 0, _("\
611 character <SP> not defined in character map")));
613 else if (((cnt = BITPOS (tok_space),
614 (ctype->class256_collection[space_seq->bytes[0]]
615 & BIT (tok_space)) == 0)
616 || (cnt = BITPOS (tok_blank),
617 (ctype->class256_collection[space_seq->bytes[0]]
618 & BIT (tok_blank)) == 0)))
620 if (!be_quiet)
621 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
622 valid_table[cnt].name));
624 else if (((cnt = BITPOS (tok_punct),
625 (ctype->class256_collection[space_seq->bytes[0]]
626 & BIT (tok_punct)) != 0)
627 || (cnt = BITPOS (tok_graph),
628 (ctype->class256_collection[space_seq->bytes[0]]
629 & BIT (tok_graph)) != 0)))
631 if (!be_quiet)
632 WITH_CUR_LOCALE (error (0, 0, _("\
633 <SP> character must not be in class `%s'"),
634 valid_table[cnt].name));
636 else
637 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
639 /* Check whether all single-byte characters make to their upper/lowercase
640 equivalent according to the ASCII rules. */
641 for (cnt = 'A'; cnt <= 'Z'; ++cnt)
643 uint32_t uppval = ctype->map256_collection[0][cnt];
644 uint32_t lowval = ctype->map256_collection[1][cnt];
645 uint32_t lowuppval = ctype->map256_collection[0][lowval];
646 uint32_t lowlowval = ctype->map256_collection[1][lowval];
648 if (uppval != cnt
649 || lowval != cnt + 0x20
650 || lowuppval != cnt
651 || lowlowval != cnt + 0x20)
652 ctype->nonascii_case = 1;
654 for (cnt = 0; cnt < 256; ++cnt)
655 if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
656 if (ctype->map256_collection[0][cnt] != cnt
657 || ctype->map256_collection[1][cnt] != cnt)
658 ctype->nonascii_case = 1;
660 /* Now that the tests are done make sure the name array contains all
661 characters which are handled in the WIDTH section of the
662 character set definition file. */
663 if (charmap->width_rules != NULL)
664 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
666 unsigned char bytes[charmap->mb_cur_max];
667 int nbytes = charmap->width_rules[cnt].from->nbytes;
669 /* We have the range of character for which the width is
670 specified described using byte sequences of the multibyte
671 charset. We have to convert this to UCS4 now. And we
672 cannot simply convert the beginning and the end of the
673 sequence, we have to iterate over the byte sequence and
674 convert it for every single character. */
675 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
677 while (nbytes < charmap->width_rules[cnt].to->nbytes
678 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
679 nbytes) <= 0)
681 /* Find the UCS value for `bytes'. */
682 int inner;
683 uint32_t wch;
684 struct charseq *seq
685 = charmap_find_symbol (charmap, (char *) bytes, nbytes);
687 if (seq == NULL)
688 wch = ILLEGAL_CHAR_VALUE;
689 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
690 wch = seq->ucs4;
691 else
692 wch = repertoire_find_value (ctype->repertoire, seq->name,
693 strlen (seq->name));
695 if (wch != ILLEGAL_CHAR_VALUE)
696 /* We are only interested in the side-effects of the
697 `find_idx' call. It will add appropriate entries in
698 the name array if this is necessary. */
699 (void) find_idx (ctype, NULL, NULL, NULL, wch);
701 /* "Increment" the bytes sequence. */
702 inner = nbytes - 1;
703 while (inner >= 0 && bytes[inner] == 0xff)
704 --inner;
706 if (inner < 0)
708 /* We have to extend the byte sequence. */
709 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
710 break;
712 bytes[0] = 1;
713 memset (&bytes[1], 0, nbytes);
714 ++nbytes;
716 else
718 ++bytes[inner];
719 while (++inner < nbytes)
720 bytes[inner] = 0;
725 /* Now set all the other characters of the character set to the
726 default width. */
727 curs = NULL;
728 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
730 struct charseq *data = (struct charseq *) vdata;
732 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
733 data->ucs4 = repertoire_find_value (ctype->repertoire,
734 data->name, len);
736 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
737 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
740 /* There must be a multiple of 10 digits. */
741 if (ctype->mbdigits_act % 10 != 0)
743 assert (ctype->mbdigits_act == ctype->wcdigits_act);
744 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
745 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
746 WITH_CUR_LOCALE (error (0, 0, _("\
747 `digit' category has not entries in groups of ten")));
750 /* Check the input digits. There must be a multiple of ten available.
751 In each group it could be that one or the other character is missing.
752 In this case the whole group must be removed. */
753 cnt = 0;
754 while (cnt < ctype->mbdigits_act)
756 size_t inner;
757 for (inner = 0; inner < 10; ++inner)
758 if (ctype->mbdigits[cnt + inner] == NULL)
759 break;
761 if (inner == 10)
762 cnt += 10;
763 else
765 /* Remove the group. */
766 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
767 ((ctype->wcdigits_act - cnt - 10)
768 * sizeof (ctype->mbdigits[0])));
769 ctype->mbdigits_act -= 10;
773 /* If no input digits are given use the default. */
774 if (ctype->mbdigits_act == 0)
776 if (ctype->mbdigits_max == 0)
778 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
779 10 * sizeof (struct charseq *));
780 ctype->mbdigits_max = 10;
783 for (cnt = 0; cnt < 10; ++cnt)
785 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
786 (char *) digits + cnt, 1);
787 if (ctype->mbdigits[cnt] == NULL)
789 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
790 longnames[cnt],
791 strlen (longnames[cnt]));
792 if (ctype->mbdigits[cnt] == NULL)
794 /* Hum, this ain't good. */
795 WITH_CUR_LOCALE (error (0, 0, _("\
796 no input digits defined and none of the standard names in the charmap")));
798 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
799 sizeof (struct charseq) + 1);
801 /* This is better than nothing. */
802 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
803 ctype->mbdigits[cnt]->nbytes = 1;
808 ctype->mbdigits_act = 10;
811 /* Check the wide character input digits. There must be a multiple
812 of ten available. In each group it could be that one or the other
813 character is missing. In this case the whole group must be
814 removed. */
815 cnt = 0;
816 while (cnt < ctype->wcdigits_act)
818 size_t inner;
819 for (inner = 0; inner < 10; ++inner)
820 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
821 break;
823 if (inner == 10)
824 cnt += 10;
825 else
827 /* Remove the group. */
828 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
829 ((ctype->wcdigits_act - cnt - 10)
830 * sizeof (ctype->wcdigits[0])));
831 ctype->wcdigits_act -= 10;
835 /* If no input digits are given use the default. */
836 if (ctype->wcdigits_act == 0)
838 if (ctype->wcdigits_max == 0)
840 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
841 10 * sizeof (uint32_t));
842 ctype->wcdigits_max = 10;
845 for (cnt = 0; cnt < 10; ++cnt)
846 ctype->wcdigits[cnt] = L'0' + cnt;
848 ctype->mbdigits_act = 10;
851 /* Check the outdigits. */
852 warned = 0;
853 for (cnt = 0; cnt < 10; ++cnt)
854 if (ctype->mboutdigits[cnt] == NULL)
856 static struct charseq replace[2];
858 if (!warned)
860 WITH_CUR_LOCALE (error (0, 0, _("\
861 not all characters used in `outdigit' are available in the charmap")));
862 warned = 1;
865 replace[0].nbytes = 1;
866 replace[0].bytes[0] = '?';
867 replace[0].bytes[1] = '\0';
868 ctype->mboutdigits[cnt] = &replace[0];
871 warned = 0;
872 for (cnt = 0; cnt < 10; ++cnt)
873 if (ctype->wcoutdigits[cnt] == 0)
875 if (!warned)
877 WITH_CUR_LOCALE (error (0, 0, _("\
878 not all characters used in `outdigit' are available in the repertoire")));
879 warned = 1;
882 ctype->wcoutdigits[cnt] = L'?';
885 /* Sort the entries in the translit_ignore list. */
886 if (ctype->translit_ignore != NULL)
888 struct translit_ignore_t *firstp = ctype->translit_ignore;
889 struct translit_ignore_t *runp;
891 ctype->ntranslit_ignore = 1;
893 for (runp = firstp->next; runp != NULL; runp = runp->next)
895 struct translit_ignore_t *lastp = NULL;
896 struct translit_ignore_t *cmpp;
898 ++ctype->ntranslit_ignore;
900 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
901 if (runp->from < cmpp->from)
902 break;
904 runp->next = lastp;
905 if (lastp == NULL)
906 firstp = runp;
909 ctype->translit_ignore = firstp;
914 void
915 ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
916 const char *output_path)
918 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
919 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
920 + ctype->nr_charclass + ctype->map_collection_nr);
921 struct locale_file file;
922 uint32_t default_missing_len;
923 size_t elem, cnt;
925 /* Now prepare the output: Find the sizes of the table we can use. */
926 allocate_arrays (ctype, charmap, ctype->repertoire);
928 default_missing_len = (ctype->default_missing
929 ? wcslen ((wchar_t *) ctype->default_missing)
930 : 0);
932 init_locale_data (&file, nelems);
933 for (elem = 0; elem < nelems; ++elem)
935 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
936 switch (elem)
938 #define CTYPE_EMPTY(name) \
939 case name: \
940 add_locale_empty (&file); \
941 break
943 CTYPE_EMPTY(_NL_CTYPE_GAP1);
944 CTYPE_EMPTY(_NL_CTYPE_GAP2);
945 CTYPE_EMPTY(_NL_CTYPE_GAP3);
946 CTYPE_EMPTY(_NL_CTYPE_GAP4);
947 CTYPE_EMPTY(_NL_CTYPE_GAP5);
948 CTYPE_EMPTY(_NL_CTYPE_GAP6);
950 #define CTYPE_RAW_DATA(name, base, size) \
951 case _NL_ITEM_INDEX (name): \
952 add_locale_raw_data (&file, base, size); \
953 break
955 CTYPE_RAW_DATA (_NL_CTYPE_CLASS,
956 ctype->ctype_b,
957 (256 + 128) * sizeof (char_class_t));
959 #define CTYPE_UINT32_ARRAY(name, base, n_elems) \
960 case _NL_ITEM_INDEX (name): \
961 add_locale_uint32_array (&file, base, n_elems); \
962 break
964 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128);
965 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128);
966 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256);
967 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256);
968 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32,
969 ctype->ctype32_b,
970 256 * sizeof (char_class32_t));
972 #define CTYPE_UINT32(name, value) \
973 case _NL_ITEM_INDEX (name): \
974 add_locale_uint32 (&file, value); \
975 break
977 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset);
978 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset);
979 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size);
981 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX,
982 ctype->translit_from_idx,
983 ctype->translit_idx_size);
985 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL,
986 ctype->translit_from_tbl,
987 ctype->translit_from_tbl_size
988 / sizeof (uint32_t));
990 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX,
991 ctype->translit_to_idx,
992 ctype->translit_idx_size);
994 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL,
995 ctype->translit_to_tbl,
996 ctype->translit_to_tbl_size / sizeof (uint32_t));
998 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
999 /* The class name array. */
1000 start_locale_structure (&file);
1001 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1002 add_locale_string (&file, ctype->classnames[cnt]);
1003 add_locale_char (&file, 0);
1004 align_locale_data (&file, LOCFILE_ALIGN);
1005 end_locale_structure (&file);
1006 break;
1008 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
1009 /* The class name array. */
1010 start_locale_structure (&file);
1011 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1012 add_locale_string (&file, ctype->mapnames[cnt]);
1013 add_locale_char (&file, 0);
1014 align_locale_data (&file, LOCFILE_ALIGN);
1015 end_locale_structure (&file);
1016 break;
1018 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH):
1019 add_locale_wcwidth_table (&file, &ctype->width);
1020 break;
1022 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max);
1024 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1025 add_locale_string (&file, ctype->codeset_name);
1026 break;
1028 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii);
1030 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case);
1032 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1033 add_locale_uint32 (&file, ctype->mbdigits_act / 10);
1034 break;
1036 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1037 add_locale_uint32 (&file, ctype->wcdigits_act / 10);
1038 break;
1040 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1041 start_locale_structure (&file);
1042 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1043 cnt < ctype->mbdigits_act; cnt += 10)
1045 add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes,
1046 ctype->mbdigits[cnt]->nbytes);
1047 add_locale_char (&file, 0);
1049 end_locale_structure (&file);
1050 break;
1052 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1053 start_locale_structure (&file);
1054 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1055 add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes,
1056 ctype->mboutdigits[cnt]->nbytes);
1057 add_locale_char (&file, 0);
1058 end_locale_structure (&file);
1059 break;
1061 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1062 start_locale_structure (&file);
1063 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1064 cnt < ctype->wcdigits_act; cnt += 10)
1065 add_locale_uint32 (&file, ctype->wcdigits[cnt]);
1066 end_locale_structure (&file);
1067 break;
1069 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1070 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1071 add_locale_uint32 (&file, ctype->wcoutdigits[cnt]);
1072 break;
1074 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1075 add_locale_uint32 (&file, default_missing_len);
1076 break;
1078 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1079 add_locale_uint32_array (&file, ctype->default_missing,
1080 default_missing_len);
1081 break;
1083 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1084 add_locale_uint32 (&file, ctype->ntranslit_ignore);
1085 break;
1087 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1088 start_locale_structure (&file);
1090 struct translit_ignore_t *runp;
1091 for (runp = ctype->translit_ignore; runp != NULL;
1092 runp = runp->next)
1094 add_locale_uint32 (&file, runp->from);
1095 add_locale_uint32 (&file, runp->to);
1096 add_locale_uint32 (&file, runp->step);
1099 end_locale_structure (&file);
1100 break;
1102 default:
1103 assert (! "unknown CTYPE element");
1105 else
1107 /* Handle extra maps. */
1108 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1109 if (nr < ctype->nr_charclass)
1111 start_locale_prelude (&file);
1112 add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32);
1113 end_locale_prelude (&file);
1114 add_locale_wctype_table (&file, &ctype->class_3level[nr]);
1116 else
1118 nr -= ctype->nr_charclass;
1119 assert (nr < ctype->map_collection_nr);
1120 add_locale_wctrans_table (&file, &ctype->map_3level[nr]);
1125 write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file);
1129 /* Local functions. */
1130 static void
1131 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1132 const char *name)
1134 size_t cnt;
1136 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1137 if (strcmp (ctype->classnames[cnt], name) == 0)
1138 break;
1140 if (cnt < ctype->nr_charclass)
1142 lr_error (lr, _("character class `%s' already defined"), name);
1143 return;
1146 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1147 /* Exit code 2 is prescribed in P1003.2b. */
1148 WITH_CUR_LOCALE (error (2, 0, _("\
1149 implementation limit: no more than %Zd character classes allowed"),
1150 MAX_NR_CHARCLASS));
1152 ctype->classnames[ctype->nr_charclass++] = name;
1156 static void
1157 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1158 const char *name, const struct charmap_t *charmap)
1160 size_t max_chars = 0;
1161 size_t cnt;
1163 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1165 if (strcmp (ctype->mapnames[cnt], name) == 0)
1166 break;
1168 if (max_chars < ctype->map_collection_max[cnt])
1169 max_chars = ctype->map_collection_max[cnt];
1172 if (cnt < ctype->map_collection_nr)
1174 lr_error (lr, _("character map `%s' already defined"), name);
1175 return;
1178 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1179 /* Exit code 2 is prescribed in P1003.2b. */
1180 WITH_CUR_LOCALE (error (2, 0, _("\
1181 implementation limit: no more than %d character maps allowed"),
1182 MAX_NR_CHARMAP));
1184 ctype->mapnames[cnt] = name;
1186 if (max_chars == 0)
1187 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1188 else
1189 ctype->map_collection_max[cnt] = max_chars;
1191 ctype->map_collection[cnt] = (uint32_t *)
1192 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1193 ctype->map_collection_act[cnt] = 256;
1195 ++ctype->map_collection_nr;
1199 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1200 is possible if we only want to extend the name array. */
1201 static uint32_t *
1202 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1203 size_t *act, uint32_t idx)
1205 size_t cnt;
1207 if (idx < 256)
1208 return table == NULL ? NULL : &(*table)[idx];
1210 /* Use the charnames_idx lookup table instead of the slow search loop. */
1211 #if 1
1212 cnt = idx_table_get (&ctype->charnames_idx, idx);
1213 if (cnt == EMPTY)
1214 /* Not found. */
1215 cnt = ctype->charnames_act;
1216 #else
1217 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1218 if (ctype->charnames[cnt] == idx)
1219 break;
1220 #endif
1222 /* We have to distinguish two cases: the name is found or not. */
1223 if (cnt == ctype->charnames_act)
1225 /* Extend the name array. */
1226 if (ctype->charnames_act == ctype->charnames_max)
1228 ctype->charnames_max *= 2;
1229 ctype->charnames = (uint32_t *)
1230 xrealloc (ctype->charnames,
1231 sizeof (uint32_t) * ctype->charnames_max);
1233 ctype->charnames[ctype->charnames_act++] = idx;
1234 idx_table_add (&ctype->charnames_idx, idx, cnt);
1237 if (table == NULL)
1238 /* We have done everything we are asked to do. */
1239 return NULL;
1241 if (max == NULL)
1242 /* The caller does not want to extend the table. */
1243 return (cnt >= *act ? NULL : &(*table)[cnt]);
1245 if (cnt >= *act)
1247 if (cnt >= *max)
1249 size_t old_max = *max;
1251 *max *= 2;
1252 while (*max <= cnt);
1254 *table =
1255 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1256 memset (&(*table)[old_max], '\0',
1257 (*max - old_max) * sizeof (uint32_t));
1260 *act = cnt + 1;
1263 return &(*table)[cnt];
1267 static int
1268 get_character (struct token *now, const struct charmap_t *charmap,
1269 struct repertoire_t *repertoire,
1270 struct charseq **seqp, uint32_t *wchp)
1272 if (now->tok == tok_bsymbol)
1274 /* This will hopefully be the normal case. */
1275 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1276 now->val.str.lenmb);
1277 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1278 now->val.str.lenmb);
1280 else if (now->tok == tok_ucs4)
1282 char utmp[10];
1284 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1285 *seqp = charmap_find_value (charmap, utmp, 9);
1287 if (*seqp == NULL)
1288 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1290 if (*seqp == NULL)
1292 /* Compute the value in the charmap from the UCS value. */
1293 const char *symbol = repertoire_find_symbol (repertoire,
1294 now->val.ucs4);
1296 if (symbol == NULL)
1297 *seqp = NULL;
1298 else
1299 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1301 if (*seqp == NULL)
1303 if (repertoire != NULL)
1305 /* Insert a negative entry. */
1306 static const struct charseq negative
1307 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1308 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1309 sizeof (uint32_t));
1310 *newp = now->val.ucs4;
1312 insert_entry (&repertoire->seq_table, newp,
1313 sizeof (uint32_t), (void *) &negative);
1316 else
1317 (*seqp)->ucs4 = now->val.ucs4;
1319 else if ((*seqp)->ucs4 != now->val.ucs4)
1320 *seqp = NULL;
1322 *wchp = now->val.ucs4;
1324 else if (now->tok == tok_charcode)
1326 /* We must map from the byte code to UCS4. */
1327 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1328 now->val.str.lenmb);
1330 if (*seqp == NULL)
1331 *wchp = ILLEGAL_CHAR_VALUE;
1332 else
1334 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1335 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1336 strlen ((*seqp)->name));
1337 *wchp = (*seqp)->ucs4;
1340 else
1341 return 1;
1343 return 0;
1347 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1348 the .(2). counterparts. */
1349 static void
1350 charclass_symbolic_ellipsis (struct linereader *ldfile,
1351 struct locale_ctype_t *ctype,
1352 const struct charmap_t *charmap,
1353 struct repertoire_t *repertoire,
1354 struct token *now,
1355 const char *last_str,
1356 unsigned long int class256_bit,
1357 unsigned long int class_bit, int base,
1358 int ignore_content, int handle_digits, int step)
1360 const char *nowstr = now->val.str.startmb;
1361 char tmp[now->val.str.lenmb + 1];
1362 const char *cp;
1363 char *endp;
1364 unsigned long int from;
1365 unsigned long int to;
1367 /* We have to compute the ellipsis values using the symbolic names. */
1368 assert (last_str != NULL);
1370 if (strlen (last_str) != now->val.str.lenmb)
1372 invalid_range:
1373 lr_error (ldfile,
1374 _("`%s' and `%.*s' are not valid names for symbolic range"),
1375 last_str, (int) now->val.str.lenmb, nowstr);
1376 return;
1379 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1380 /* Nothing to do, the names are the same. */
1381 return;
1383 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1386 errno = 0;
1387 from = strtoul (cp, &endp, base);
1388 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1389 goto invalid_range;
1391 to = strtoul (nowstr + (cp - last_str), &endp, base);
1392 if ((to == UINT_MAX && errno == ERANGE)
1393 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1394 goto invalid_range;
1396 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1397 if (!ignore_content)
1399 now->val.str.startmb = tmp;
1400 while ((from += step) <= to)
1402 struct charseq *seq;
1403 uint32_t wch;
1405 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1406 (int) (cp - last_str), last_str,
1407 (int) (now->val.str.lenmb - (cp - last_str)),
1408 from);
1410 get_character (now, charmap, repertoire, &seq, &wch);
1412 if (seq != NULL && seq->nbytes == 1)
1413 /* Yep, we can store information about this byte sequence. */
1414 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1416 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1417 /* We have the UCS4 position. */
1418 *find_idx (ctype, &ctype->class_collection,
1419 &ctype->class_collection_max,
1420 &ctype->class_collection_act, wch) |= class_bit;
1422 if (handle_digits == 1)
1424 /* We must store the digit values. */
1425 if (ctype->mbdigits_act == ctype->mbdigits_max)
1427 ctype->mbdigits_max *= 2;
1428 ctype->mbdigits = xrealloc (ctype->mbdigits,
1429 (ctype->mbdigits_max
1430 * sizeof (char *)));
1431 ctype->wcdigits_max *= 2;
1432 ctype->wcdigits = xrealloc (ctype->wcdigits,
1433 (ctype->wcdigits_max
1434 * sizeof (uint32_t)));
1437 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1438 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1440 else if (handle_digits == 2)
1442 /* We must store the digit values. */
1443 if (ctype->outdigits_act >= 10)
1445 lr_error (ldfile, _("\
1446 %s: field `%s' does not contain exactly ten entries"),
1447 "LC_CTYPE", "outdigit");
1448 return;
1451 ctype->mboutdigits[ctype->outdigits_act] = seq;
1452 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1453 ++ctype->outdigits_act;
1460 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1461 static void
1462 charclass_ucs4_ellipsis (struct linereader *ldfile,
1463 struct locale_ctype_t *ctype,
1464 const struct charmap_t *charmap,
1465 struct repertoire_t *repertoire,
1466 struct token *now, uint32_t last_wch,
1467 unsigned long int class256_bit,
1468 unsigned long int class_bit, int ignore_content,
1469 int handle_digits, int step)
1471 if (last_wch > now->val.ucs4)
1473 lr_error (ldfile, _("\
1474 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1475 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1476 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1477 return;
1480 if (!ignore_content)
1481 while ((last_wch += step) <= now->val.ucs4)
1483 /* We have to find out whether there is a byte sequence corresponding
1484 to this UCS4 value. */
1485 struct charseq *seq;
1486 char utmp[10];
1488 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1489 seq = charmap_find_value (charmap, utmp, 9);
1490 if (seq == NULL)
1492 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1493 seq = charmap_find_value (charmap, utmp, 5);
1496 if (seq == NULL)
1497 /* Try looking in the repertoire map. */
1498 seq = repertoire_find_seq (repertoire, last_wch);
1500 /* If this is the first time we look for this sequence create a new
1501 entry. */
1502 if (seq == NULL)
1504 static const struct charseq negative
1505 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1507 /* Find the symbolic name for this UCS4 value. */
1508 if (repertoire != NULL)
1510 const char *symbol = repertoire_find_symbol (repertoire,
1511 last_wch);
1512 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1513 sizeof (uint32_t));
1514 *newp = last_wch;
1516 if (symbol != NULL)
1517 /* We have a name, now search the multibyte value. */
1518 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1520 if (seq == NULL)
1521 /* We have to create a fake entry. */
1522 seq = (struct charseq *) &negative;
1523 else
1524 seq->ucs4 = last_wch;
1526 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1527 seq);
1529 else
1530 /* We have to create a fake entry. */
1531 seq = (struct charseq *) &negative;
1534 /* We have a name, now search the multibyte value. */
1535 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1536 /* Yep, we can store information about this byte sequence. */
1537 ctype->class256_collection[(size_t) seq->bytes[0]]
1538 |= class256_bit;
1540 /* And of course we have the UCS4 position. */
1541 if (class_bit != 0)
1542 *find_idx (ctype, &ctype->class_collection,
1543 &ctype->class_collection_max,
1544 &ctype->class_collection_act, last_wch) |= class_bit;
1546 if (handle_digits == 1)
1548 /* We must store the digit values. */
1549 if (ctype->mbdigits_act == ctype->mbdigits_max)
1551 ctype->mbdigits_max *= 2;
1552 ctype->mbdigits = xrealloc (ctype->mbdigits,
1553 (ctype->mbdigits_max
1554 * sizeof (char *)));
1555 ctype->wcdigits_max *= 2;
1556 ctype->wcdigits = xrealloc (ctype->wcdigits,
1557 (ctype->wcdigits_max
1558 * sizeof (uint32_t)));
1561 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1562 ? seq : NULL);
1563 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1565 else if (handle_digits == 2)
1567 /* We must store the digit values. */
1568 if (ctype->outdigits_act >= 10)
1570 lr_error (ldfile, _("\
1571 %s: field `%s' does not contain exactly ten entries"),
1572 "LC_CTYPE", "outdigit");
1573 return;
1576 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1577 ? seq : NULL);
1578 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1579 ++ctype->outdigits_act;
1585 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1586 static void
1587 charclass_charcode_ellipsis (struct linereader *ldfile,
1588 struct locale_ctype_t *ctype,
1589 const struct charmap_t *charmap,
1590 struct repertoire_t *repertoire,
1591 struct token *now, char *last_charcode,
1592 uint32_t last_charcode_len,
1593 unsigned long int class256_bit,
1594 unsigned long int class_bit, int ignore_content,
1595 int handle_digits)
1597 /* First check whether the to-value is larger. */
1598 if (now->val.charcode.nbytes != last_charcode_len)
1600 lr_error (ldfile, _("\
1601 start and end character sequence of range must have the same length"));
1602 return;
1605 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1607 lr_error (ldfile, _("\
1608 to-value character sequence is smaller than from-value sequence"));
1609 return;
1612 if (!ignore_content)
1616 /* Increment the byte sequence value. */
1617 struct charseq *seq;
1618 uint32_t wch;
1619 int i;
1621 for (i = last_charcode_len - 1; i >= 0; --i)
1622 if (++last_charcode[i] != 0)
1623 break;
1625 if (last_charcode_len == 1)
1626 /* Of course we have the charcode value. */
1627 ctype->class256_collection[(size_t) last_charcode[0]]
1628 |= class256_bit;
1630 /* Find the symbolic name. */
1631 seq = charmap_find_symbol (charmap, last_charcode,
1632 last_charcode_len);
1633 if (seq != NULL)
1635 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1636 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1637 strlen (seq->name));
1638 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1640 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1641 *find_idx (ctype, &ctype->class_collection,
1642 &ctype->class_collection_max,
1643 &ctype->class_collection_act, wch) |= class_bit;
1645 else
1646 wch = ILLEGAL_CHAR_VALUE;
1648 if (handle_digits == 1)
1650 /* We must store the digit values. */
1651 if (ctype->mbdigits_act == ctype->mbdigits_max)
1653 ctype->mbdigits_max *= 2;
1654 ctype->mbdigits = xrealloc (ctype->mbdigits,
1655 (ctype->mbdigits_max
1656 * sizeof (char *)));
1657 ctype->wcdigits_max *= 2;
1658 ctype->wcdigits = xrealloc (ctype->wcdigits,
1659 (ctype->wcdigits_max
1660 * sizeof (uint32_t)));
1663 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1664 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1665 seq->nbytes = last_charcode_len;
1667 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1668 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1670 else if (handle_digits == 2)
1672 struct charseq *seq;
1673 /* We must store the digit values. */
1674 if (ctype->outdigits_act >= 10)
1676 lr_error (ldfile, _("\
1677 %s: field `%s' does not contain exactly ten entries"),
1678 "LC_CTYPE", "outdigit");
1679 return;
1682 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1683 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1684 seq->nbytes = last_charcode_len;
1686 ctype->mboutdigits[ctype->outdigits_act] = seq;
1687 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1688 ++ctype->outdigits_act;
1691 while (memcmp (last_charcode, now->val.charcode.bytes,
1692 last_charcode_len) != 0);
1697 static uint32_t *
1698 find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1699 uint32_t wch)
1701 struct translit_t *trunp = ctype->translit;
1702 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1704 while (trunp != NULL)
1706 /* XXX We simplify things here. The transliterations we look
1707 for are only allowed to have one character. */
1708 if (trunp->from[0] == wch && trunp->from[1] == 0)
1710 /* Found it. Now look for a transliteration which can be
1711 represented with the character set. */
1712 struct translit_to_t *torunp = trunp->to;
1714 while (torunp != NULL)
1716 int i;
1718 for (i = 0; torunp->str[i] != 0; ++i)
1720 char utmp[10];
1722 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1723 if (charmap_find_value (charmap, utmp, 9) == NULL)
1724 /* This character cannot be represented. */
1725 break;
1728 if (torunp->str[i] == 0)
1729 return torunp->str;
1731 torunp = torunp->next;
1734 break;
1737 trunp = trunp->next;
1740 /* Check for ignored chars. */
1741 while (tirunp != NULL)
1743 if (tirunp->from <= wch && tirunp->to >= wch)
1745 uint32_t wi;
1747 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1748 if (wi == wch)
1749 return no_str;
1753 /* Nothing found. */
1754 return NULL;
1758 uint32_t *
1759 find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1760 uint32_t wch)
1762 struct locale_ctype_t *ctype;
1763 uint32_t *result = NULL;
1765 assert (locale != NULL);
1766 ctype = locale->categories[LC_CTYPE].ctype;
1768 if (ctype == NULL)
1769 return NULL;
1771 if (ctype->translit != NULL)
1772 result = find_translit2 (ctype, charmap, wch);
1774 if (result == NULL)
1776 struct translit_include_t *irunp = ctype->translit_include;
1778 while (irunp != NULL && result == NULL)
1780 result = find_translit (find_locale (CTYPE_LOCALE,
1781 irunp->copy_locale,
1782 irunp->copy_repertoire,
1783 charmap),
1784 charmap, wch);
1785 irunp = irunp->next;
1789 return result;
1793 /* Read one transliteration entry. */
1794 static uint32_t *
1795 read_widestring (struct linereader *ldfile, struct token *now,
1796 const struct charmap_t *charmap,
1797 struct repertoire_t *repertoire)
1799 uint32_t *wstr;
1801 if (now->tok == tok_default_missing)
1802 /* The special name "" will denote this case. */
1803 wstr = no_str;
1804 else if (now->tok == tok_bsymbol)
1806 /* Get the value from the repertoire. */
1807 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1808 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1809 now->val.str.lenmb);
1810 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1812 /* We cannot proceed, we don't know the UCS4 value. */
1813 free (wstr);
1814 return NULL;
1817 wstr[1] = 0;
1819 else if (now->tok == tok_ucs4)
1821 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1822 wstr[0] = now->val.ucs4;
1823 wstr[1] = 0;
1825 else if (now->tok == tok_charcode)
1827 /* Argh, we have to convert to the symbol name first and then to the
1828 UCS4 value. */
1829 struct charseq *seq = charmap_find_symbol (charmap,
1830 now->val.str.startmb,
1831 now->val.str.lenmb);
1832 if (seq == NULL)
1833 /* Cannot find the UCS4 value. */
1834 return NULL;
1836 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1837 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1838 strlen (seq->name));
1839 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1840 /* We cannot proceed, we don't know the UCS4 value. */
1841 return NULL;
1843 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1844 wstr[0] = seq->ucs4;
1845 wstr[1] = 0;
1847 else if (now->tok == tok_string)
1849 wstr = now->val.str.startwc;
1850 if (wstr == NULL || wstr[0] == 0)
1851 return NULL;
1853 else
1855 if (now->tok != tok_eol && now->tok != tok_eof)
1856 lr_ignore_rest (ldfile, 0);
1857 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1858 return (uint32_t *) -1l;
1861 return wstr;
1865 static void
1866 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1867 struct token *now, const struct charmap_t *charmap,
1868 struct repertoire_t *repertoire)
1870 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1871 struct translit_t *result;
1872 struct translit_to_t **top;
1873 struct obstack *ob = &ctype->mempool;
1874 int first;
1875 int ignore;
1877 if (from_wstr == NULL)
1878 /* There is no valid from string. */
1879 return;
1881 result = (struct translit_t *) obstack_alloc (ob,
1882 sizeof (struct translit_t));
1883 result->from = from_wstr;
1884 result->fname = ldfile->fname;
1885 result->lineno = ldfile->lineno;
1886 result->next = NULL;
1887 result->to = NULL;
1888 top = &result->to;
1889 first = 1;
1890 ignore = 0;
1892 while (1)
1894 uint32_t *to_wstr;
1896 /* Next we have one or more transliterations. They are
1897 separated by semicolons. */
1898 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1900 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1902 /* One string read. */
1903 const uint32_t zero = 0;
1905 if (!ignore)
1907 obstack_grow (ob, &zero, 4);
1908 to_wstr = obstack_finish (ob);
1910 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1911 (*top)->str = to_wstr;
1912 (*top)->next = NULL;
1915 if (now->tok == tok_eol)
1917 result->next = ctype->translit;
1918 ctype->translit = result;
1919 return;
1922 if (!ignore)
1923 top = &(*top)->next;
1924 ignore = 0;
1926 else
1928 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1929 if (to_wstr == (uint32_t *) -1l)
1931 /* An error occurred. */
1932 obstack_free (ob, result);
1933 return;
1936 if (to_wstr == NULL)
1937 ignore = 1;
1938 else
1939 /* This value is usable. */
1940 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1942 first = 0;
1948 static void
1949 read_translit_ignore_entry (struct linereader *ldfile,
1950 struct locale_ctype_t *ctype,
1951 const struct charmap_t *charmap,
1952 struct repertoire_t *repertoire)
1954 /* We expect a semicolon-separated list of characters we ignore. We are
1955 only interested in the wide character definitions. These must be
1956 single characters, possibly defining a range when an ellipsis is used. */
1957 while (1)
1959 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
1960 verbose);
1961 struct translit_ignore_t *newp;
1962 uint32_t from;
1964 if (now->tok == tok_eol || now->tok == tok_eof)
1966 lr_error (ldfile,
1967 _("premature end of `translit_ignore' definition"));
1968 return;
1971 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1973 lr_error (ldfile, _("syntax error"));
1974 lr_ignore_rest (ldfile, 0);
1975 return;
1978 if (now->tok == tok_ucs4)
1979 from = now->val.ucs4;
1980 else
1981 /* Try to get the value. */
1982 from = repertoire_find_value (repertoire, now->val.str.startmb,
1983 now->val.str.lenmb);
1985 if (from == ILLEGAL_CHAR_VALUE)
1987 lr_error (ldfile, "invalid character name");
1988 newp = NULL;
1990 else
1992 newp = (struct translit_ignore_t *)
1993 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
1994 newp->from = from;
1995 newp->to = from;
1996 newp->step = 1;
1998 newp->next = ctype->translit_ignore;
1999 ctype->translit_ignore = newp;
2002 /* Now we expect either a semicolon, an ellipsis, or the end of the
2003 line. */
2004 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2006 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
2008 /* XXX Should we bother implementing `....'? `...' certainly
2009 will not be implemented. */
2010 uint32_t to;
2011 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2013 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2015 if (now->tok == tok_eol || now->tok == tok_eof)
2017 lr_error (ldfile,
2018 _("premature end of `translit_ignore' definition"));
2019 return;
2022 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2024 lr_error (ldfile, _("syntax error"));
2025 lr_ignore_rest (ldfile, 0);
2026 return;
2029 if (now->tok == tok_ucs4)
2030 to = now->val.ucs4;
2031 else
2032 /* Try to get the value. */
2033 to = repertoire_find_value (repertoire, now->val.str.startmb,
2034 now->val.str.lenmb);
2036 if (to == ILLEGAL_CHAR_VALUE)
2037 lr_error (ldfile, "invalid character name");
2038 else
2040 /* Make sure the `to'-value is larger. */
2041 if (to >= from)
2043 newp->to = to;
2044 newp->step = step;
2046 else
2047 lr_error (ldfile, _("\
2048 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2049 (to | from) < 65536 ? 4 : 8, to,
2050 (to | from) < 65536 ? 4 : 8, from);
2053 /* And the next token. */
2054 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2057 if (now->tok == tok_eol || now->tok == tok_eof)
2058 /* We are done. */
2059 return;
2061 if (now->tok == tok_semicolon)
2062 /* Next round. */
2063 continue;
2065 /* If we come here something is wrong. */
2066 lr_error (ldfile, _("syntax error"));
2067 lr_ignore_rest (ldfile, 0);
2068 return;
2073 /* The parser for the LC_CTYPE section of the locale definition. */
2074 void
2075 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2076 const struct charmap_t *charmap, const char *repertoire_name,
2077 int ignore_content)
2079 struct repertoire_t *repertoire = NULL;
2080 struct locale_ctype_t *ctype;
2081 struct token *now;
2082 enum token_t nowtok;
2083 size_t cnt;
2084 uint32_t last_wch = 0;
2085 enum token_t last_token;
2086 enum token_t ellipsis_token;
2087 int step;
2088 char last_charcode[16];
2089 size_t last_charcode_len = 0;
2090 const char *last_str = NULL;
2091 int mapidx;
2092 struct localedef_t *copy_locale = NULL;
2094 /* Get the repertoire we have to use. */
2095 if (repertoire_name != NULL)
2096 repertoire = repertoire_read (repertoire_name);
2098 /* The rest of the line containing `LC_CTYPE' must be free. */
2099 lr_ignore_rest (ldfile, 1);
2104 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2105 nowtok = now->tok;
2107 while (nowtok == tok_eol);
2109 /* If we see `copy' now we are almost done. */
2110 if (nowtok == tok_copy)
2112 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2113 if (now->tok != tok_string)
2115 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2117 skip_category:
2119 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2120 while (now->tok != tok_eof && now->tok != tok_end);
2122 if (now->tok != tok_eof
2123 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2124 now->tok == tok_eof))
2125 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2126 else if (now->tok != tok_lc_ctype)
2128 lr_error (ldfile, _("\
2129 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2130 lr_ignore_rest (ldfile, 0);
2132 else
2133 lr_ignore_rest (ldfile, 1);
2135 return;
2138 if (! ignore_content)
2140 /* Get the locale definition. */
2141 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2142 repertoire_name, charmap, NULL);
2143 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2145 /* Not yet loaded. So do it now. */
2146 if (locfile_read (copy_locale, charmap) != 0)
2147 goto skip_category;
2150 if (copy_locale->categories[LC_CTYPE].ctype == NULL)
2151 return;
2154 lr_ignore_rest (ldfile, 1);
2156 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2157 nowtok = now->tok;
2160 /* Prepare the data structures. */
2161 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2162 ctype = result->categories[LC_CTYPE].ctype;
2164 /* Remember the repertoire we use. */
2165 if (!ignore_content)
2166 ctype->repertoire = repertoire;
2168 while (1)
2170 unsigned long int class_bit = 0;
2171 unsigned long int class256_bit = 0;
2172 int handle_digits = 0;
2174 /* Of course we don't proceed beyond the end of file. */
2175 if (nowtok == tok_eof)
2176 break;
2178 /* Ignore empty lines. */
2179 if (nowtok == tok_eol)
2181 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2182 nowtok = now->tok;
2183 continue;
2186 switch (nowtok)
2188 case tok_charclass:
2189 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2190 while (now->tok == tok_ident || now->tok == tok_string)
2192 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2193 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2194 if (now->tok != tok_semicolon)
2195 break;
2196 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2198 if (now->tok != tok_eol)
2199 SYNTAX_ERROR (_("\
2200 %s: syntax error in definition of new character class"), "LC_CTYPE");
2201 break;
2203 case tok_charconv:
2204 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2205 while (now->tok == tok_ident || now->tok == tok_string)
2207 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2208 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2209 if (now->tok != tok_semicolon)
2210 break;
2211 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2213 if (now->tok != tok_eol)
2214 SYNTAX_ERROR (_("\
2215 %s: syntax error in definition of new character map"), "LC_CTYPE");
2216 break;
2218 case tok_class:
2219 /* Ignore the rest of the line if we don't need the input of
2220 this line. */
2221 if (ignore_content)
2223 lr_ignore_rest (ldfile, 0);
2224 break;
2227 /* We simply forget the `class' keyword and use the following
2228 operand to determine the bit. */
2229 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2230 if (now->tok == tok_ident || now->tok == tok_string)
2232 /* Must can be one of the predefined class names. */
2233 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2234 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2235 break;
2236 if (cnt >= ctype->nr_charclass)
2238 /* OK, it's a new class. */
2239 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2241 class_bit = _ISwbit (ctype->nr_charclass - 1);
2243 else
2245 class_bit = _ISwbit (cnt);
2247 free (now->val.str.startmb);
2250 else if (now->tok == tok_digit)
2251 goto handle_tok_digit;
2252 else if (now->tok < tok_upper || now->tok > tok_blank)
2253 goto err_label;
2254 else
2256 class_bit = BITw (now->tok);
2257 class256_bit = BIT (now->tok);
2260 /* The next character must be a semicolon. */
2261 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2262 if (now->tok != tok_semicolon)
2263 goto err_label;
2264 goto read_charclass;
2266 case tok_upper:
2267 case tok_lower:
2268 case tok_alpha:
2269 case tok_alnum:
2270 case tok_space:
2271 case tok_cntrl:
2272 case tok_punct:
2273 case tok_graph:
2274 case tok_print:
2275 case tok_xdigit:
2276 case tok_blank:
2277 /* Ignore the rest of the line if we don't need the input of
2278 this line. */
2279 if (ignore_content)
2281 lr_ignore_rest (ldfile, 0);
2282 break;
2285 class_bit = BITw (now->tok);
2286 class256_bit = BIT (now->tok);
2287 handle_digits = 0;
2288 read_charclass:
2289 ctype->class_done |= class_bit;
2290 last_token = tok_none;
2291 ellipsis_token = tok_none;
2292 step = 1;
2293 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2294 while (now->tok != tok_eol && now->tok != tok_eof)
2296 uint32_t wch;
2297 struct charseq *seq;
2299 if (ellipsis_token == tok_none)
2301 if (get_character (now, charmap, repertoire, &seq, &wch))
2302 goto err_label;
2304 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2305 /* Yep, we can store information about this byte
2306 sequence. */
2307 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2309 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2310 && class_bit != 0)
2311 /* We have the UCS4 position. */
2312 *find_idx (ctype, &ctype->class_collection,
2313 &ctype->class_collection_max,
2314 &ctype->class_collection_act, wch) |= class_bit;
2316 last_token = now->tok;
2317 /* Terminate the string. */
2318 if (last_token == tok_bsymbol)
2320 now->val.str.startmb[now->val.str.lenmb] = '\0';
2321 last_str = now->val.str.startmb;
2323 else
2324 last_str = NULL;
2325 last_wch = wch;
2326 memcpy (last_charcode, now->val.charcode.bytes, 16);
2327 last_charcode_len = now->val.charcode.nbytes;
2329 if (!ignore_content && handle_digits == 1)
2331 /* We must store the digit values. */
2332 if (ctype->mbdigits_act == ctype->mbdigits_max)
2334 ctype->mbdigits_max += 10;
2335 ctype->mbdigits = xrealloc (ctype->mbdigits,
2336 (ctype->mbdigits_max
2337 * sizeof (char *)));
2338 ctype->wcdigits_max += 10;
2339 ctype->wcdigits = xrealloc (ctype->wcdigits,
2340 (ctype->wcdigits_max
2341 * sizeof (uint32_t)));
2344 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2345 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2347 else if (!ignore_content && handle_digits == 2)
2349 /* We must store the digit values. */
2350 if (ctype->outdigits_act >= 10)
2352 lr_error (ldfile, _("\
2353 %s: field `%s' does not contain exactly ten entries"),
2354 "LC_CTYPE", "outdigit");
2355 lr_ignore_rest (ldfile, 0);
2356 break;
2359 ctype->mboutdigits[ctype->outdigits_act] = seq;
2360 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2361 ++ctype->outdigits_act;
2364 else
2366 /* Now it gets complicated. We have to resolve the
2367 ellipsis problem. First we must distinguish between
2368 the different kind of ellipsis and this must match the
2369 tokens we have seen. */
2370 assert (last_token != tok_none);
2372 if (last_token != now->tok)
2374 lr_error (ldfile, _("\
2375 ellipsis range must be marked by two operands of same type"));
2376 lr_ignore_rest (ldfile, 0);
2377 break;
2380 if (last_token == tok_bsymbol)
2382 if (ellipsis_token == tok_ellipsis3)
2383 lr_error (ldfile, _("with symbolic name range values \
2384 the absolute ellipsis `...' must not be used"));
2386 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2387 repertoire, now, last_str,
2388 class256_bit, class_bit,
2389 (ellipsis_token
2390 == tok_ellipsis4
2391 ? 10 : 16),
2392 ignore_content,
2393 handle_digits, step);
2395 else if (last_token == tok_ucs4)
2397 if (ellipsis_token != tok_ellipsis2)
2398 lr_error (ldfile, _("\
2399 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2401 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2402 repertoire, now, last_wch,
2403 class256_bit, class_bit,
2404 ignore_content, handle_digits,
2405 step);
2407 else
2409 assert (last_token == tok_charcode);
2411 if (ellipsis_token != tok_ellipsis3)
2412 lr_error (ldfile, _("\
2413 with character code range values one must use the absolute ellipsis `...'"));
2415 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2416 repertoire, now,
2417 last_charcode,
2418 last_charcode_len,
2419 class256_bit, class_bit,
2420 ignore_content,
2421 handle_digits);
2424 /* Now we have used the last value. */
2425 last_token = tok_none;
2428 /* Next we expect a semicolon or the end of the line. */
2429 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2430 if (now->tok == tok_eol || now->tok == tok_eof)
2431 break;
2433 if (last_token != tok_none
2434 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2436 if (now->tok == tok_ellipsis2_2)
2438 now->tok = tok_ellipsis2;
2439 step = 2;
2441 else if (now->tok == tok_ellipsis4_2)
2443 now->tok = tok_ellipsis4;
2444 step = 2;
2447 ellipsis_token = now->tok;
2449 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2450 continue;
2453 if (now->tok != tok_semicolon)
2454 goto err_label;
2456 /* And get the next character. */
2457 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2459 ellipsis_token = tok_none;
2460 step = 1;
2462 break;
2464 case tok_digit:
2465 /* Ignore the rest of the line if we don't need the input of
2466 this line. */
2467 if (ignore_content)
2469 lr_ignore_rest (ldfile, 0);
2470 break;
2473 handle_tok_digit:
2474 class_bit = _ISwdigit;
2475 class256_bit = _ISdigit;
2476 handle_digits = 1;
2477 goto read_charclass;
2479 case tok_outdigit:
2480 /* Ignore the rest of the line if we don't need the input of
2481 this line. */
2482 if (ignore_content)
2484 lr_ignore_rest (ldfile, 0);
2485 break;
2488 if (ctype->outdigits_act != 0)
2489 lr_error (ldfile, _("\
2490 %s: field `%s' declared more than once"),
2491 "LC_CTYPE", "outdigit");
2492 class_bit = 0;
2493 class256_bit = 0;
2494 handle_digits = 2;
2495 goto read_charclass;
2497 case tok_toupper:
2498 /* Ignore the rest of the line if we don't need the input of
2499 this line. */
2500 if (ignore_content)
2502 lr_ignore_rest (ldfile, 0);
2503 break;
2506 mapidx = 0;
2507 goto read_mapping;
2509 case tok_tolower:
2510 /* Ignore the rest of the line if we don't need the input of
2511 this line. */
2512 if (ignore_content)
2514 lr_ignore_rest (ldfile, 0);
2515 break;
2518 mapidx = 1;
2519 goto read_mapping;
2521 case tok_map:
2522 /* Ignore the rest of the line if we don't need the input of
2523 this line. */
2524 if (ignore_content)
2526 lr_ignore_rest (ldfile, 0);
2527 break;
2530 /* We simply forget the `map' keyword and use the following
2531 operand to determine the mapping. */
2532 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2533 if (now->tok == tok_ident || now->tok == tok_string)
2535 size_t cnt;
2537 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2538 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2539 break;
2541 if (cnt < ctype->map_collection_nr)
2542 free (now->val.str.startmb);
2543 else
2544 /* OK, it's a new map. */
2545 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2547 mapidx = cnt;
2549 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2550 goto err_label;
2551 else
2552 mapidx = now->tok - tok_toupper;
2554 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2555 /* This better should be a semicolon. */
2556 if (now->tok != tok_semicolon)
2557 goto err_label;
2559 read_mapping:
2560 /* Test whether this mapping was already defined. */
2561 if (ctype->tomap_done[mapidx])
2563 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2564 ctype->mapnames[mapidx]);
2565 lr_ignore_rest (ldfile, 0);
2566 break;
2568 ctype->tomap_done[mapidx] = 1;
2570 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2571 while (now->tok != tok_eol && now->tok != tok_eof)
2573 struct charseq *from_seq;
2574 uint32_t from_wch;
2575 struct charseq *to_seq;
2576 uint32_t to_wch;
2578 /* Every pair starts with an opening brace. */
2579 if (now->tok != tok_open_brace)
2580 goto err_label;
2582 /* Next comes the from-value. */
2583 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2584 if (get_character (now, charmap, repertoire, &from_seq,
2585 &from_wch) != 0)
2586 goto err_label;
2588 /* The next is a comma. */
2589 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2590 if (now->tok != tok_comma)
2591 goto err_label;
2593 /* And the other value. */
2594 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2595 if (get_character (now, charmap, repertoire, &to_seq,
2596 &to_wch) != 0)
2597 goto err_label;
2599 /* And the last thing is the closing brace. */
2600 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2601 if (now->tok != tok_close_brace)
2602 goto err_label;
2604 if (!ignore_content)
2606 /* Check whether the mapping converts from an ASCII value
2607 to a non-ASCII value. */
2608 if (from_seq != NULL && from_seq->nbytes == 1
2609 && isascii (from_seq->bytes[0])
2610 && to_seq != NULL && (to_seq->nbytes != 1
2611 || !isascii (to_seq->bytes[0])))
2612 ctype->to_nonascii = 1;
2614 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2615 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2616 /* We can use this value. */
2617 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2618 = to_seq->bytes[0];
2620 if (from_wch != ILLEGAL_CHAR_VALUE
2621 && to_wch != ILLEGAL_CHAR_VALUE)
2622 /* Both correct values. */
2623 *find_idx (ctype, &ctype->map_collection[mapidx],
2624 &ctype->map_collection_max[mapidx],
2625 &ctype->map_collection_act[mapidx],
2626 from_wch) = to_wch;
2629 /* Now comes a semicolon or the end of the line/file. */
2630 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2631 if (now->tok == tok_semicolon)
2632 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2634 break;
2636 case tok_translit_start:
2637 /* Ignore the entire translit section with its peculiar syntax
2638 if we don't need the input. */
2639 if (ignore_content)
2643 lr_ignore_rest (ldfile, 0);
2644 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2646 while (now->tok != tok_translit_end && now->tok != tok_eof);
2648 if (now->tok == tok_eof)
2649 lr_error (ldfile, _(\
2650 "%s: `translit_start' section does not end with `translit_end'"),
2651 "LC_CTYPE");
2653 break;
2656 /* The rest of the line better should be empty. */
2657 lr_ignore_rest (ldfile, 1);
2659 /* We count here the number of allocated entries in the `translit'
2660 array. */
2661 cnt = 0;
2663 ldfile->translate_strings = 1;
2664 ldfile->return_widestr = 1;
2666 /* We proceed until we see the `translit_end' token. */
2667 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2668 now->tok != tok_translit_end && now->tok != tok_eof)
2670 if (now->tok == tok_eol)
2671 /* Ignore empty lines. */
2672 continue;
2674 if (now->tok == tok_include)
2676 /* We have to include locale. */
2677 const char *locale_name;
2678 const char *repertoire_name;
2679 struct translit_include_t *include_stmt, **include_ptr;
2681 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2682 /* This should be a string or an identifier. In any
2683 case something to name a locale. */
2684 if (now->tok != tok_string && now->tok != tok_ident)
2686 translit_syntax:
2687 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2688 lr_ignore_rest (ldfile, 0);
2689 continue;
2691 locale_name = now->val.str.startmb;
2693 /* Next should be a semicolon. */
2694 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2695 if (now->tok != tok_semicolon)
2696 goto translit_syntax;
2698 /* Now the repertoire name. */
2699 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2700 if ((now->tok != tok_string && now->tok != tok_ident)
2701 || now->val.str.startmb == NULL)
2702 goto translit_syntax;
2703 repertoire_name = now->val.str.startmb;
2704 if (repertoire_name[0] == '\0')
2705 /* Ignore the empty string. */
2706 repertoire_name = NULL;
2708 /* Save the include statement for later processing. */
2709 include_stmt = (struct translit_include_t *)
2710 xmalloc (sizeof (struct translit_include_t));
2711 include_stmt->copy_locale = locale_name;
2712 include_stmt->copy_repertoire = repertoire_name;
2713 include_stmt->next = NULL;
2715 include_ptr = &ctype->translit_include;
2716 while (*include_ptr != NULL)
2717 include_ptr = &(*include_ptr)->next;
2718 *include_ptr = include_stmt;
2720 /* The rest of the line must be empty. */
2721 lr_ignore_rest (ldfile, 1);
2723 /* Make sure the locale is read. */
2724 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2725 1, NULL);
2726 continue;
2728 else if (now->tok == tok_default_missing)
2730 uint32_t *wstr;
2732 while (1)
2734 /* We expect a single character or string as the
2735 argument. */
2736 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2737 wstr = read_widestring (ldfile, now, charmap,
2738 repertoire);
2740 if (wstr != NULL)
2742 if (ctype->default_missing != NULL)
2744 lr_error (ldfile, _("\
2745 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2746 WITH_CUR_LOCALE (error_at_line (0, 0,
2747 ctype->default_missing_file,
2748 ctype->default_missing_lineno,
2749 _("\
2750 previous definition was here")));
2752 else
2754 ctype->default_missing = wstr;
2755 ctype->default_missing_file = ldfile->fname;
2756 ctype->default_missing_lineno = ldfile->lineno;
2758 /* We can have more entries, ignore them. */
2759 lr_ignore_rest (ldfile, 0);
2760 break;
2762 else if (wstr == (uint32_t *) -1l)
2763 /* This was a syntax error. */
2764 break;
2766 /* Maybe there is another replacement we can use. */
2767 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2768 if (now->tok == tok_eol || now->tok == tok_eof)
2770 /* Nothing found. We tell the user. */
2771 lr_error (ldfile, _("\
2772 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2773 break;
2775 if (now->tok != tok_semicolon)
2776 goto translit_syntax;
2779 continue;
2781 else if (now->tok == tok_translit_ignore)
2783 read_translit_ignore_entry (ldfile, ctype, charmap,
2784 repertoire);
2785 continue;
2788 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2790 ldfile->return_widestr = 0;
2792 if (now->tok == tok_eof)
2793 lr_error (ldfile, _(\
2794 "%s: `translit_start' section does not end with `translit_end'"),
2795 "LC_CTYPE");
2797 break;
2799 case tok_ident:
2800 /* Ignore the rest of the line if we don't need the input of
2801 this line. */
2802 if (ignore_content)
2804 lr_ignore_rest (ldfile, 0);
2805 break;
2808 /* This could mean one of several things. First test whether
2809 it's a character class name. */
2810 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2811 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2812 break;
2813 if (cnt < ctype->nr_charclass)
2815 class_bit = _ISwbit (cnt);
2816 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2817 free (now->val.str.startmb);
2818 goto read_charclass;
2820 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2821 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2822 break;
2823 if (cnt < ctype->map_collection_nr)
2825 mapidx = cnt;
2826 free (now->val.str.startmb);
2827 goto read_mapping;
2829 break;
2831 case tok_end:
2832 /* Next we assume `LC_CTYPE'. */
2833 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2834 if (now->tok == tok_eof)
2835 break;
2836 if (now->tok == tok_eol)
2837 lr_error (ldfile, _("%s: incomplete `END' line"),
2838 "LC_CTYPE");
2839 else if (now->tok != tok_lc_ctype)
2840 lr_error (ldfile, _("\
2841 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2842 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2843 return;
2845 default:
2846 err_label:
2847 if (now->tok != tok_eof)
2848 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2851 /* Prepare for the next round. */
2852 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2853 nowtok = now->tok;
2856 /* When we come here we reached the end of the file. */
2857 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2861 /* Subroutine of set_class_defaults, below. */
2862 static void
2863 set_one_default (struct locale_ctype_t *ctype,
2864 const struct charmap_t *charmap,
2865 int bitpos, int from, int to)
2867 char tmp[2];
2868 int ch;
2869 int bit = _ISbit (bitpos);
2870 int bitw = _ISwbit (bitpos);
2871 /* Define string. */
2872 strcpy (tmp, "?");
2874 for (ch = from; ch <= to; ++ch)
2876 struct charseq *seq;
2877 tmp[0] = ch;
2879 seq = charmap_find_value (charmap, tmp, 1);
2880 if (seq == NULL)
2882 char buf[10];
2883 sprintf (buf, "U%08X", ch);
2884 seq = charmap_find_value (charmap, buf, 9);
2886 if (seq == NULL)
2888 if (!be_quiet)
2889 WITH_CUR_LOCALE (error (0, 0, _("\
2890 %s: character `%s' not defined while needed as default value"),
2891 "LC_CTYPE", tmp));
2893 else if (seq->nbytes != 1)
2894 WITH_CUR_LOCALE (error (0, 0, _("\
2895 %s: character `%s' in charmap not representable with one byte"),
2896 "LC_CTYPE", tmp));
2897 else
2898 ctype->class256_collection[seq->bytes[0]] |= bit;
2900 /* No need to search here, the ASCII value is also the Unicode
2901 value. */
2902 ELEM (ctype, class_collection, , ch) |= bitw;
2906 static void
2907 set_class_defaults (struct locale_ctype_t *ctype,
2908 const struct charmap_t *charmap,
2909 struct repertoire_t *repertoire)
2911 #define set_default(bitpos, from, to) \
2912 set_one_default (ctype, charmap, bitpos, from, to)
2914 /* These function defines the default values for the classes and conversions
2915 according to POSIX.2 2.5.2.1.
2916 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2917 Don't move them unless you know what you do! */
2919 /* Set default values if keyword was not present. */
2920 if ((ctype->class_done & BITw (tok_upper)) == 0)
2921 /* "If this keyword [lower] is not specified, the lowercase letters
2922 `A' through `Z', ..., shall automatically belong to this class,
2923 with implementation defined character values." [P1003.2, 2.5.2.1] */
2924 set_default (BITPOS (tok_upper), 'A', 'Z');
2926 if ((ctype->class_done & BITw (tok_lower)) == 0)
2927 /* "If this keyword [lower] is not specified, the lowercase letters
2928 `a' through `z', ..., shall automatically belong to this class,
2929 with implementation defined character values." [P1003.2, 2.5.2.1] */
2930 set_default (BITPOS (tok_lower), 'a', 'z');
2932 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2934 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2935 class `lower' *must* be in class `alpha'. */
2936 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2937 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2939 for (size_t cnt = 0; cnt < 256; ++cnt)
2940 if ((ctype->class256_collection[cnt] & mask) != 0)
2941 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2943 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2944 if ((ctype->class_collection[cnt] & maskw) != 0)
2945 ctype->class_collection[cnt] |= BITw (tok_alpha);
2948 if ((ctype->class_done & BITw (tok_digit)) == 0)
2949 /* "If this keyword [digit] is not specified, the digits `0' through
2950 `9', ..., shall automatically belong to this class, with
2951 implementation-defined character values." [P1003.2, 2.5.2.1] */
2952 set_default (BITPOS (tok_digit), '0', '9');
2954 /* "Only characters specified for the `alpha' and `digit' keyword
2955 shall be specified. Characters specified for the keyword `alpha'
2956 and `digit' are automatically included in this class. */
2958 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2959 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2961 for (size_t cnt = 0; cnt < 256; ++cnt)
2962 if ((ctype->class256_collection[cnt] & mask) != 0)
2963 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2965 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2966 if ((ctype->class_collection[cnt] & maskw) != 0)
2967 ctype->class_collection[cnt] |= BITw (tok_alnum);
2970 if ((ctype->class_done & BITw (tok_space)) == 0)
2971 /* "If this keyword [space] is not specified, the characters <space>,
2972 <form-feed>, <newline>, <carriage-return>, <tab>, and
2973 <vertical-tab>, ..., shall automatically belong to this class,
2974 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2976 struct charseq *seq;
2978 seq = charmap_find_value (charmap, "space", 5);
2979 if (seq == NULL)
2980 seq = charmap_find_value (charmap, "SP", 2);
2981 if (seq == NULL)
2982 seq = charmap_find_value (charmap, "U00000020", 9);
2983 if (seq == NULL)
2985 if (!be_quiet)
2986 WITH_CUR_LOCALE (error (0, 0, _("\
2987 %s: character `%s' not defined while needed as default value"),
2988 "LC_CTYPE", "<space>"));
2990 else if (seq->nbytes != 1)
2991 WITH_CUR_LOCALE (error (0, 0, _("\
2992 %s: character `%s' in charmap not representable with one byte"),
2993 "LC_CTYPE", "<space>"));
2994 else
2995 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2997 /* No need to search. */
2998 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
3000 seq = charmap_find_value (charmap, "form-feed", 9);
3001 if (seq == NULL)
3002 seq = charmap_find_value (charmap, "U0000000C", 9);
3003 if (seq == NULL)
3005 if (!be_quiet)
3006 WITH_CUR_LOCALE (error (0, 0, _("\
3007 %s: character `%s' not defined while needed as default value"),
3008 "LC_CTYPE", "<form-feed>"));
3010 else if (seq->nbytes != 1)
3011 WITH_CUR_LOCALE (error (0, 0, _("\
3012 %s: character `%s' in charmap not representable with one byte"),
3013 "LC_CTYPE", "<form-feed>"));
3014 else
3015 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3017 /* No need to search. */
3018 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3021 seq = charmap_find_value (charmap, "newline", 7);
3022 if (seq == NULL)
3023 seq = charmap_find_value (charmap, "U0000000A", 9);
3024 if (seq == NULL)
3026 if (!be_quiet)
3027 WITH_CUR_LOCALE (error (0, 0, _("\
3028 %s: character `%s' not defined while needed as default value"),
3029 "LC_CTYPE", "<newline>"));
3031 else if (seq->nbytes != 1)
3032 WITH_CUR_LOCALE (error (0, 0, _("\
3033 %s: character `%s' in charmap not representable with one byte"),
3034 "LC_CTYPE", "<newline>"));
3035 else
3036 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3038 /* No need to search. */
3039 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3042 seq = charmap_find_value (charmap, "carriage-return", 15);
3043 if (seq == NULL)
3044 seq = charmap_find_value (charmap, "U0000000D", 9);
3045 if (seq == NULL)
3047 if (!be_quiet)
3048 WITH_CUR_LOCALE (error (0, 0, _("\
3049 %s: character `%s' not defined while needed as default value"),
3050 "LC_CTYPE", "<carriage-return>"));
3052 else if (seq->nbytes != 1)
3053 WITH_CUR_LOCALE (error (0, 0, _("\
3054 %s: character `%s' in charmap not representable with one byte"),
3055 "LC_CTYPE", "<carriage-return>"));
3056 else
3057 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3059 /* No need to search. */
3060 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3063 seq = charmap_find_value (charmap, "tab", 3);
3064 if (seq == NULL)
3065 seq = charmap_find_value (charmap, "U00000009", 9);
3066 if (seq == NULL)
3068 if (!be_quiet)
3069 WITH_CUR_LOCALE (error (0, 0, _("\
3070 %s: character `%s' not defined while needed as default value"),
3071 "LC_CTYPE", "<tab>"));
3073 else if (seq->nbytes != 1)
3074 WITH_CUR_LOCALE (error (0, 0, _("\
3075 %s: character `%s' in charmap not representable with one byte"),
3076 "LC_CTYPE", "<tab>"));
3077 else
3078 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3080 /* No need to search. */
3081 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3084 seq = charmap_find_value (charmap, "vertical-tab", 12);
3085 if (seq == NULL)
3086 seq = charmap_find_value (charmap, "U0000000B", 9);
3087 if (seq == NULL)
3089 if (!be_quiet)
3090 WITH_CUR_LOCALE (error (0, 0, _("\
3091 %s: character `%s' not defined while needed as default value"),
3092 "LC_CTYPE", "<vertical-tab>"));
3094 else if (seq->nbytes != 1)
3095 WITH_CUR_LOCALE (error (0, 0, _("\
3096 %s: character `%s' in charmap not representable with one byte"),
3097 "LC_CTYPE", "<vertical-tab>"));
3098 else
3099 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3101 /* No need to search. */
3102 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3105 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3106 /* "If this keyword is not specified, the digits `0' to `9', the
3107 uppercase letters `A' through `F', and the lowercase letters `a'
3108 through `f', ..., shell automatically belong to this class, with
3109 implementation defined character values." [P1003.2, 2.5.2.1] */
3111 set_default (BITPOS (tok_xdigit), '0', '9');
3112 set_default (BITPOS (tok_xdigit), 'A', 'F');
3113 set_default (BITPOS (tok_xdigit), 'a', 'f');
3116 if ((ctype->class_done & BITw (tok_blank)) == 0)
3117 /* "If this keyword [blank] is unspecified, the characters <space> and
3118 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3120 struct charseq *seq;
3122 seq = charmap_find_value (charmap, "space", 5);
3123 if (seq == NULL)
3124 seq = charmap_find_value (charmap, "SP", 2);
3125 if (seq == NULL)
3126 seq = charmap_find_value (charmap, "U00000020", 9);
3127 if (seq == NULL)
3129 if (!be_quiet)
3130 WITH_CUR_LOCALE (error (0, 0, _("\
3131 %s: character `%s' not defined while needed as default value"),
3132 "LC_CTYPE", "<space>"));
3134 else if (seq->nbytes != 1)
3135 WITH_CUR_LOCALE (error (0, 0, _("\
3136 %s: character `%s' in charmap not representable with one byte"),
3137 "LC_CTYPE", "<space>"));
3138 else
3139 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3141 /* No need to search. */
3142 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3145 seq = charmap_find_value (charmap, "tab", 3);
3146 if (seq == NULL)
3147 seq = charmap_find_value (charmap, "U00000009", 9);
3148 if (seq == NULL)
3150 if (!be_quiet)
3151 WITH_CUR_LOCALE (error (0, 0, _("\
3152 %s: character `%s' not defined while needed as default value"),
3153 "LC_CTYPE", "<tab>"));
3155 else if (seq->nbytes != 1)
3156 WITH_CUR_LOCALE (error (0, 0, _("\
3157 %s: character `%s' in charmap not representable with one byte"),
3158 "LC_CTYPE", "<tab>"));
3159 else
3160 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3162 /* No need to search. */
3163 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3166 if ((ctype->class_done & BITw (tok_graph)) == 0)
3167 /* "If this keyword [graph] is not specified, characters specified for
3168 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3169 shall belong to this character class." [P1003.2, 2.5.2.1] */
3171 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3172 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3173 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3174 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3175 BITw (tok_punct);
3177 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3178 if ((ctype->class_collection[cnt] & maskw) != 0)
3179 ctype->class_collection[cnt] |= BITw (tok_graph);
3181 for (size_t cnt = 0; cnt < 256; ++cnt)
3182 if ((ctype->class256_collection[cnt] & mask) != 0)
3183 ctype->class256_collection[cnt] |= BIT (tok_graph);
3186 if ((ctype->class_done & BITw (tok_print)) == 0)
3187 /* "If this keyword [print] is not provided, characters specified for
3188 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3189 and the <space> character shall belong to this character class."
3190 [P1003.2, 2.5.2.1] */
3192 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3193 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3194 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3195 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3196 BITw (tok_punct);
3197 struct charseq *seq;
3199 for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3200 if ((ctype->class_collection[cnt] & maskw) != 0)
3201 ctype->class_collection[cnt] |= BITw (tok_print);
3203 for (size_t cnt = 0; cnt < 256; ++cnt)
3204 if ((ctype->class256_collection[cnt] & mask) != 0)
3205 ctype->class256_collection[cnt] |= BIT (tok_print);
3208 seq = charmap_find_value (charmap, "space", 5);
3209 if (seq == NULL)
3210 seq = charmap_find_value (charmap, "SP", 2);
3211 if (seq == NULL)
3212 seq = charmap_find_value (charmap, "U00000020", 9);
3213 if (seq == NULL)
3215 if (!be_quiet)
3216 WITH_CUR_LOCALE (error (0, 0, _("\
3217 %s: character `%s' not defined while needed as default value"),
3218 "LC_CTYPE", "<space>"));
3220 else if (seq->nbytes != 1)
3221 WITH_CUR_LOCALE (error (0, 0, _("\
3222 %s: character `%s' in charmap not representable with one byte"),
3223 "LC_CTYPE", "<space>"));
3224 else
3225 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3227 /* No need to search. */
3228 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3231 if (ctype->tomap_done[0] == 0)
3232 /* "If this keyword [toupper] is not specified, the lowercase letters
3233 `a' through `z', and their corresponding uppercase letters `A' to
3234 `Z', ..., shall automatically be included, with implementation-
3235 defined character values." [P1003.2, 2.5.2.1] */
3237 char tmp[4];
3238 int ch;
3240 strcpy (tmp, "<?>");
3242 for (ch = 'a'; ch <= 'z'; ++ch)
3244 struct charseq *seq_from, *seq_to;
3246 tmp[1] = (char) ch;
3248 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3249 if (seq_from == NULL)
3251 char buf[10];
3252 sprintf (buf, "U%08X", ch);
3253 seq_from = charmap_find_value (charmap, buf, 9);
3255 if (seq_from == NULL)
3257 if (!be_quiet)
3258 WITH_CUR_LOCALE (error (0, 0, _("\
3259 %s: character `%s' not defined while needed as default value"),
3260 "LC_CTYPE", tmp));
3262 else if (seq_from->nbytes != 1)
3264 if (!be_quiet)
3265 WITH_CUR_LOCALE (error (0, 0, _("\
3266 %s: character `%s' needed as default value not representable with one byte"),
3267 "LC_CTYPE", tmp));
3269 else
3271 /* This conversion is implementation defined. */
3272 tmp[1] = (char) (ch + ('A' - 'a'));
3273 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3274 if (seq_to == NULL)
3276 char buf[10];
3277 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3278 seq_to = charmap_find_value (charmap, buf, 9);
3280 if (seq_to == NULL)
3282 if (!be_quiet)
3283 WITH_CUR_LOCALE (error (0, 0, _("\
3284 %s: character `%s' not defined while needed as default value"),
3285 "LC_CTYPE", tmp));
3287 else if (seq_to->nbytes != 1)
3289 if (!be_quiet)
3290 WITH_CUR_LOCALE (error (0, 0, _("\
3291 %s: character `%s' needed as default value not representable with one byte"),
3292 "LC_CTYPE", tmp));
3294 else
3295 /* The index [0] is determined by the order of the
3296 `ctype_map_newP' calls in `ctype_startup'. */
3297 ctype->map256_collection[0][seq_from->bytes[0]]
3298 = seq_to->bytes[0];
3301 /* No need to search. */
3302 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3306 if (ctype->tomap_done[1] == 0)
3307 /* "If this keyword [tolower] is not specified, the mapping shall be
3308 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3310 for (size_t cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3311 if (ctype->map_collection[0][cnt] != 0)
3312 ELEM (ctype, map_collection, [1],
3313 ctype->map_collection[0][cnt])
3314 = ctype->charnames[cnt];
3316 for (size_t cnt = 0; cnt < 256; ++cnt)
3317 if (ctype->map256_collection[0][cnt] != 0)
3318 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3321 if (ctype->outdigits_act != 10)
3323 if (ctype->outdigits_act != 0)
3324 WITH_CUR_LOCALE (error (0, 0, _("\
3325 %s: field `%s' does not contain exactly ten entries"),
3326 "LC_CTYPE", "outdigit"));
3328 for (size_t cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3330 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3331 (char *) digits + cnt,
3334 if (ctype->mboutdigits[cnt] == NULL)
3335 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3336 longnames[cnt],
3337 strlen (longnames[cnt]));
3339 if (ctype->mboutdigits[cnt] == NULL)
3340 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3341 uninames[cnt], 9);
3343 if (ctype->mboutdigits[cnt] == NULL)
3345 /* Provide a replacement. */
3346 WITH_CUR_LOCALE (error (0, 0, _("\
3347 no output digits defined and none of the standard names in the charmap")));
3349 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3350 sizeof (struct charseq)
3351 + 1);
3353 /* This is better than nothing. */
3354 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3355 ctype->mboutdigits[cnt]->nbytes = 1;
3358 ctype->wcoutdigits[cnt] = L'0' + cnt;
3361 ctype->outdigits_act = 10;
3364 #undef set_default
3368 /* Initialize. Assumes t->p and t->q have already been set. */
3369 static inline void
3370 wctype_table_init (struct wctype_table *t)
3372 t->level1 = NULL;
3373 t->level1_alloc = t->level1_size = 0;
3374 t->level2 = NULL;
3375 t->level2_alloc = t->level2_size = 0;
3376 t->level3 = NULL;
3377 t->level3_alloc = t->level3_size = 0;
3380 /* Retrieve an entry. */
3381 static inline int
3382 wctype_table_get (struct wctype_table *t, uint32_t wc)
3384 uint32_t index1 = wc >> (t->q + t->p + 5);
3385 if (index1 < t->level1_size)
3387 uint32_t lookup1 = t->level1[index1];
3388 if (lookup1 != EMPTY)
3390 uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1))
3391 + (lookup1 << t->q);
3392 uint32_t lookup2 = t->level2[index2];
3393 if (lookup2 != EMPTY)
3395 uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1))
3396 + (lookup2 << t->p);
3397 uint32_t lookup3 = t->level3[index3];
3398 uint32_t index4 = wc & 0x1f;
3400 return (lookup3 >> index4) & 1;
3404 return 0;
3407 /* Add one entry. */
3408 static void
3409 wctype_table_add (struct wctype_table *t, uint32_t wc)
3411 uint32_t index1 = wc >> (t->q + t->p + 5);
3412 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3413 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3414 uint32_t index4 = wc & 0x1f;
3415 size_t i, i1, i2;
3417 if (index1 >= t->level1_size)
3419 if (index1 >= t->level1_alloc)
3421 size_t alloc = 2 * t->level1_alloc;
3422 if (alloc <= index1)
3423 alloc = index1 + 1;
3424 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3425 alloc * sizeof (uint32_t));
3426 t->level1_alloc = alloc;
3428 while (index1 >= t->level1_size)
3429 t->level1[t->level1_size++] = EMPTY;
3432 if (t->level1[index1] == EMPTY)
3434 if (t->level2_size == t->level2_alloc)
3436 size_t alloc = 2 * t->level2_alloc + 1;
3437 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3438 (alloc << t->q) * sizeof (uint32_t));
3439 t->level2_alloc = alloc;
3441 i1 = t->level2_size << t->q;
3442 i2 = (t->level2_size + 1) << t->q;
3443 for (i = i1; i < i2; i++)
3444 t->level2[i] = EMPTY;
3445 t->level1[index1] = t->level2_size++;
3448 index2 += t->level1[index1] << t->q;
3450 if (t->level2[index2] == EMPTY)
3452 if (t->level3_size == t->level3_alloc)
3454 size_t alloc = 2 * t->level3_alloc + 1;
3455 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3456 (alloc << t->p) * sizeof (uint32_t));
3457 t->level3_alloc = alloc;
3459 i1 = t->level3_size << t->p;
3460 i2 = (t->level3_size + 1) << t->p;
3461 for (i = i1; i < i2; i++)
3462 t->level3[i] = 0;
3463 t->level2[index2] = t->level3_size++;
3466 index3 += t->level2[index2] << t->p;
3468 t->level3[index3] |= (uint32_t)1 << index4;
3471 /* Finalize and shrink. */
3472 static void
3473 add_locale_wctype_table (struct locale_file *file, struct wctype_table *t)
3475 size_t i, j, k;
3476 uint32_t reorder3[t->level3_size];
3477 uint32_t reorder2[t->level2_size];
3478 uint32_t level2_offset, level3_offset;
3480 /* Uniquify level3 blocks. */
3481 k = 0;
3482 for (j = 0; j < t->level3_size; j++)
3484 for (i = 0; i < k; i++)
3485 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3486 (1 << t->p) * sizeof (uint32_t)) == 0)
3487 break;
3488 /* Relocate block j to block i. */
3489 reorder3[j] = i;
3490 if (i == k)
3492 if (i != j)
3493 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3494 (1 << t->p) * sizeof (uint32_t));
3495 k++;
3498 t->level3_size = k;
3500 for (i = 0; i < (t->level2_size << t->q); i++)
3501 if (t->level2[i] != EMPTY)
3502 t->level2[i] = reorder3[t->level2[i]];
3504 /* Uniquify level2 blocks. */
3505 k = 0;
3506 for (j = 0; j < t->level2_size; j++)
3508 for (i = 0; i < k; i++)
3509 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3510 (1 << t->q) * sizeof (uint32_t)) == 0)
3511 break;
3512 /* Relocate block j to block i. */
3513 reorder2[j] = i;
3514 if (i == k)
3516 if (i != j)
3517 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3518 (1 << t->q) * sizeof (uint32_t));
3519 k++;
3522 t->level2_size = k;
3524 for (i = 0; i < t->level1_size; i++)
3525 if (t->level1[i] != EMPTY)
3526 t->level1[i] = reorder2[t->level1[i]];
3528 t->result_size =
3529 5 * sizeof (uint32_t)
3530 + t->level1_size * sizeof (uint32_t)
3531 + (t->level2_size << t->q) * sizeof (uint32_t)
3532 + (t->level3_size << t->p) * sizeof (uint32_t);
3534 level2_offset =
3535 5 * sizeof (uint32_t)
3536 + t->level1_size * sizeof (uint32_t);
3537 level3_offset =
3538 5 * sizeof (uint32_t)
3539 + t->level1_size * sizeof (uint32_t)
3540 + (t->level2_size << t->q) * sizeof (uint32_t);
3542 start_locale_structure (file);
3543 add_locale_uint32 (file, t->q + t->p + 5);
3544 add_locale_uint32 (file, t->level1_size);
3545 add_locale_uint32 (file, t->p + 5);
3546 add_locale_uint32 (file, (1 << t->q) - 1);
3547 add_locale_uint32 (file, (1 << t->p) - 1);
3549 for (i = 0; i < t->level1_size; i++)
3550 add_locale_uint32
3551 (file,
3552 t->level1[i] == EMPTY
3554 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3556 for (i = 0; i < (t->level2_size << t->q); i++)
3557 add_locale_uint32
3558 (file,
3559 t->level2[i] == EMPTY
3561 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3563 add_locale_uint32_array (file, t->level3, t->level3_size << t->p);
3564 end_locale_structure (file);
3566 if (t->level1_alloc > 0)
3567 free (t->level1);
3568 if (t->level2_alloc > 0)
3569 free (t->level2);
3570 if (t->level3_alloc > 0)
3571 free (t->level3);
3574 /* Flattens the included transliterations into a translit list.
3575 Inserts them in the list at `cursor', and returns the new cursor. */
3576 static struct translit_t **
3577 translit_flatten (struct locale_ctype_t *ctype,
3578 const struct charmap_t *charmap,
3579 struct translit_t **cursor)
3581 while (ctype->translit_include != NULL)
3583 const char *copy_locale = ctype->translit_include->copy_locale;
3584 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3585 struct localedef_t *other;
3587 /* Unchain the include statement. During the depth-first traversal
3588 we don't want to visit any locale more than once. */
3589 ctype->translit_include = ctype->translit_include->next;
3591 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3593 if (other == NULL || other->categories[LC_CTYPE].ctype == NULL)
3595 WITH_CUR_LOCALE (error (0, 0, _("\
3596 %s: transliteration data from locale `%s' not available"),
3597 "LC_CTYPE", copy_locale));
3599 else
3601 struct locale_ctype_t *other_ctype =
3602 other->categories[LC_CTYPE].ctype;
3604 cursor = translit_flatten (other_ctype, charmap, cursor);
3605 assert (other_ctype->translit_include == NULL);
3607 if (other_ctype->translit != NULL)
3609 /* Insert the other_ctype->translit list at *cursor. */
3610 struct translit_t *endp = other_ctype->translit;
3611 while (endp->next != NULL)
3612 endp = endp->next;
3614 endp->next = *cursor;
3615 *cursor = other_ctype->translit;
3617 /* Avoid any risk of circular lists. */
3618 other_ctype->translit = NULL;
3620 cursor = &endp->next;
3623 if (ctype->default_missing == NULL)
3624 ctype->default_missing = other_ctype->default_missing;
3628 return cursor;
3631 static void
3632 allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3633 struct repertoire_t *repertoire)
3635 size_t idx, nr;
3636 const void *key;
3637 size_t len;
3638 void *vdata;
3639 void *curs;
3641 /* You wonder about this amount of memory? This is only because some
3642 users do not manage to address the array with unsigned values or
3643 data types with range >= 256. '\200' would result in the array
3644 index -128. To help these poor people we duplicate the entries for
3645 128 up to 255 below the entry for \0. */
3646 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3647 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
3648 ctype->class_b = (uint32_t **)
3649 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3650 ctype->class_3level = (struct wctype_table *)
3651 xmalloc (ctype->nr_charclass * sizeof (struct wctype_table));
3653 /* This is the array accessed using the multibyte string elements. */
3654 for (idx = 0; idx < 256; ++idx)
3655 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3657 /* Mirror first 127 entries. We must take care that entry -1 is not
3658 mirrored because EOF == -1. */
3659 for (idx = 0; idx < 127; ++idx)
3660 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3662 /* The 32 bit array contains all characters < 0x100. */
3663 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3664 if (ctype->charnames[idx] < 0x100)
3665 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3667 for (nr = 0; nr < ctype->nr_charclass; nr++)
3669 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3671 /* We only set CLASS_B for the bits in the ISO C classes, not
3672 the user defined classes. The number should not change but
3673 who knows. */
3674 #define LAST_ISO_C_BIT 11
3675 if (nr <= LAST_ISO_C_BIT)
3676 for (idx = 0; idx < 256; ++idx)
3677 if (ctype->class256_collection[idx] & _ISbit (nr))
3678 ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f);
3681 for (nr = 0; nr < ctype->nr_charclass; nr++)
3683 struct wctype_table *t;
3685 t = &ctype->class_3level[nr];
3686 t->p = 4; /* or: 5 */
3687 t->q = 7; /* or: 6 */
3688 wctype_table_init (t);
3690 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3691 if (ctype->class_collection[idx] & _ISwbit (nr))
3692 wctype_table_add (t, ctype->charnames[idx]);
3694 if (verbose)
3695 WITH_CUR_LOCALE (fprintf (stderr, _("\
3696 %s: table for class \"%s\": %lu bytes\n"),
3697 "LC_CTYPE", ctype->classnames[nr],
3698 (unsigned long int) t->result_size));
3701 /* Room for table of mappings. */
3702 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3703 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3704 * sizeof (uint32_t *));
3705 ctype->map_3level = (struct wctrans_table *)
3706 xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table));
3708 /* Fill in all mappings. */
3709 for (idx = 0; idx < 2; ++idx)
3711 unsigned int idx2;
3713 /* Allocate table. */
3714 ctype->map_b[idx] = (uint32_t *)
3715 xmalloc ((256 + 128) * sizeof (uint32_t));
3717 /* Copy values from collection. */
3718 for (idx2 = 0; idx2 < 256; ++idx2)
3719 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3721 /* Mirror first 127 entries. We must take care not to map entry
3722 -1 because EOF == -1. */
3723 for (idx2 = 0; idx2 < 127; ++idx2)
3724 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3726 /* EOF must map to EOF. */
3727 ctype->map_b[idx][127] = EOF;
3730 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3732 unsigned int idx2;
3734 /* Allocate table. */
3735 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3737 /* Copy values from collection. Default is identity mapping. */
3738 for (idx2 = 0; idx2 < 256; ++idx2)
3739 ctype->map32_b[idx][idx2] =
3740 (ctype->map_collection[idx][idx2] != 0
3741 ? ctype->map_collection[idx][idx2]
3742 : idx2);
3745 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3747 struct wctrans_table *t;
3749 t = &ctype->map_3level[nr];
3750 t->p = 7;
3751 t->q = 9;
3752 wctrans_table_init (t);
3754 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3755 if (ctype->map_collection[nr][idx] != 0)
3756 wctrans_table_add (t, ctype->charnames[idx],
3757 ctype->map_collection[nr][idx]);
3759 if (verbose)
3760 WITH_CUR_LOCALE (fprintf (stderr, _("\
3761 %s: table for map \"%s\": %lu bytes\n"),
3762 "LC_CTYPE", ctype->mapnames[nr],
3763 (unsigned long int) t->result_size));
3766 /* Extra array for class and map names. */
3767 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3768 * sizeof (uint32_t));
3769 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3770 * sizeof (uint32_t));
3772 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3773 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3775 /* Array for width information. Because the expected widths are very
3776 small (never larger than 2) we use only one single byte. This
3777 saves space.
3778 We put only printable characters in the table. wcwidth is specified
3779 to return -1 for non-printable characters. Doing the check here
3780 saves a run-time check.
3781 But we put L'\0' in the table. This again saves a run-time check. */
3783 struct wcwidth_table *t;
3785 t = &ctype->width;
3786 t->p = 7;
3787 t->q = 9;
3788 wcwidth_table_init (t);
3790 /* First set all the printable characters of the character set to
3791 the default width. */
3792 curs = NULL;
3793 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3795 struct charseq *data = (struct charseq *) vdata;
3797 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3798 data->ucs4 = repertoire_find_value (ctype->repertoire,
3799 data->name, len);
3801 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3803 uint32_t *class_bits =
3804 find_idx (ctype, &ctype->class_collection, NULL,
3805 &ctype->class_collection_act, data->ucs4);
3807 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3808 wcwidth_table_add (t, data->ucs4, charmap->width_default);
3812 /* Now add the explicitly specified widths. */
3813 if (charmap->width_rules != NULL)
3814 for (size_t cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3816 unsigned char bytes[charmap->mb_cur_max];
3817 int nbytes = charmap->width_rules[cnt].from->nbytes;
3819 /* We have the range of character for which the width is
3820 specified described using byte sequences of the multibyte
3821 charset. We have to convert this to UCS4 now. And we
3822 cannot simply convert the beginning and the end of the
3823 sequence, we have to iterate over the byte sequence and
3824 convert it for every single character. */
3825 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3827 while (nbytes < charmap->width_rules[cnt].to->nbytes
3828 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3829 nbytes) <= 0)
3831 /* Find the UCS value for `bytes'. */
3832 int inner;
3833 uint32_t wch;
3834 struct charseq *seq =
3835 charmap_find_symbol (charmap, (char *) bytes, nbytes);
3837 if (seq == NULL)
3838 wch = ILLEGAL_CHAR_VALUE;
3839 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3840 wch = seq->ucs4;
3841 else
3842 wch = repertoire_find_value (ctype->repertoire, seq->name,
3843 strlen (seq->name));
3845 if (wch != ILLEGAL_CHAR_VALUE)
3847 /* Store the value. */
3848 uint32_t *class_bits =
3849 find_idx (ctype, &ctype->class_collection, NULL,
3850 &ctype->class_collection_act, wch);
3852 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3853 wcwidth_table_add (t, wch,
3854 charmap->width_rules[cnt].width);
3857 /* "Increment" the bytes sequence. */
3858 inner = nbytes - 1;
3859 while (inner >= 0 && bytes[inner] == 0xff)
3860 --inner;
3862 if (inner < 0)
3864 /* We have to extend the byte sequence. */
3865 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3866 break;
3868 bytes[0] = 1;
3869 memset (&bytes[1], 0, nbytes);
3870 ++nbytes;
3872 else
3874 ++bytes[inner];
3875 while (++inner < nbytes)
3876 bytes[inner] = 0;
3881 /* Set the width of L'\0' to 0. */
3882 wcwidth_table_add (t, 0, 0);
3884 if (verbose)
3885 WITH_CUR_LOCALE (fprintf (stderr, _("%s: table for width: %lu bytes\n"),
3886 "LC_CTYPE", (unsigned long int) t->result_size));
3889 /* Set MB_CUR_MAX. */
3890 ctype->mb_cur_max = charmap->mb_cur_max;
3892 /* Now determine the table for the transliteration information.
3894 XXX It is not yet clear to me whether it is worth implementing a
3895 complicated algorithm which uses a hash table to locate the entries.
3896 For now I'll use a simple array which can be searching using binary
3897 search. */
3898 if (ctype->translit_include != NULL)
3899 /* Traverse the locales mentioned in the `include' statements in a
3900 depth-first way and fold in their transliteration information. */
3901 translit_flatten (ctype, charmap, &ctype->translit);
3903 if (ctype->translit != NULL)
3905 /* First count how many entries we have. This is the upper limit
3906 since some entries from the included files might be overwritten. */
3907 size_t number = 0;
3908 struct translit_t *runp = ctype->translit;
3909 struct translit_t **sorted;
3910 size_t from_len, to_len;
3912 while (runp != NULL)
3914 ++number;
3915 runp = runp->next;
3918 /* Next we allocate an array large enough and fill in the values. */
3919 sorted = (struct translit_t **) alloca (number
3920 * sizeof (struct translit_t **));
3921 runp = ctype->translit;
3922 number = 0;
3925 /* Search for the place where to insert this string.
3926 XXX Better use a real sorting algorithm later. */
3927 size_t idx = 0;
3928 int replace = 0;
3930 while (idx < number)
3932 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3933 (const wchar_t *) runp->from);
3934 if (res == 0)
3936 replace = 1;
3937 break;
3939 if (res > 0)
3940 break;
3941 ++idx;
3944 if (replace)
3945 sorted[idx] = runp;
3946 else
3948 memmove (&sorted[idx + 1], &sorted[idx],
3949 (number - idx) * sizeof (struct translit_t *));
3950 sorted[idx] = runp;
3951 ++number;
3954 runp = runp->next;
3956 while (runp != NULL);
3958 /* The next step is putting all the possible transliteration
3959 strings in one memory block so that we can write it out.
3960 We need several different blocks:
3961 - index to the from-string array
3962 - from-string array
3963 - index to the to-string array
3964 - to-string array.
3966 from_len = to_len = 0;
3967 for (size_t cnt = 0; cnt < number; ++cnt)
3969 struct translit_to_t *srunp;
3970 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3971 srunp = sorted[cnt]->to;
3972 while (srunp != NULL)
3974 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3975 srunp = srunp->next;
3977 /* Plus one for the extra NUL character marking the end of
3978 the list for the current entry. */
3979 ++to_len;
3982 /* We can allocate the arrays for the results. */
3983 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3984 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3985 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3986 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3988 from_len = 0;
3989 to_len = 0;
3990 for (size_t cnt = 0; cnt < number; ++cnt)
3992 size_t len;
3993 struct translit_to_t *srunp;
3995 ctype->translit_from_idx[cnt] = from_len;
3996 ctype->translit_to_idx[cnt] = to_len;
3998 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3999 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
4000 (const wchar_t *) sorted[cnt]->from, len);
4001 from_len += len;
4003 ctype->translit_to_idx[cnt] = to_len;
4004 srunp = sorted[cnt]->to;
4005 while (srunp != NULL)
4007 len = wcslen ((const wchar_t *) srunp->str) + 1;
4008 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
4009 (const wchar_t *) srunp->str, len);
4010 to_len += len;
4011 srunp = srunp->next;
4013 ctype->translit_to_tbl[to_len++] = L'\0';
4016 /* Store the information about the length. */
4017 ctype->translit_idx_size = number;
4018 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
4019 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
4021 else
4023 ctype->translit_from_idx = no_str;
4024 ctype->translit_from_tbl = no_str;
4025 ctype->translit_to_tbl = no_str;
4026 ctype->translit_idx_size = 0;
4027 ctype->translit_from_tbl_size = 0;
4028 ctype->translit_to_tbl_size = 0;