Update copyright notices with scripts/update-copyrights
[glibc.git] / locale / programs / ld-ctype.c
blob505cb13c54ffe1d3bb9a4561058f063fc174731b
1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <alloca.h>
23 #include <byteswap.h>
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <obstack.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <wchar.h>
31 #include <wctype.h>
32 #include <stdint.h>
33 #include <sys/uio.h>
35 #include "localedef.h"
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
43 #include <assert.h>
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  The constants are unsigned so that setting bit 31 does not
   shift a one into the sign bit of an `int' (undefined behaviour).  */
# define _ISwspecial1   (1U << 29)
# define _ISwspecial2   (1U << 30)
# define _ISwspecial3   (1U << 31)
#endif


/* The bit used for representing a special class.  BITPOS turns a class
   token into a zero-based bit number relative to tok_upper; BIT and BITw
   feed that number to _ISbit and _ISwbit respectively.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Yield an lvalue for the entry belonging to VALUE in COLLECTION, a
   member of *CTYPE that has companion COLLECTION_max and COLLECTION_act
   members.  IDX is either empty or a subscript such as `[n]' selecting
   one of several parallel collections.  The lookup is done by find_idx.  */
#define ELEM(ctype, collection, idx, value)                                   \
  (*find_idx ((ctype), &(ctype)->collection idx,                              \
              &(ctype)->collection##_max idx,                                 \
              &(ctype)->collection##_act idx, (value)))


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
/* Type to describe a transliteration action.  We have a possibly
   multiple character from-string and a set of multiple character
   to-strings.  All are 32bit values since this is what is used in
   the gconv functions.  */
struct translit_to_t
{
  /* One replacement string.  */
  uint32_t *str;

  /* Next alternative replacement, or NULL.  */
  struct translit_to_t *next;
};

struct translit_t
{
  /* The string to be replaced.  */
  uint32_t *from;

  /* Presumably the source location of the rule, kept for diagnostics;
     verify against the code that fills these in.  */
  const char *fname;
  size_t lineno;

  /* Head of the list of replacement strings.  */
  struct translit_to_t *to;

  /* Next transliteration rule, or NULL.  */
  struct translit_t *next;
};
/* One entry of the transliteration ignore list.  The three values are
   written to the locale file as a FROM, TO, STEP triple; presumably they
   describe a character range with a stride (confirm against the parser).  */
struct translit_ignore_t
{
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* Presumably the source location of the entry, kept for diagnostics.  */
  const char *fname;
  size_t lineno;

  /* Next entry, or NULL.  */
  struct translit_ignore_t *next;
};
/* Type to describe a transliteration include statement.  */
struct translit_include_t
{
  /* Name of the locale and of the repertoire named by the include
     statement (meaning inferred from the member names; confirm against
     the parser).  */
  const char *copy_locale;
  const char *copy_repertoire;

  /* Next include statement, or NULL.  */
  struct translit_include_t *next;
};
118 /* Sparse table of uint32_t. */
119 #define TABLE idx_table
120 #define ELEMENT uint32_t
121 #define DEFAULT ((uint32_t) ~0)
122 #define NO_ADD_LOCALE
123 #include "3level.h"
125 #define TABLE wcwidth_table
126 #define ELEMENT uint8_t
127 #define DEFAULT 0xff
128 #include "3level.h"
130 #define TABLE wctrans_table
131 #define ELEMENT int32_t
132 #define DEFAULT 0
133 #define wctrans_table_add wctrans_table_add_internal
134 #include "3level.h"
135 #undef wctrans_table_add
/* The wctrans_table must actually store the difference between the
   desired result and the argument.  Record in T that WC maps to
   MAPPED_WC by storing MAPPED_WC - WC.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  wctrans_table_add_internal (t, wc, mapped_wc - wc);
}
/* Construction of sparse 3-level tables.
   See wchar-lookup.h for their structure and the meaning of p and q.  */

struct wctype_table
{
  /* Parameters.  */
  unsigned int p;
  unsigned int q;
  /* Working representation.  For each level there is the allocated
     capacity, the number of entries in use and the storage itself.  */
  size_t level1_alloc;
  size_t level1_size;
  uint32_t *level1;
  size_t level2_alloc;
  size_t level2_size;
  uint32_t *level2;
  size_t level3_alloc;
  size_t level3_size;
  uint32_t *level3;
  size_t result_size;
};

/* Append the table T to the locale file FILE.  */
static void add_locale_wctype_table (struct locale_file *file,
                                     struct wctype_table *t);
168 /* The real definition of the struct for the LC_CTYPE locale. */
169 struct locale_ctype_t
171 uint32_t *charnames;
172 size_t charnames_max;
173 size_t charnames_act;
174 /* An index lookup table, to speedup find_idx. */
175 struct idx_table charnames_idx;
177 struct repertoire_t *repertoire;
179 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
180 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
181 size_t nr_charclass;
182 const char *classnames[MAX_NR_CHARCLASS];
183 uint32_t last_class_char;
184 uint32_t class256_collection[256];
185 uint32_t *class_collection;
186 size_t class_collection_max;
187 size_t class_collection_act;
188 uint32_t class_done;
189 uint32_t class_offset;
191 struct charseq **mbdigits;
192 size_t mbdigits_act;
193 size_t mbdigits_max;
194 uint32_t *wcdigits;
195 size_t wcdigits_act;
196 size_t wcdigits_max;
198 struct charseq *mboutdigits[10];
199 uint32_t wcoutdigits[10];
200 size_t outdigits_act;
202 /* If the following number ever turns out to be too small simply
203 increase it. But I doubt it will. --drepper@gnu */
204 #define MAX_NR_CHARMAP 16
205 const char *mapnames[MAX_NR_CHARMAP];
206 uint32_t *map_collection[MAX_NR_CHARMAP];
207 uint32_t map256_collection[2][256];
208 size_t map_collection_max[MAX_NR_CHARMAP];
209 size_t map_collection_act[MAX_NR_CHARMAP];
210 size_t map_collection_nr;
211 size_t last_map_idx;
212 int tomap_done[MAX_NR_CHARMAP];
213 uint32_t map_offset;
215 /* Transliteration information. */
216 struct translit_include_t *translit_include;
217 struct translit_t *translit;
218 struct translit_ignore_t *translit_ignore;
219 uint32_t ntranslit_ignore;
221 uint32_t *default_missing;
222 const char *default_missing_file;
223 size_t default_missing_lineno;
225 uint32_t to_nonascii;
226 uint32_t nonascii_case;
228 /* The arrays for the binary representation. */
229 char_class_t *ctype_b;
230 char_class32_t *ctype32_b;
231 uint32_t **map_b;
232 uint32_t **map32_b;
233 uint32_t **class_b;
234 struct wctype_table *class_3level;
235 struct wctrans_table *map_3level;
236 uint32_t *class_name_ptr;
237 uint32_t *map_name_ptr;
238 struct wcwidth_table width;
239 uint32_t mb_cur_max;
240 const char *codeset_name;
241 uint32_t *translit_from_idx;
242 uint32_t *translit_from_tbl;
243 uint32_t *translit_to_idx;
244 uint32_t *translit_to_tbl;
245 uint32_t translit_idx_size;
246 size_t translit_from_tbl_size;
247 size_t translit_to_tbl_size;
249 struct obstack mempool;
/* Marker for an empty slot.  This has the value 0xFFFFFFFF, regardless
   whether 'int' is 16 bit, 32 bit, or 64 bit.  */
#define EMPTY ((uint32_t) ~0)


/* Allocation primitives used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free


/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           const struct charmap_t *charmap,
                           struct localedef_t *copy_locale,
                           int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, const struct charmap_t *charmap);
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                const struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             const struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* Names under which the ten decimal digits may appear in a charmap:
   spelled-out names, Unicode symbol names, and the ASCII characters
   themselves.  Entry N of each table belongs to the digit N.  */
static const char *const longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
static const char *const uninames[] =
{
  "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
  "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
};
static const unsigned char digits[] = "0123456789";
295 static void
296 ctype_startup (struct linereader *lr, struct localedef_t *locale,
297 const struct charmap_t *charmap,
298 struct localedef_t *copy_locale, int ignore_content)
300 unsigned int cnt;
301 struct locale_ctype_t *ctype;
303 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
305 if (copy_locale == NULL)
307 /* Allocate the needed room. */
308 locale->categories[LC_CTYPE].ctype = ctype =
309 (struct locale_ctype_t *) xcalloc (1,
310 sizeof (struct locale_ctype_t));
312 /* We have seen no names yet. */
313 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
314 ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max
315 * sizeof (uint32_t));
316 for (cnt = 0; cnt < 256; ++cnt)
317 ctype->charnames[cnt] = cnt;
318 ctype->charnames_act = 256;
319 idx_table_init (&ctype->charnames_idx);
321 /* Fill character class information. */
322 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
323 /* The order of the following instructions determines the bit
324 positions! */
325 ctype_class_new (lr, ctype, "upper");
326 ctype_class_new (lr, ctype, "lower");
327 ctype_class_new (lr, ctype, "alpha");
328 ctype_class_new (lr, ctype, "digit");
329 ctype_class_new (lr, ctype, "xdigit");
330 ctype_class_new (lr, ctype, "space");
331 ctype_class_new (lr, ctype, "print");
332 ctype_class_new (lr, ctype, "graph");
333 ctype_class_new (lr, ctype, "blank");
334 ctype_class_new (lr, ctype, "cntrl");
335 ctype_class_new (lr, ctype, "punct");
336 ctype_class_new (lr, ctype, "alnum");
337 #ifdef PREDEFINED_CLASSES
338 /* The following are extensions from ISO 14652. */
339 ctype_class_new (lr, ctype, "left_to_right");
340 ctype_class_new (lr, ctype, "right_to_left");
341 ctype_class_new (lr, ctype, "num_terminator");
342 ctype_class_new (lr, ctype, "num_separator");
343 ctype_class_new (lr, ctype, "segment_separator");
344 ctype_class_new (lr, ctype, "block_separator");
345 ctype_class_new (lr, ctype, "direction_control");
346 ctype_class_new (lr, ctype, "sym_swap_layout");
347 ctype_class_new (lr, ctype, "char_shape_selector");
348 ctype_class_new (lr, ctype, "num_shape_selector");
349 ctype_class_new (lr, ctype, "non_spacing");
350 ctype_class_new (lr, ctype, "non_spacing_level3");
351 ctype_class_new (lr, ctype, "normal_connect");
352 ctype_class_new (lr, ctype, "r_connect");
353 ctype_class_new (lr, ctype, "no_connect");
354 ctype_class_new (lr, ctype, "no_connect-space");
355 ctype_class_new (lr, ctype, "vowel_connect");
356 #endif
358 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
359 ctype->class_collection
360 = (uint32_t *) xcalloc (sizeof (unsigned long int),
361 ctype->class_collection_max);
362 ctype->class_collection_act = 256;
364 /* Fill character map information. */
365 ctype->last_map_idx = MAX_NR_CHARMAP;
366 ctype_map_new (lr, ctype, "toupper", charmap);
367 ctype_map_new (lr, ctype, "tolower", charmap);
368 #ifdef PREDEFINED_CLASSES
369 ctype_map_new (lr, ctype, "tosymmetric", charmap);
370 #endif
372 /* Fill first 256 entries in `toXXX' arrays. */
373 for (cnt = 0; cnt < 256; ++cnt)
375 ctype->map_collection[0][cnt] = cnt;
376 ctype->map_collection[1][cnt] = cnt;
377 #ifdef PREDEFINED_CLASSES
378 ctype->map_collection[2][cnt] = cnt;
379 #endif
380 ctype->map256_collection[0][cnt] = cnt;
381 ctype->map256_collection[1][cnt] = cnt;
384 if (enc_not_ascii_compatible)
385 ctype->to_nonascii = 1;
387 obstack_init (&ctype->mempool);
389 else
390 ctype = locale->categories[LC_CTYPE].ctype =
391 copy_locale->categories[LC_CTYPE].ctype;
396 void
397 ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap)
399 /* See POSIX.2, table 2-6 for the meaning of the following table. */
400 #define NCLASS 12
401 static const struct
403 const char *name;
404 const char allow[NCLASS];
406 valid_table[NCLASS] =
408 /* The order is important. See token.h for more information.
409 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
410 { "upper", "--MX-XDDXXX-" },
411 { "lower", "--MX-XDDXXX-" },
412 { "alpha", "---X-XDDXXX-" },
413 { "digit", "XXX--XDDXXX-" },
414 { "xdigit", "-----XDDXXX-" },
415 { "space", "XXXXX------X" },
416 { "print", "---------X--" },
417 { "graph", "---------X--" },
418 { "blank", "XXXXXM-----X" },
419 { "cntrl", "XXXXX-XX--XX" },
420 { "punct", "XXXXX-DD-X-X" },
421 { "alnum", "-----XDDXXX-" }
423 size_t cnt;
424 int cls1, cls2;
425 uint32_t space_value;
426 struct charseq *space_seq;
427 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
428 int warned;
429 const void *key;
430 size_t len;
431 void *vdata;
432 void *curs;
434 /* Now resolve copying and also handle completely missing definitions. */
435 if (ctype == NULL)
437 const char *repertoire_name;
439 /* First see whether we were supposed to copy. If yes, find the
440 actual definition. */
441 if (locale->copy_name[LC_CTYPE] != NULL)
443 /* Find the copying locale. This has to happen transitively since
444 the locale we are copying from might also copying another one. */
445 struct localedef_t *from = locale;
448 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
449 from->repertoire_name, charmap);
450 while (from->categories[LC_CTYPE].ctype == NULL
451 && from->copy_name[LC_CTYPE] != NULL);
453 ctype = locale->categories[LC_CTYPE].ctype
454 = from->categories[LC_CTYPE].ctype;
457 /* If there is still no definition issue an warning and create an
458 empty one. */
459 if (ctype == NULL)
461 if (! be_quiet)
462 WITH_CUR_LOCALE (error (0, 0, _("\
463 No definition for %s category found"), "LC_CTYPE"));
464 ctype_startup (NULL, locale, charmap, NULL, 0);
465 ctype = locale->categories[LC_CTYPE].ctype;
468 /* Get the repertoire we have to use. */
469 repertoire_name = locale->repertoire_name ?: repertoire_global;
470 if (repertoire_name != NULL)
471 ctype->repertoire = repertoire_read (repertoire_name);
474 /* We need the name of the currently used 8-bit character set to
475 make correct conversion between this 8-bit representation and the
476 ISO 10646 character set used internally for wide characters. */
477 ctype->codeset_name = charmap->code_set_name;
478 if (ctype->codeset_name == NULL)
480 if (! be_quiet)
481 WITH_CUR_LOCALE (error (0, 0, _("\
482 No character set name specified in charmap")));
483 ctype->codeset_name = "//UNKNOWN//";
486 /* Set default value for classes not specified. */
487 set_class_defaults (ctype, charmap, ctype->repertoire);
489 /* Check according to table. */
490 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
492 uint32_t tmp = ctype->class_collection[cnt];
494 if (tmp != 0)
496 for (cls1 = 0; cls1 < NCLASS; ++cls1)
497 if ((tmp & _ISwbit (cls1)) != 0)
498 for (cls2 = 0; cls2 < NCLASS; ++cls2)
499 if (valid_table[cls1].allow[cls2] != '-')
501 int eq = (tmp & _ISwbit (cls2)) != 0;
502 switch (valid_table[cls1].allow[cls2])
504 case 'M':
505 if (!eq)
507 uint32_t value = ctype->charnames[cnt];
509 if (!be_quiet)
510 WITH_CUR_LOCALE (error (0, 0, _("\
511 character L'\\u%0*x' in class `%s' must be in class `%s'"),
512 value > 0xffff ? 8 : 4,
513 value,
514 valid_table[cls1].name,
515 valid_table[cls2].name));
517 break;
519 case 'X':
520 if (eq)
522 uint32_t value = ctype->charnames[cnt];
524 if (!be_quiet)
525 WITH_CUR_LOCALE (error (0, 0, _("\
526 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
527 value > 0xffff ? 8 : 4,
528 value,
529 valid_table[cls1].name,
530 valid_table[cls2].name));
532 break;
534 case 'D':
535 ctype->class_collection[cnt] |= _ISwbit (cls2);
536 break;
538 default:
539 WITH_CUR_LOCALE (error (5, 0, _("\
540 internal error in %s, line %u"), __FUNCTION__, __LINE__));
546 for (cnt = 0; cnt < 256; ++cnt)
548 uint32_t tmp = ctype->class256_collection[cnt];
550 if (tmp != 0)
552 for (cls1 = 0; cls1 < NCLASS; ++cls1)
553 if ((tmp & _ISbit (cls1)) != 0)
554 for (cls2 = 0; cls2 < NCLASS; ++cls2)
555 if (valid_table[cls1].allow[cls2] != '-')
557 int eq = (tmp & _ISbit (cls2)) != 0;
558 switch (valid_table[cls1].allow[cls2])
560 case 'M':
561 if (!eq)
563 char buf[17];
565 snprintf (buf, sizeof buf, "\\%Zo", cnt);
567 if (!be_quiet)
568 WITH_CUR_LOCALE (error (0, 0, _("\
569 character '%s' in class `%s' must be in class `%s'"),
570 buf,
571 valid_table[cls1].name,
572 valid_table[cls2].name));
574 break;
576 case 'X':
577 if (eq)
579 char buf[17];
581 snprintf (buf, sizeof buf, "\\%Zo", cnt);
583 if (!be_quiet)
584 WITH_CUR_LOCALE (error (0, 0, _("\
585 character '%s' in class `%s' must not be in class `%s'"),
586 buf,
587 valid_table[cls1].name,
588 valid_table[cls2].name));
590 break;
592 case 'D':
593 ctype->class256_collection[cnt] |= _ISbit (cls2);
594 break;
596 default:
597 WITH_CUR_LOCALE (error (5, 0, _("\
598 internal error in %s, line %u"), __FUNCTION__, __LINE__));
604 /* ... and now test <SP> as a special case. */
605 space_value = 32;
606 if (((cnt = BITPOS (tok_space),
607 (ELEM (ctype, class_collection, , space_value)
608 & BITw (tok_space)) == 0)
609 || (cnt = BITPOS (tok_blank),
610 (ELEM (ctype, class_collection, , space_value)
611 & BITw (tok_blank)) == 0)))
613 if (!be_quiet)
614 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
615 valid_table[cnt].name));
617 else if (((cnt = BITPOS (tok_punct),
618 (ELEM (ctype, class_collection, , space_value)
619 & BITw (tok_punct)) != 0)
620 || (cnt = BITPOS (tok_graph),
621 (ELEM (ctype, class_collection, , space_value)
622 & BITw (tok_graph))
623 != 0)))
625 if (!be_quiet)
626 WITH_CUR_LOCALE (error (0, 0, _("\
627 <SP> character must not be in class `%s'"),
628 valid_table[cnt].name));
630 else
631 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
633 space_seq = charmap_find_value (charmap, "SP", 2);
634 if (space_seq == NULL)
635 space_seq = charmap_find_value (charmap, "space", 5);
636 if (space_seq == NULL)
637 space_seq = charmap_find_value (charmap, "U00000020", 9);
638 if (space_seq == NULL || space_seq->nbytes != 1)
640 if (!be_quiet)
641 WITH_CUR_LOCALE (error (0, 0, _("\
642 character <SP> not defined in character map")));
644 else if (((cnt = BITPOS (tok_space),
645 (ctype->class256_collection[space_seq->bytes[0]]
646 & BIT (tok_space)) == 0)
647 || (cnt = BITPOS (tok_blank),
648 (ctype->class256_collection[space_seq->bytes[0]]
649 & BIT (tok_blank)) == 0)))
651 if (!be_quiet)
652 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
653 valid_table[cnt].name));
655 else if (((cnt = BITPOS (tok_punct),
656 (ctype->class256_collection[space_seq->bytes[0]]
657 & BIT (tok_punct)) != 0)
658 || (cnt = BITPOS (tok_graph),
659 (ctype->class256_collection[space_seq->bytes[0]]
660 & BIT (tok_graph)) != 0)))
662 if (!be_quiet)
663 WITH_CUR_LOCALE (error (0, 0, _("\
664 <SP> character must not be in class `%s'"),
665 valid_table[cnt].name));
667 else
668 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
670 /* Check whether all single-byte characters make to their upper/lowercase
671 equivalent according to the ASCII rules. */
672 for (cnt = 'A'; cnt <= 'Z'; ++cnt)
674 uint32_t uppval = ctype->map256_collection[0][cnt];
675 uint32_t lowval = ctype->map256_collection[1][cnt];
676 uint32_t lowuppval = ctype->map256_collection[0][lowval];
677 uint32_t lowlowval = ctype->map256_collection[1][lowval];
679 if (uppval != cnt
680 || lowval != cnt + 0x20
681 || lowuppval != cnt
682 || lowlowval != cnt + 0x20)
683 ctype->nonascii_case = 1;
685 for (cnt = 0; cnt < 256; ++cnt)
686 if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
687 if (ctype->map256_collection[0][cnt] != cnt
688 || ctype->map256_collection[1][cnt] != cnt)
689 ctype->nonascii_case = 1;
691 /* Now that the tests are done make sure the name array contains all
692 characters which are handled in the WIDTH section of the
693 character set definition file. */
694 if (charmap->width_rules != NULL)
695 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
697 unsigned char bytes[charmap->mb_cur_max];
698 int nbytes = charmap->width_rules[cnt].from->nbytes;
700 /* We have the range of character for which the width is
701 specified described using byte sequences of the multibyte
702 charset. We have to convert this to UCS4 now. And we
703 cannot simply convert the beginning and the end of the
704 sequence, we have to iterate over the byte sequence and
705 convert it for every single character. */
706 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
708 while (nbytes < charmap->width_rules[cnt].to->nbytes
709 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
710 nbytes) <= 0)
712 /* Find the UCS value for `bytes'. */
713 int inner;
714 uint32_t wch;
715 struct charseq *seq
716 = charmap_find_symbol (charmap, (char *) bytes, nbytes);
718 if (seq == NULL)
719 wch = ILLEGAL_CHAR_VALUE;
720 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
721 wch = seq->ucs4;
722 else
723 wch = repertoire_find_value (ctype->repertoire, seq->name,
724 strlen (seq->name));
726 if (wch != ILLEGAL_CHAR_VALUE)
727 /* We are only interested in the side-effects of the
728 `find_idx' call. It will add appropriate entries in
729 the name array if this is necessary. */
730 (void) find_idx (ctype, NULL, NULL, NULL, wch);
732 /* "Increment" the bytes sequence. */
733 inner = nbytes - 1;
734 while (inner >= 0 && bytes[inner] == 0xff)
735 --inner;
737 if (inner < 0)
739 /* We have to extend the byte sequence. */
740 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
741 break;
743 bytes[0] = 1;
744 memset (&bytes[1], 0, nbytes);
745 ++nbytes;
747 else
749 ++bytes[inner];
750 while (++inner < nbytes)
751 bytes[inner] = 0;
756 /* Now set all the other characters of the character set to the
757 default width. */
758 curs = NULL;
759 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
761 struct charseq *data = (struct charseq *) vdata;
763 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
764 data->ucs4 = repertoire_find_value (ctype->repertoire,
765 data->name, len);
767 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
768 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
771 /* There must be a multiple of 10 digits. */
772 if (ctype->mbdigits_act % 10 != 0)
774 assert (ctype->mbdigits_act == ctype->wcdigits_act);
775 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
776 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
777 WITH_CUR_LOCALE (error (0, 0, _("\
778 `digit' category has not entries in groups of ten")));
781 /* Check the input digits. There must be a multiple of ten available.
782 In each group it could be that one or the other character is missing.
783 In this case the whole group must be removed. */
784 cnt = 0;
785 while (cnt < ctype->mbdigits_act)
787 size_t inner;
788 for (inner = 0; inner < 10; ++inner)
789 if (ctype->mbdigits[cnt + inner] == NULL)
790 break;
792 if (inner == 10)
793 cnt += 10;
794 else
796 /* Remove the group. */
797 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
798 ((ctype->wcdigits_act - cnt - 10)
799 * sizeof (ctype->mbdigits[0])));
800 ctype->mbdigits_act -= 10;
804 /* If no input digits are given use the default. */
805 if (ctype->mbdigits_act == 0)
807 if (ctype->mbdigits_max == 0)
809 ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
810 10 * sizeof (struct charseq *));
811 ctype->mbdigits_max = 10;
814 for (cnt = 0; cnt < 10; ++cnt)
816 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
817 (char *) digits + cnt, 1);
818 if (ctype->mbdigits[cnt] == NULL)
820 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
821 longnames[cnt],
822 strlen (longnames[cnt]));
823 if (ctype->mbdigits[cnt] == NULL)
825 /* Hum, this ain't good. */
826 WITH_CUR_LOCALE (error (0, 0, _("\
827 no input digits defined and none of the standard names in the charmap")));
829 ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
830 sizeof (struct charseq) + 1);
832 /* This is better than nothing. */
833 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
834 ctype->mbdigits[cnt]->nbytes = 1;
839 ctype->mbdigits_act = 10;
842 /* Check the wide character input digits. There must be a multiple
843 of ten available. In each group it could be that one or the other
844 character is missing. In this case the whole group must be
845 removed. */
846 cnt = 0;
847 while (cnt < ctype->wcdigits_act)
849 size_t inner;
850 for (inner = 0; inner < 10; ++inner)
851 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
852 break;
854 if (inner == 10)
855 cnt += 10;
856 else
858 /* Remove the group. */
859 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
860 ((ctype->wcdigits_act - cnt - 10)
861 * sizeof (ctype->wcdigits[0])));
862 ctype->wcdigits_act -= 10;
866 /* If no input digits are given use the default. */
867 if (ctype->wcdigits_act == 0)
869 if (ctype->wcdigits_max == 0)
871 ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
872 10 * sizeof (uint32_t));
873 ctype->wcdigits_max = 10;
876 for (cnt = 0; cnt < 10; ++cnt)
877 ctype->wcdigits[cnt] = L'0' + cnt;
879 ctype->mbdigits_act = 10;
882 /* Check the outdigits. */
883 warned = 0;
884 for (cnt = 0; cnt < 10; ++cnt)
885 if (ctype->mboutdigits[cnt] == NULL)
887 static struct charseq replace[2];
889 if (!warned)
891 WITH_CUR_LOCALE (error (0, 0, _("\
892 not all characters used in `outdigit' are available in the charmap")));
893 warned = 1;
896 replace[0].nbytes = 1;
897 replace[0].bytes[0] = '?';
898 replace[0].bytes[1] = '\0';
899 ctype->mboutdigits[cnt] = &replace[0];
902 warned = 0;
903 for (cnt = 0; cnt < 10; ++cnt)
904 if (ctype->wcoutdigits[cnt] == 0)
906 if (!warned)
908 WITH_CUR_LOCALE (error (0, 0, _("\
909 not all characters used in `outdigit' are available in the repertoire")));
910 warned = 1;
913 ctype->wcoutdigits[cnt] = L'?';
916 /* Sort the entries in the translit_ignore list. */
917 if (ctype->translit_ignore != NULL)
919 struct translit_ignore_t *firstp = ctype->translit_ignore;
920 struct translit_ignore_t *runp;
922 ctype->ntranslit_ignore = 1;
924 for (runp = firstp->next; runp != NULL; runp = runp->next)
926 struct translit_ignore_t *lastp = NULL;
927 struct translit_ignore_t *cmpp;
929 ++ctype->ntranslit_ignore;
931 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
932 if (runp->from < cmpp->from)
933 break;
935 runp->next = lastp;
936 if (lastp == NULL)
937 firstp = runp;
940 ctype->translit_ignore = firstp;
945 void
946 ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
947 const char *output_path)
949 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
950 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
951 + ctype->nr_charclass + ctype->map_collection_nr);
952 struct locale_file file;
953 uint32_t default_missing_len;
954 size_t elem, cnt;
956 /* Now prepare the output: Find the sizes of the table we can use. */
957 allocate_arrays (ctype, charmap, ctype->repertoire);
959 default_missing_len = (ctype->default_missing
960 ? wcslen ((wchar_t *) ctype->default_missing)
961 : 0);
963 init_locale_data (&file, nelems);
964 for (elem = 0; elem < nelems; ++elem)
966 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
967 switch (elem)
969 #define CTYPE_EMPTY(name) \
970 case name: \
971 add_locale_empty (&file); \
972 break
974 CTYPE_EMPTY(_NL_CTYPE_GAP1);
975 CTYPE_EMPTY(_NL_CTYPE_GAP2);
976 CTYPE_EMPTY(_NL_CTYPE_GAP3);
977 CTYPE_EMPTY(_NL_CTYPE_GAP4);
978 CTYPE_EMPTY(_NL_CTYPE_GAP5);
979 CTYPE_EMPTY(_NL_CTYPE_GAP6);
981 #define CTYPE_RAW_DATA(name, base, size) \
982 case _NL_ITEM_INDEX (name): \
983 add_locale_raw_data (&file, base, size); \
984 break
986 CTYPE_RAW_DATA (_NL_CTYPE_CLASS,
987 ctype->ctype_b,
988 (256 + 128) * sizeof (char_class_t));
990 #define CTYPE_UINT32_ARRAY(name, base, n_elems) \
991 case _NL_ITEM_INDEX (name): \
992 add_locale_uint32_array (&file, base, n_elems); \
993 break
995 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128);
996 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128);
997 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256);
998 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256);
999 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32,
1000 ctype->ctype32_b,
1001 256 * sizeof (char_class32_t));
1003 #define CTYPE_UINT32(name, value) \
1004 case _NL_ITEM_INDEX (name): \
1005 add_locale_uint32 (&file, value); \
1006 break
1008 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset);
1009 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset);
1010 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size);
1012 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX,
1013 ctype->translit_from_idx,
1014 ctype->translit_idx_size);
1016 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL,
1017 ctype->translit_from_tbl,
1018 ctype->translit_from_tbl_size
1019 / sizeof (uint32_t));
1021 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX,
1022 ctype->translit_to_idx,
1023 ctype->translit_idx_size);
1025 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL,
1026 ctype->translit_to_tbl,
1027 ctype->translit_to_tbl_size / sizeof (uint32_t));
1029 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
1030 /* The class name array. */
1031 start_locale_structure (&file);
1032 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1033 add_locale_string (&file, ctype->classnames[cnt]);
1034 add_locale_char (&file, 0);
1035 align_locale_data (&file, LOCFILE_ALIGN);
1036 end_locale_structure (&file);
1037 break;
1039 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
1040 /* The class name array. */
1041 start_locale_structure (&file);
1042 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1043 add_locale_string (&file, ctype->mapnames[cnt]);
1044 add_locale_char (&file, 0);
1045 align_locale_data (&file, LOCFILE_ALIGN);
1046 end_locale_structure (&file);
1047 break;
1049 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH):
1050 add_locale_wcwidth_table (&file, &ctype->width);
1051 break;
1053 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max);
1055 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1056 add_locale_string (&file, ctype->codeset_name);
1057 break;
1059 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii);
1061 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case);
1063 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1064 add_locale_uint32 (&file, ctype->mbdigits_act / 10);
1065 break;
1067 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1068 add_locale_uint32 (&file, ctype->wcdigits_act / 10);
1069 break;
1071 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1072 start_locale_structure (&file);
1073 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1074 cnt < ctype->mbdigits_act; cnt += 10)
1076 add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes,
1077 ctype->mbdigits[cnt]->nbytes);
1078 add_locale_char (&file, 0);
1080 end_locale_structure (&file);
1081 break;
1083 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1084 start_locale_structure (&file);
1085 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1086 add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes,
1087 ctype->mboutdigits[cnt]->nbytes);
1088 add_locale_char (&file, 0);
1089 end_locale_structure (&file);
1090 break;
1092 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1093 start_locale_structure (&file);
1094 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1095 cnt < ctype->wcdigits_act; cnt += 10)
1096 add_locale_uint32 (&file, ctype->wcdigits[cnt]);
1097 end_locale_structure (&file);
1098 break;
1100 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1101 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1102 add_locale_uint32 (&file, ctype->wcoutdigits[cnt]);
1103 break;
1105 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1106 add_locale_uint32 (&file, default_missing_len);
1107 break;
1109 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1110 add_locale_uint32_array (&file, ctype->default_missing,
1111 default_missing_len);
1112 break;
1114 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1115 add_locale_uint32 (&file, ctype->ntranslit_ignore);
1116 break;
1118 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1119 start_locale_structure (&file);
1121 struct translit_ignore_t *runp;
1122 for (runp = ctype->translit_ignore; runp != NULL;
1123 runp = runp->next)
1125 add_locale_uint32 (&file, runp->from);
1126 add_locale_uint32 (&file, runp->to);
1127 add_locale_uint32 (&file, runp->step);
1130 end_locale_structure (&file);
1131 break;
1133 default:
1134 assert (! "unknown CTYPE element");
1136 else
1138 /* Handle extra maps. */
1139 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1140 if (nr < ctype->nr_charclass)
1142 start_locale_prelude (&file);
1143 add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32);
1144 end_locale_prelude (&file);
1145 add_locale_wctype_table (&file, &ctype->class_3level[nr]);
1147 else
1149 nr -= ctype->nr_charclass;
1150 assert (nr < ctype->map_collection_nr);
1151 add_locale_wctrans_table (&file, &ctype->map_3level[nr]);
1156 write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file);
1160 /* Local functions. */
1161 static void
1162 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1163 const char *name)
1165 size_t cnt;
1167 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1168 if (strcmp (ctype->classnames[cnt], name) == 0)
1169 break;
1171 if (cnt < ctype->nr_charclass)
1173 lr_error (lr, _("character class `%s' already defined"), name);
1174 return;
1177 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1178 /* Exit code 2 is prescribed in P1003.2b. */
1179 WITH_CUR_LOCALE (error (2, 0, _("\
1180 implementation limit: no more than %Zd character classes allowed"),
1181 MAX_NR_CHARCLASS));
1183 ctype->classnames[ctype->nr_charclass++] = name;
1187 static void
1188 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1189 const char *name, const struct charmap_t *charmap)
1191 size_t max_chars = 0;
1192 size_t cnt;
1194 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1196 if (strcmp (ctype->mapnames[cnt], name) == 0)
1197 break;
1199 if (max_chars < ctype->map_collection_max[cnt])
1200 max_chars = ctype->map_collection_max[cnt];
1203 if (cnt < ctype->map_collection_nr)
1205 lr_error (lr, _("character map `%s' already defined"), name);
1206 return;
1209 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1210 /* Exit code 2 is prescribed in P1003.2b. */
1211 WITH_CUR_LOCALE (error (2, 0, _("\
1212 implementation limit: no more than %d character maps allowed"),
1213 MAX_NR_CHARMAP));
1215 ctype->mapnames[cnt] = name;
1217 if (max_chars == 0)
1218 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1219 else
1220 ctype->map_collection_max[cnt] = max_chars;
1222 ctype->map_collection[cnt] = (uint32_t *)
1223 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1224 ctype->map_collection_act[cnt] = 256;
1226 ++ctype->map_collection_nr;
1230 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1231 is possible if we only want to extend the name array. */
1232 static uint32_t *
1233 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1234 size_t *act, uint32_t idx)
1236 size_t cnt;
1238 if (idx < 256)
1239 return table == NULL ? NULL : &(*table)[idx];
1241 /* Use the charnames_idx lookup table instead of the slow search loop. */
1242 #if 1
1243 cnt = idx_table_get (&ctype->charnames_idx, idx);
1244 if (cnt == EMPTY)
1245 /* Not found. */
1246 cnt = ctype->charnames_act;
1247 #else
1248 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1249 if (ctype->charnames[cnt] == idx)
1250 break;
1251 #endif
1253 /* We have to distinguish two cases: the name is found or not. */
1254 if (cnt == ctype->charnames_act)
1256 /* Extend the name array. */
1257 if (ctype->charnames_act == ctype->charnames_max)
1259 ctype->charnames_max *= 2;
1260 ctype->charnames = (uint32_t *)
1261 xrealloc (ctype->charnames,
1262 sizeof (uint32_t) * ctype->charnames_max);
1264 ctype->charnames[ctype->charnames_act++] = idx;
1265 idx_table_add (&ctype->charnames_idx, idx, cnt);
1268 if (table == NULL)
1269 /* We have done everything we are asked to do. */
1270 return NULL;
1272 if (max == NULL)
1273 /* The caller does not want to extend the table. */
1274 return (cnt >= *act ? NULL : &(*table)[cnt]);
1276 if (cnt >= *act)
1278 if (cnt >= *max)
1280 size_t old_max = *max;
1282 *max *= 2;
1283 while (*max <= cnt);
1285 *table =
1286 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1287 memset (&(*table)[old_max], '\0',
1288 (*max - old_max) * sizeof (uint32_t));
1291 *act = cnt + 1;
1294 return &(*table)[cnt];
1298 static int
1299 get_character (struct token *now, const struct charmap_t *charmap,
1300 struct repertoire_t *repertoire,
1301 struct charseq **seqp, uint32_t *wchp)
1303 if (now->tok == tok_bsymbol)
1305 /* This will hopefully be the normal case. */
1306 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1307 now->val.str.lenmb);
1308 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1309 now->val.str.lenmb);
1311 else if (now->tok == tok_ucs4)
1313 char utmp[10];
1315 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1316 *seqp = charmap_find_value (charmap, utmp, 9);
1318 if (*seqp == NULL)
1319 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1321 if (*seqp == NULL)
1323 /* Compute the value in the charmap from the UCS value. */
1324 const char *symbol = repertoire_find_symbol (repertoire,
1325 now->val.ucs4);
1327 if (symbol == NULL)
1328 *seqp = NULL;
1329 else
1330 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1332 if (*seqp == NULL)
1334 if (repertoire != NULL)
1336 /* Insert a negative entry. */
1337 static const struct charseq negative
1338 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1339 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1340 sizeof (uint32_t));
1341 *newp = now->val.ucs4;
1343 insert_entry (&repertoire->seq_table, newp,
1344 sizeof (uint32_t), (void *) &negative);
1347 else
1348 (*seqp)->ucs4 = now->val.ucs4;
1350 else if ((*seqp)->ucs4 != now->val.ucs4)
1351 *seqp = NULL;
1353 *wchp = now->val.ucs4;
1355 else if (now->tok == tok_charcode)
1357 /* We must map from the byte code to UCS4. */
1358 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1359 now->val.str.lenmb);
1361 if (*seqp == NULL)
1362 *wchp = ILLEGAL_CHAR_VALUE;
1363 else
1365 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1366 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1367 strlen ((*seqp)->name));
1368 *wchp = (*seqp)->ucs4;
1371 else
1372 return 1;
1374 return 0;
1378 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1379 the .(2). counterparts. */
1380 static void
1381 charclass_symbolic_ellipsis (struct linereader *ldfile,
1382 struct locale_ctype_t *ctype,
1383 const struct charmap_t *charmap,
1384 struct repertoire_t *repertoire,
1385 struct token *now,
1386 const char *last_str,
1387 unsigned long int class256_bit,
1388 unsigned long int class_bit, int base,
1389 int ignore_content, int handle_digits, int step)
1391 const char *nowstr = now->val.str.startmb;
1392 char tmp[now->val.str.lenmb + 1];
1393 const char *cp;
1394 char *endp;
1395 unsigned long int from;
1396 unsigned long int to;
1398 /* We have to compute the ellipsis values using the symbolic names. */
1399 assert (last_str != NULL);
1401 if (strlen (last_str) != now->val.str.lenmb)
1403 invalid_range:
1404 lr_error (ldfile,
1405 _("`%s' and `%.*s' are not valid names for symbolic range"),
1406 last_str, (int) now->val.str.lenmb, nowstr);
1407 return;
1410 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1411 /* Nothing to do, the names are the same. */
1412 return;
1414 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1417 errno = 0;
1418 from = strtoul (cp, &endp, base);
1419 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1420 goto invalid_range;
1422 to = strtoul (nowstr + (cp - last_str), &endp, base);
1423 if ((to == UINT_MAX && errno == ERANGE)
1424 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1425 goto invalid_range;
1427 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1428 if (!ignore_content)
1430 now->val.str.startmb = tmp;
1431 while ((from += step) <= to)
1433 struct charseq *seq;
1434 uint32_t wch;
1436 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1437 (int) (cp - last_str), last_str,
1438 (int) (now->val.str.lenmb - (cp - last_str)),
1439 from);
1441 get_character (now, charmap, repertoire, &seq, &wch);
1443 if (seq != NULL && seq->nbytes == 1)
1444 /* Yep, we can store information about this byte sequence. */
1445 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1447 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1448 /* We have the UCS4 position. */
1449 *find_idx (ctype, &ctype->class_collection,
1450 &ctype->class_collection_max,
1451 &ctype->class_collection_act, wch) |= class_bit;
1453 if (handle_digits == 1)
1455 /* We must store the digit values. */
1456 if (ctype->mbdigits_act == ctype->mbdigits_max)
1458 ctype->mbdigits_max *= 2;
1459 ctype->mbdigits = xrealloc (ctype->mbdigits,
1460 (ctype->mbdigits_max
1461 * sizeof (char *)));
1462 ctype->wcdigits_max *= 2;
1463 ctype->wcdigits = xrealloc (ctype->wcdigits,
1464 (ctype->wcdigits_max
1465 * sizeof (uint32_t)));
1468 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1469 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1471 else if (handle_digits == 2)
1473 /* We must store the digit values. */
1474 if (ctype->outdigits_act >= 10)
1476 lr_error (ldfile, _("\
1477 %s: field `%s' does not contain exactly ten entries"),
1478 "LC_CTYPE", "outdigit");
1479 return;
1482 ctype->mboutdigits[ctype->outdigits_act] = seq;
1483 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1484 ++ctype->outdigits_act;
1491 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1492 static void
1493 charclass_ucs4_ellipsis (struct linereader *ldfile,
1494 struct locale_ctype_t *ctype,
1495 const struct charmap_t *charmap,
1496 struct repertoire_t *repertoire,
1497 struct token *now, uint32_t last_wch,
1498 unsigned long int class256_bit,
1499 unsigned long int class_bit, int ignore_content,
1500 int handle_digits, int step)
1502 if (last_wch > now->val.ucs4)
1504 lr_error (ldfile, _("\
1505 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1506 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1507 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1508 return;
1511 if (!ignore_content)
1512 while ((last_wch += step) <= now->val.ucs4)
1514 /* We have to find out whether there is a byte sequence corresponding
1515 to this UCS4 value. */
1516 struct charseq *seq;
1517 char utmp[10];
1519 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1520 seq = charmap_find_value (charmap, utmp, 9);
1521 if (seq == NULL)
1523 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1524 seq = charmap_find_value (charmap, utmp, 5);
1527 if (seq == NULL)
1528 /* Try looking in the repertoire map. */
1529 seq = repertoire_find_seq (repertoire, last_wch);
1531 /* If this is the first time we look for this sequence create a new
1532 entry. */
1533 if (seq == NULL)
1535 static const struct charseq negative
1536 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1538 /* Find the symbolic name for this UCS4 value. */
1539 if (repertoire != NULL)
1541 const char *symbol = repertoire_find_symbol (repertoire,
1542 last_wch);
1543 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1544 sizeof (uint32_t));
1545 *newp = last_wch;
1547 if (symbol != NULL)
1548 /* We have a name, now search the multibyte value. */
1549 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1551 if (seq == NULL)
1552 /* We have to create a fake entry. */
1553 seq = (struct charseq *) &negative;
1554 else
1555 seq->ucs4 = last_wch;
1557 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1558 seq);
1560 else
1561 /* We have to create a fake entry. */
1562 seq = (struct charseq *) &negative;
1565 /* We have a name, now search the multibyte value. */
1566 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1567 /* Yep, we can store information about this byte sequence. */
1568 ctype->class256_collection[(size_t) seq->bytes[0]]
1569 |= class256_bit;
1571 /* And of course we have the UCS4 position. */
1572 if (class_bit != 0)
1573 *find_idx (ctype, &ctype->class_collection,
1574 &ctype->class_collection_max,
1575 &ctype->class_collection_act, last_wch) |= class_bit;
1577 if (handle_digits == 1)
1579 /* We must store the digit values. */
1580 if (ctype->mbdigits_act == ctype->mbdigits_max)
1582 ctype->mbdigits_max *= 2;
1583 ctype->mbdigits = xrealloc (ctype->mbdigits,
1584 (ctype->mbdigits_max
1585 * sizeof (char *)));
1586 ctype->wcdigits_max *= 2;
1587 ctype->wcdigits = xrealloc (ctype->wcdigits,
1588 (ctype->wcdigits_max
1589 * sizeof (uint32_t)));
1592 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1593 ? seq : NULL);
1594 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1596 else if (handle_digits == 2)
1598 /* We must store the digit values. */
1599 if (ctype->outdigits_act >= 10)
1601 lr_error (ldfile, _("\
1602 %s: field `%s' does not contain exactly ten entries"),
1603 "LC_CTYPE", "outdigit");
1604 return;
1607 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1608 ? seq : NULL);
1609 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1610 ++ctype->outdigits_act;
1616 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1617 static void
1618 charclass_charcode_ellipsis (struct linereader *ldfile,
1619 struct locale_ctype_t *ctype,
1620 const struct charmap_t *charmap,
1621 struct repertoire_t *repertoire,
1622 struct token *now, char *last_charcode,
1623 uint32_t last_charcode_len,
1624 unsigned long int class256_bit,
1625 unsigned long int class_bit, int ignore_content,
1626 int handle_digits)
1628 /* First check whether the to-value is larger. */
1629 if (now->val.charcode.nbytes != last_charcode_len)
1631 lr_error (ldfile, _("\
1632 start and end character sequence of range must have the same length"));
1633 return;
1636 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1638 lr_error (ldfile, _("\
1639 to-value character sequence is smaller than from-value sequence"));
1640 return;
1643 if (!ignore_content)
1647 /* Increment the byte sequence value. */
1648 struct charseq *seq;
1649 uint32_t wch;
1650 int i;
1652 for (i = last_charcode_len - 1; i >= 0; --i)
1653 if (++last_charcode[i] != 0)
1654 break;
1656 if (last_charcode_len == 1)
1657 /* Of course we have the charcode value. */
1658 ctype->class256_collection[(size_t) last_charcode[0]]
1659 |= class256_bit;
1661 /* Find the symbolic name. */
1662 seq = charmap_find_symbol (charmap, last_charcode,
1663 last_charcode_len);
1664 if (seq != NULL)
1666 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1667 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1668 strlen (seq->name));
1669 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1671 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1672 *find_idx (ctype, &ctype->class_collection,
1673 &ctype->class_collection_max,
1674 &ctype->class_collection_act, wch) |= class_bit;
1676 else
1677 wch = ILLEGAL_CHAR_VALUE;
1679 if (handle_digits == 1)
1681 /* We must store the digit values. */
1682 if (ctype->mbdigits_act == ctype->mbdigits_max)
1684 ctype->mbdigits_max *= 2;
1685 ctype->mbdigits = xrealloc (ctype->mbdigits,
1686 (ctype->mbdigits_max
1687 * sizeof (char *)));
1688 ctype->wcdigits_max *= 2;
1689 ctype->wcdigits = xrealloc (ctype->wcdigits,
1690 (ctype->wcdigits_max
1691 * sizeof (uint32_t)));
1694 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1695 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1696 seq->nbytes = last_charcode_len;
1698 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1699 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1701 else if (handle_digits == 2)
1703 struct charseq *seq;
1704 /* We must store the digit values. */
1705 if (ctype->outdigits_act >= 10)
1707 lr_error (ldfile, _("\
1708 %s: field `%s' does not contain exactly ten entries"),
1709 "LC_CTYPE", "outdigit");
1710 return;
1713 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1714 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1715 seq->nbytes = last_charcode_len;
1717 ctype->mboutdigits[ctype->outdigits_act] = seq;
1718 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1719 ++ctype->outdigits_act;
1722 while (memcmp (last_charcode, now->val.charcode.bytes,
1723 last_charcode_len) != 0);
1728 static uint32_t *
1729 find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
1730 uint32_t wch)
1732 struct translit_t *trunp = ctype->translit;
1733 struct translit_ignore_t *tirunp = ctype->translit_ignore;
1735 while (trunp != NULL)
1737 /* XXX We simplify things here. The transliterations we look
1738 for are only allowed to have one character. */
1739 if (trunp->from[0] == wch && trunp->from[1] == 0)
1741 /* Found it. Now look for a transliteration which can be
1742 represented with the character set. */
1743 struct translit_to_t *torunp = trunp->to;
1745 while (torunp != NULL)
1747 int i;
1749 for (i = 0; torunp->str[i] != 0; ++i)
1751 char utmp[10];
1753 snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]);
1754 if (charmap_find_value (charmap, utmp, 9) == NULL)
1755 /* This character cannot be represented. */
1756 break;
1759 if (torunp->str[i] == 0)
1760 return torunp->str;
1762 torunp = torunp->next;
1765 break;
1768 trunp = trunp->next;
1771 /* Check for ignored chars. */
1772 while (tirunp != NULL)
1774 if (tirunp->from <= wch && tirunp->to >= wch)
1776 uint32_t wi;
1778 for (wi = tirunp->from; wi <= wch; wi += tirunp->step)
1779 if (wi == wch)
1780 return (uint32_t []) { 0 };
1784 /* Nothing found. */
1785 return NULL;
1789 uint32_t *
1790 find_translit (struct localedef_t *locale, const struct charmap_t *charmap,
1791 uint32_t wch)
1793 struct locale_ctype_t *ctype;
1794 uint32_t *result = NULL;
1796 assert (locale != NULL);
1797 ctype = locale->categories[LC_CTYPE].ctype;
1799 if (ctype == NULL)
1800 return NULL;
1802 if (ctype->translit != NULL)
1803 result = find_translit2 (ctype, charmap, wch);
1805 if (result == NULL)
1807 struct translit_include_t *irunp = ctype->translit_include;
1809 while (irunp != NULL && result == NULL)
1811 result = find_translit (find_locale (CTYPE_LOCALE,
1812 irunp->copy_locale,
1813 irunp->copy_repertoire,
1814 charmap),
1815 charmap, wch);
1816 irunp = irunp->next;
1820 return result;
1824 /* Read one transliteration entry. */
1825 static uint32_t *
1826 read_widestring (struct linereader *ldfile, struct token *now,
1827 const struct charmap_t *charmap,
1828 struct repertoire_t *repertoire)
1830 uint32_t *wstr;
1832 if (now->tok == tok_default_missing)
1833 /* The special name "" will denote this case. */
1834 wstr = ((uint32_t *) { 0 });
1835 else if (now->tok == tok_bsymbol)
1837 /* Get the value from the repertoire. */
1838 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1839 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1840 now->val.str.lenmb);
1841 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1843 /* We cannot proceed, we don't know the UCS4 value. */
1844 free (wstr);
1845 return NULL;
1848 wstr[1] = 0;
1850 else if (now->tok == tok_ucs4)
1852 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1853 wstr[0] = now->val.ucs4;
1854 wstr[1] = 0;
1856 else if (now->tok == tok_charcode)
1858 /* Argh, we have to convert to the symbol name first and then to the
1859 UCS4 value. */
1860 struct charseq *seq = charmap_find_symbol (charmap,
1861 now->val.str.startmb,
1862 now->val.str.lenmb);
1863 if (seq == NULL)
1864 /* Cannot find the UCS4 value. */
1865 return NULL;
1867 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1868 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1869 strlen (seq->name));
1870 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1871 /* We cannot proceed, we don't know the UCS4 value. */
1872 return NULL;
1874 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1875 wstr[0] = seq->ucs4;
1876 wstr[1] = 0;
1878 else if (now->tok == tok_string)
1880 wstr = now->val.str.startwc;
1881 if (wstr == NULL || wstr[0] == 0)
1882 return NULL;
1884 else
1886 if (now->tok != tok_eol && now->tok != tok_eof)
1887 lr_ignore_rest (ldfile, 0);
1888 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1889 return (uint32_t *) -1l;
1892 return wstr;
1896 static void
1897 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1898 struct token *now, const struct charmap_t *charmap,
1899 struct repertoire_t *repertoire)
1901 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1902 struct translit_t *result;
1903 struct translit_to_t **top;
1904 struct obstack *ob = &ctype->mempool;
1905 int first;
1906 int ignore;
1908 if (from_wstr == NULL)
1909 /* There is no valid from string. */
1910 return;
1912 result = (struct translit_t *) obstack_alloc (ob,
1913 sizeof (struct translit_t));
1914 result->from = from_wstr;
1915 result->fname = ldfile->fname;
1916 result->lineno = ldfile->lineno;
1917 result->next = NULL;
1918 result->to = NULL;
1919 top = &result->to;
1920 first = 1;
1921 ignore = 0;
1923 while (1)
1925 uint32_t *to_wstr;
1927 /* Next we have one or more transliterations. They are
1928 separated by semicolons. */
1929 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
1931 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1933 /* One string read. */
1934 const uint32_t zero = 0;
1936 if (!ignore)
1938 obstack_grow (ob, &zero, 4);
1939 to_wstr = obstack_finish (ob);
1941 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1942 (*top)->str = to_wstr;
1943 (*top)->next = NULL;
1946 if (now->tok == tok_eol)
1948 result->next = ctype->translit;
1949 ctype->translit = result;
1950 return;
1953 if (!ignore)
1954 top = &(*top)->next;
1955 ignore = 0;
1957 else
1959 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1960 if (to_wstr == (uint32_t *) -1l)
1962 /* An error occurred. */
1963 obstack_free (ob, result);
1964 return;
1967 if (to_wstr == NULL)
1968 ignore = 1;
1969 else
1970 /* This value is usable. */
1971 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1973 first = 0;
1979 static void
1980 read_translit_ignore_entry (struct linereader *ldfile,
1981 struct locale_ctype_t *ctype,
1982 const struct charmap_t *charmap,
1983 struct repertoire_t *repertoire)
1985 /* We expect a semicolon-separated list of characters we ignore. We are
1986 only interested in the wide character definitions. These must be
1987 single characters, possibly defining a range when an ellipsis is used. */
1988 while (1)
1990 struct token *now = lr_token (ldfile, charmap, NULL, repertoire,
1991 verbose);
1992 struct translit_ignore_t *newp;
1993 uint32_t from;
1995 if (now->tok == tok_eol || now->tok == tok_eof)
1997 lr_error (ldfile,
1998 _("premature end of `translit_ignore' definition"));
1999 return;
2002 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2004 lr_error (ldfile, _("syntax error"));
2005 lr_ignore_rest (ldfile, 0);
2006 return;
2009 if (now->tok == tok_ucs4)
2010 from = now->val.ucs4;
2011 else
2012 /* Try to get the value. */
2013 from = repertoire_find_value (repertoire, now->val.str.startmb,
2014 now->val.str.lenmb);
2016 if (from == ILLEGAL_CHAR_VALUE)
2018 lr_error (ldfile, "invalid character name");
2019 newp = NULL;
2021 else
2023 newp = (struct translit_ignore_t *)
2024 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
2025 newp->from = from;
2026 newp->to = from;
2027 newp->step = 1;
2029 newp->next = ctype->translit_ignore;
2030 ctype->translit_ignore = newp;
2033 /* Now we expect either a semicolon, an ellipsis, or the end of the
2034 line. */
2035 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2037 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
2039 /* XXX Should we bother implementing `....'? `...' certainly
2040 will not be implemented. */
2041 uint32_t to;
2042 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
2044 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2046 if (now->tok == tok_eol || now->tok == tok_eof)
2048 lr_error (ldfile,
2049 _("premature end of `translit_ignore' definition"));
2050 return;
2053 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2055 lr_error (ldfile, _("syntax error"));
2056 lr_ignore_rest (ldfile, 0);
2057 return;
2060 if (now->tok == tok_ucs4)
2061 to = now->val.ucs4;
2062 else
2063 /* Try to get the value. */
2064 to = repertoire_find_value (repertoire, now->val.str.startmb,
2065 now->val.str.lenmb);
2067 if (to == ILLEGAL_CHAR_VALUE)
2068 lr_error (ldfile, "invalid character name");
2069 else
2071 /* Make sure the `to'-value is larger. */
2072 if (to >= from)
2074 newp->to = to;
2075 newp->step = step;
2077 else
2078 lr_error (ldfile, _("\
2079 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2080 (to | from) < 65536 ? 4 : 8, to,
2081 (to | from) < 65536 ? 4 : 8, from);
2084 /* And the next token. */
2085 now = lr_token (ldfile, charmap, NULL, repertoire, verbose);
2088 if (now->tok == tok_eol || now->tok == tok_eof)
2089 /* We are done. */
2090 return;
2092 if (now->tok == tok_semicolon)
2093 /* Next round. */
2094 continue;
2096 /* If we come here something is wrong. */
2097 lr_error (ldfile, _("syntax error"));
2098 lr_ignore_rest (ldfile, 0);
2099 return;
2104 /* The parser for the LC_CTYPE section of the locale definition. */
2105 void
2106 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2107 const struct charmap_t *charmap, const char *repertoire_name,
2108 int ignore_content)
2110 struct repertoire_t *repertoire = NULL;
2111 struct locale_ctype_t *ctype;
2112 struct token *now;
2113 enum token_t nowtok;
2114 size_t cnt;
2115 uint32_t last_wch = 0;
2116 enum token_t last_token;
2117 enum token_t ellipsis_token;
2118 int step;
2119 char last_charcode[16];
2120 size_t last_charcode_len = 0;
2121 const char *last_str = NULL;
2122 int mapidx;
2123 struct localedef_t *copy_locale = NULL;
2125 /* Get the repertoire we have to use. */
2126 if (repertoire_name != NULL)
2127 repertoire = repertoire_read (repertoire_name);
2129 /* The rest of the line containing `LC_CTYPE' must be free. */
2130 lr_ignore_rest (ldfile, 1);
2135 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2136 nowtok = now->tok;
2138 while (nowtok == tok_eol);
2140 /* If we see `copy' now we are almost done. */
2141 if (nowtok == tok_copy)
2143 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2144 if (now->tok != tok_string)
2146 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2148 skip_category:
2150 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2151 while (now->tok != tok_eof && now->tok != tok_end);
2153 if (now->tok != tok_eof
2154 || (now = lr_token (ldfile, charmap, NULL, NULL, verbose),
2155 now->tok == tok_eof))
2156 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2157 else if (now->tok != tok_lc_ctype)
2159 lr_error (ldfile, _("\
2160 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2161 lr_ignore_rest (ldfile, 0);
2163 else
2164 lr_ignore_rest (ldfile, 1);
2166 return;
2169 if (! ignore_content)
2171 /* Get the locale definition. */
2172 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2173 repertoire_name, charmap, NULL);
2174 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2176 /* Not yet loaded. So do it now. */
2177 if (locfile_read (copy_locale, charmap) != 0)
2178 goto skip_category;
2181 if (copy_locale->categories[LC_CTYPE].ctype == NULL)
2182 return;
2185 lr_ignore_rest (ldfile, 1);
2187 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2188 nowtok = now->tok;
2191 /* Prepare the data structures. */
2192 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2193 ctype = result->categories[LC_CTYPE].ctype;
2195 /* Remember the repertoire we use. */
2196 if (!ignore_content)
2197 ctype->repertoire = repertoire;
2199 while (1)
2201 unsigned long int class_bit = 0;
2202 unsigned long int class256_bit = 0;
2203 int handle_digits = 0;
2205 /* Of course we don't proceed beyond the end of file. */
2206 if (nowtok == tok_eof)
2207 break;
2209 /* Ignore empty lines. */
2210 if (nowtok == tok_eol)
2212 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2213 nowtok = now->tok;
2214 continue;
2217 switch (nowtok)
2219 case tok_charclass:
2220 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2221 while (now->tok == tok_ident || now->tok == tok_string)
2223 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2224 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2225 if (now->tok != tok_semicolon)
2226 break;
2227 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2229 if (now->tok != tok_eol)
2230 SYNTAX_ERROR (_("\
2231 %s: syntax error in definition of new character class"), "LC_CTYPE");
2232 break;
2234 case tok_charconv:
2235 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2236 while (now->tok == tok_ident || now->tok == tok_string)
2238 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2239 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2240 if (now->tok != tok_semicolon)
2241 break;
2242 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2244 if (now->tok != tok_eol)
2245 SYNTAX_ERROR (_("\
2246 %s: syntax error in definition of new character map"), "LC_CTYPE");
2247 break;
2249 case tok_class:
2250 /* Ignore the rest of the line if we don't need the input of
2251 this line. */
2252 if (ignore_content)
2254 lr_ignore_rest (ldfile, 0);
2255 break;
2258 /* We simply forget the `class' keyword and use the following
2259 operand to determine the bit. */
2260 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2261 if (now->tok == tok_ident || now->tok == tok_string)
2263 /* Must be one of the predefined class names. */
2264 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2265 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2266 break;
2267 if (cnt >= ctype->nr_charclass)
2269 #ifdef PREDEFINED_CLASSES
2270 if (now->val.str.lenmb == 8
2271 && memcmp ("special1", now->val.str.startmb, 8) == 0)
2272 class_bit = _ISwspecial1;
2273 else if (now->val.str.lenmb == 8
2274 && memcmp ("special2", now->val.str.startmb, 8) == 0)
2275 class_bit = _ISwspecial2;
2276 else if (now->val.str.lenmb == 8
2277 && memcmp ("special3", now->val.str.startmb, 8) == 0)
2278 class_bit = _ISwspecial3;
2279 else
2280 #endif
2282 /* OK, it's a new class. */
2283 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2285 class_bit = _ISwbit (ctype->nr_charclass - 1);
2288 else
2290 class_bit = _ISwbit (cnt);
2292 free (now->val.str.startmb);
2295 else if (now->tok == tok_digit)
2296 goto handle_tok_digit;
2297 else if (now->tok < tok_upper || now->tok > tok_blank)
2298 goto err_label;
2299 else
2301 class_bit = BITw (now->tok);
2302 class256_bit = BIT (now->tok);
2305 /* The next character must be a semicolon. */
2306 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2307 if (now->tok != tok_semicolon)
2308 goto err_label;
2309 goto read_charclass;
2311 case tok_upper:
2312 case tok_lower:
2313 case tok_alpha:
2314 case tok_alnum:
2315 case tok_space:
2316 case tok_cntrl:
2317 case tok_punct:
2318 case tok_graph:
2319 case tok_print:
2320 case tok_xdigit:
2321 case tok_blank:
2322 /* Ignore the rest of the line if we don't need the input of
2323 this line. */
2324 if (ignore_content)
2326 lr_ignore_rest (ldfile, 0);
2327 break;
2330 class_bit = BITw (now->tok);
2331 class256_bit = BIT (now->tok);
2332 handle_digits = 0;
2333 read_charclass:
2334 ctype->class_done |= class_bit;
2335 last_token = tok_none;
2336 ellipsis_token = tok_none;
2337 step = 1;
2338 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2339 while (now->tok != tok_eol && now->tok != tok_eof)
2341 uint32_t wch;
2342 struct charseq *seq;
2344 if (ellipsis_token == tok_none)
2346 if (get_character (now, charmap, repertoire, &seq, &wch))
2347 goto err_label;
2349 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2350 /* Yep, we can store information about this byte
2351 sequence. */
2352 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2354 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2355 && class_bit != 0)
2356 /* We have the UCS4 position. */
2357 *find_idx (ctype, &ctype->class_collection,
2358 &ctype->class_collection_max,
2359 &ctype->class_collection_act, wch) |= class_bit;
2361 last_token = now->tok;
2362 /* Terminate the string. */
2363 if (last_token == tok_bsymbol)
2365 now->val.str.startmb[now->val.str.lenmb] = '\0';
2366 last_str = now->val.str.startmb;
2368 else
2369 last_str = NULL;
2370 last_wch = wch;
2371 memcpy (last_charcode, now->val.charcode.bytes, 16);
2372 last_charcode_len = now->val.charcode.nbytes;
2374 if (!ignore_content && handle_digits == 1)
2376 /* We must store the digit values. */
2377 if (ctype->mbdigits_act == ctype->mbdigits_max)
2379 ctype->mbdigits_max += 10;
2380 ctype->mbdigits = xrealloc (ctype->mbdigits,
2381 (ctype->mbdigits_max
2382 * sizeof (char *)));
2383 ctype->wcdigits_max += 10;
2384 ctype->wcdigits = xrealloc (ctype->wcdigits,
2385 (ctype->wcdigits_max
2386 * sizeof (uint32_t)));
2389 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2390 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2392 else if (!ignore_content && handle_digits == 2)
2394 /* We must store the digit values. */
2395 if (ctype->outdigits_act >= 10)
2397 lr_error (ldfile, _("\
2398 %s: field `%s' does not contain exactly ten entries"),
2399 "LC_CTYPE", "outdigit");
2400 lr_ignore_rest (ldfile, 0);
2401 break;
2404 ctype->mboutdigits[ctype->outdigits_act] = seq;
2405 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2406 ++ctype->outdigits_act;
2409 else
2411 /* Now it gets complicated. We have to resolve the
2412 ellipsis problem. First we must distinguish between
2413 the different kind of ellipsis and this must match the
2414 tokens we have seen. */
2415 assert (last_token != tok_none);
2417 if (last_token != now->tok)
2419 lr_error (ldfile, _("\
2420 ellipsis range must be marked by two operands of same type"));
2421 lr_ignore_rest (ldfile, 0);
2422 break;
2425 if (last_token == tok_bsymbol)
2427 if (ellipsis_token == tok_ellipsis3)
2428 lr_error (ldfile, _("with symbolic name range values \
2429 the absolute ellipsis `...' must not be used"));
2431 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2432 repertoire, now, last_str,
2433 class256_bit, class_bit,
2434 (ellipsis_token
2435 == tok_ellipsis4
2436 ? 10 : 16),
2437 ignore_content,
2438 handle_digits, step);
2440 else if (last_token == tok_ucs4)
2442 if (ellipsis_token != tok_ellipsis2)
2443 lr_error (ldfile, _("\
2444 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2446 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2447 repertoire, now, last_wch,
2448 class256_bit, class_bit,
2449 ignore_content, handle_digits,
2450 step);
2452 else
2454 assert (last_token == tok_charcode);
2456 if (ellipsis_token != tok_ellipsis3)
2457 lr_error (ldfile, _("\
2458 with character code range values one must use the absolute ellipsis `...'"));
2460 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2461 repertoire, now,
2462 last_charcode,
2463 last_charcode_len,
2464 class256_bit, class_bit,
2465 ignore_content,
2466 handle_digits);
2469 /* Now we have used the last value. */
2470 last_token = tok_none;
2473 /* Next we expect a semicolon or the end of the line. */
2474 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2475 if (now->tok == tok_eol || now->tok == tok_eof)
2476 break;
2478 if (last_token != tok_none
2479 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2481 if (now->tok == tok_ellipsis2_2)
2483 now->tok = tok_ellipsis2;
2484 step = 2;
2486 else if (now->tok == tok_ellipsis4_2)
2488 now->tok = tok_ellipsis4;
2489 step = 2;
2492 ellipsis_token = now->tok;
2494 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2495 continue;
2498 if (now->tok != tok_semicolon)
2499 goto err_label;
2501 /* And get the next character. */
2502 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2504 ellipsis_token = tok_none;
2505 step = 1;
2507 break;
2509 case tok_digit:
2510 /* Ignore the rest of the line if we don't need the input of
2511 this line. */
2512 if (ignore_content)
2514 lr_ignore_rest (ldfile, 0);
2515 break;
2518 handle_tok_digit:
2519 class_bit = _ISwdigit;
2520 class256_bit = _ISdigit;
2521 handle_digits = 1;
2522 goto read_charclass;
2524 case tok_outdigit:
2525 /* Ignore the rest of the line if we don't need the input of
2526 this line. */
2527 if (ignore_content)
2529 lr_ignore_rest (ldfile, 0);
2530 break;
2533 if (ctype->outdigits_act != 0)
2534 lr_error (ldfile, _("\
2535 %s: field `%s' declared more than once"),
2536 "LC_CTYPE", "outdigit");
2537 class_bit = 0;
2538 class256_bit = 0;
2539 handle_digits = 2;
2540 goto read_charclass;
2542 case tok_toupper:
2543 /* Ignore the rest of the line if we don't need the input of
2544 this line. */
2545 if (ignore_content)
2547 lr_ignore_rest (ldfile, 0);
2548 break;
2551 mapidx = 0;
2552 goto read_mapping;
2554 case tok_tolower:
2555 /* Ignore the rest of the line if we don't need the input of
2556 this line. */
2557 if (ignore_content)
2559 lr_ignore_rest (ldfile, 0);
2560 break;
2563 mapidx = 1;
2564 goto read_mapping;
2566 case tok_map:
2567 /* Ignore the rest of the line if we don't need the input of
2568 this line. */
2569 if (ignore_content)
2571 lr_ignore_rest (ldfile, 0);
2572 break;
2575 /* We simply forget the `map' keyword and use the following
2576 operand to determine the mapping. */
2577 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2578 if (now->tok == tok_ident || now->tok == tok_string)
2580 size_t cnt;
2582 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2583 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2584 break;
2586 if (cnt < ctype->map_collection_nr)
2587 free (now->val.str.startmb);
2588 else
2589 /* OK, it's a new map. */
2590 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2592 mapidx = cnt;
2594 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2595 goto err_label;
2596 else
2597 mapidx = now->tok - tok_toupper;
2599 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2600 /* This better should be a semicolon. */
2601 if (now->tok != tok_semicolon)
2602 goto err_label;
2604 read_mapping:
2605 /* Test whether this mapping was already defined. */
2606 if (ctype->tomap_done[mapidx])
2608 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2609 ctype->mapnames[mapidx]);
2610 lr_ignore_rest (ldfile, 0);
2611 break;
2613 ctype->tomap_done[mapidx] = 1;
2615 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2616 while (now->tok != tok_eol && now->tok != tok_eof)
2618 struct charseq *from_seq;
2619 uint32_t from_wch;
2620 struct charseq *to_seq;
2621 uint32_t to_wch;
2623 /* Every pair starts with an opening brace. */
2624 if (now->tok != tok_open_brace)
2625 goto err_label;
2627 /* Next comes the from-value. */
2628 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2629 if (get_character (now, charmap, repertoire, &from_seq,
2630 &from_wch) != 0)
2631 goto err_label;
2633 /* The next is a comma. */
2634 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2635 if (now->tok != tok_comma)
2636 goto err_label;
2638 /* And the other value. */
2639 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2640 if (get_character (now, charmap, repertoire, &to_seq,
2641 &to_wch) != 0)
2642 goto err_label;
2644 /* And the last thing is the closing brace. */
2645 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2646 if (now->tok != tok_close_brace)
2647 goto err_label;
2649 if (!ignore_content)
2651 /* Check whether the mapping converts from an ASCII value
2652 to a non-ASCII value. */
2653 if (from_seq != NULL && from_seq->nbytes == 1
2654 && isascii (from_seq->bytes[0])
2655 && to_seq != NULL && (to_seq->nbytes != 1
2656 || !isascii (to_seq->bytes[0])))
2657 ctype->to_nonascii = 1;
2659 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2660 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2661 /* We can use this value. */
2662 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2663 = to_seq->bytes[0];
2665 if (from_wch != ILLEGAL_CHAR_VALUE
2666 && to_wch != ILLEGAL_CHAR_VALUE)
2667 /* Both correct values. */
2668 *find_idx (ctype, &ctype->map_collection[mapidx],
2669 &ctype->map_collection_max[mapidx],
2670 &ctype->map_collection_act[mapidx],
2671 from_wch) = to_wch;
2674 /* Now comes a semicolon or the end of the line/file. */
2675 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2676 if (now->tok == tok_semicolon)
2677 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2679 break;
2681 case tok_translit_start:
2682 /* Ignore the entire translit section with its peculiar syntax
2683 if we don't need the input. */
2684 if (ignore_content)
2688 lr_ignore_rest (ldfile, 0);
2689 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2691 while (now->tok != tok_translit_end && now->tok != tok_eof);
2693 if (now->tok == tok_eof)
2694 lr_error (ldfile, _(\
2695 "%s: `translit_start' section does not end with `translit_end'"),
2696 "LC_CTYPE");
2698 break;
2701 /* The rest of the line better should be empty. */
2702 lr_ignore_rest (ldfile, 1);
2704 /* We count here the number of allocated entries in the `translit'
2705 array. */
2706 cnt = 0;
2708 ldfile->translate_strings = 1;
2709 ldfile->return_widestr = 1;
2711 /* We proceed until we see the `translit_end' token. */
2712 while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose),
2713 now->tok != tok_translit_end && now->tok != tok_eof)
2715 if (now->tok == tok_eol)
2716 /* Ignore empty lines. */
2717 continue;
2719 if (now->tok == tok_include)
2721 /* We have to include locale. */
2722 const char *locale_name;
2723 const char *repertoire_name;
2724 struct translit_include_t *include_stmt, **include_ptr;
2726 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2727 /* This should be a string or an identifier. In any
2728 case something to name a locale. */
2729 if (now->tok != tok_string && now->tok != tok_ident)
2731 translit_syntax:
2732 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2733 lr_ignore_rest (ldfile, 0);
2734 continue;
2736 locale_name = now->val.str.startmb;
2738 /* Next should be a semicolon. */
2739 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2740 if (now->tok != tok_semicolon)
2741 goto translit_syntax;
2743 /* Now the repertoire name. */
2744 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2745 if ((now->tok != tok_string && now->tok != tok_ident)
2746 || now->val.str.startmb == NULL)
2747 goto translit_syntax;
2748 repertoire_name = now->val.str.startmb;
2749 if (repertoire_name[0] == '\0')
2750 /* Ignore the empty string. */
2751 repertoire_name = NULL;
2753 /* Save the include statement for later processing. */
2754 include_stmt = (struct translit_include_t *)
2755 xmalloc (sizeof (struct translit_include_t));
2756 include_stmt->copy_locale = locale_name;
2757 include_stmt->copy_repertoire = repertoire_name;
2758 include_stmt->next = NULL;
2760 include_ptr = &ctype->translit_include;
2761 while (*include_ptr != NULL)
2762 include_ptr = &(*include_ptr)->next;
2763 *include_ptr = include_stmt;
2765 /* The rest of the line must be empty. */
2766 lr_ignore_rest (ldfile, 1);
2768 /* Make sure the locale is read. */
2769 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2770 1, NULL);
2771 continue;
2773 else if (now->tok == tok_default_missing)
2775 uint32_t *wstr;
2777 while (1)
2779 /* We expect a single character or string as the
2780 argument. */
2781 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2782 wstr = read_widestring (ldfile, now, charmap,
2783 repertoire);
2785 if (wstr != NULL)
2787 if (ctype->default_missing != NULL)
2789 lr_error (ldfile, _("\
2790 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2791 WITH_CUR_LOCALE (error_at_line (0, 0,
2792 ctype->default_missing_file,
2793 ctype->default_missing_lineno,
2794 _("\
2795 previous definition was here")));
2797 else
2799 ctype->default_missing = wstr;
2800 ctype->default_missing_file = ldfile->fname;
2801 ctype->default_missing_lineno = ldfile->lineno;
2803 /* We can have more entries, ignore them. */
2804 lr_ignore_rest (ldfile, 0);
2805 break;
2807 else if (wstr == (uint32_t *) -1l)
2808 /* This was a syntax error. */
2809 break;
2811 /* Maybe there is another replacement we can use. */
2812 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2813 if (now->tok == tok_eol || now->tok == tok_eof)
2815 /* Nothing found. We tell the user. */
2816 lr_error (ldfile, _("\
2817 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2818 break;
2820 if (now->tok != tok_semicolon)
2821 goto translit_syntax;
2824 continue;
2826 else if (now->tok == tok_translit_ignore)
2828 read_translit_ignore_entry (ldfile, ctype, charmap,
2829 repertoire);
2830 continue;
2833 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2835 ldfile->return_widestr = 0;
2837 if (now->tok == tok_eof)
2838 lr_error (ldfile, _(\
2839 "%s: `translit_start' section does not end with `translit_end'"),
2840 "LC_CTYPE");
2842 break;
2844 case tok_ident:
2845 /* Ignore the rest of the line if we don't need the input of
2846 this line. */
2847 if (ignore_content)
2849 lr_ignore_rest (ldfile, 0);
2850 break;
2853 /* This could mean one of several things. First test whether
2854 it's a character class name. */
2855 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2856 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2857 break;
2858 if (cnt < ctype->nr_charclass)
2860 class_bit = _ISwbit (cnt);
2861 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2862 free (now->val.str.startmb);
2863 goto read_charclass;
2865 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2866 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2867 break;
2868 if (cnt < ctype->map_collection_nr)
2870 mapidx = cnt;
2871 free (now->val.str.startmb);
2872 goto read_mapping;
2874 #ifdef PREDEFINED_CLASSES
2875 if (strcmp (now->val.str.startmb, "special1") == 0)
2877 class_bit = _ISwspecial1;
2878 free (now->val.str.startmb);
2879 goto read_charclass;
2881 if (strcmp (now->val.str.startmb, "special2") == 0)
2883 class_bit = _ISwspecial2;
2884 free (now->val.str.startmb);
2885 goto read_charclass;
2887 if (strcmp (now->val.str.startmb, "special3") == 0)
2889 class_bit = _ISwspecial3;
2890 free (now->val.str.startmb);
2891 goto read_charclass;
2893 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2895 mapidx = 2;
2896 goto read_mapping;
2898 #endif
2899 break;
2901 case tok_end:
2902 /* Next we assume `LC_CTYPE'. */
2903 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2904 if (now->tok == tok_eof)
2905 break;
2906 if (now->tok == tok_eol)
2907 lr_error (ldfile, _("%s: incomplete `END' line"),
2908 "LC_CTYPE");
2909 else if (now->tok != tok_lc_ctype)
2910 lr_error (ldfile, _("\
2911 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2912 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2913 return;
2915 default:
2916 err_label:
2917 if (now->tok != tok_eof)
2918 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2921 /* Prepare for the next round. */
2922 now = lr_token (ldfile, charmap, NULL, NULL, verbose);
2923 nowtok = now->tok;
2926 /* When we come here we reached the end of the file. */
2927 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
/* Supply default values for the parts of the LC_CTYPE definition that
   the locale source left out.  For each standard class whose bit is not
   set in ctype->class_done the default members are added to both the
   256-entry byte table (class256_collection) and the wide-character
   table (class_collection).  If ctype->tomap_done[0] / [1] are zero the
   default `toupper' / `tolower' mappings are generated, and finally the
   `outdigit' list is padded to exactly ten entries.
   CTYPE is updated in place; CHARMAP is used to look up the byte
   sequences of the default characters.  REPERTOIRE is not referenced in
   the body of this function.  */
2931 static void
2932 set_class_defaults (struct locale_ctype_t *ctype,
2933 const struct charmap_t *charmap,
2934 struct repertoire_t *repertoire)
2936 size_t cnt;
2938 /* This function defines the default values for the classes and conversions
2939 according to POSIX.2 2.5.2.1.
2940 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2941 Don't move them unless you know what you do! */
/* Forward declaration of the local helper defined right below.  */
2943 auto void set_default (int bitpos, int from, int to);
/* Add every character in the inclusive range FROM..TO to the class at
   bit position BITPOS, in the byte table (when the charmap encodes the
   character in one byte) and in the wide-character table.  */
2945 void set_default (int bitpos, int from, int to)
2947 char tmp[2];
2948 int ch;
2949 int bit = _ISbit (bitpos);
2950 int bitw = _ISwbit (bitpos);
2951 /* Define string. */
2952 strcpy (tmp, "?");
2954 for (ch = from; ch <= to; ++ch)
2956 struct charseq *seq;
2957 tmp[0] = ch;
/* Try the one-character symbolic name first, then fall back to the
   `U' plus eight hex digits form.  */
2959 seq = charmap_find_value (charmap, tmp, 1);
2960 if (seq == NULL)
/* Room for `U', eight hex digits and the terminating NUL.  */
2962 char buf[10];
2963 sprintf (buf, "U%08X", ch);
2964 seq = charmap_find_value (charmap, buf, 9);
2966 if (seq == NULL)
2968 if (!be_quiet)
2969 WITH_CUR_LOCALE (error (0, 0, _("\
2970 %s: character `%s' not defined while needed as default value"),
2971 "LC_CTYPE", tmp));
2973 else if (seq->nbytes != 1)
2974 WITH_CUR_LOCALE (error (0, 0, _("\
2975 %s: character `%s' in charmap not representable with one byte"),
2976 "LC_CTYPE", tmp));
2977 else
2978 ctype->class256_collection[seq->bytes[0]] |= bit;
2980 /* No need to search here, the ASCII value is also the Unicode
2981 value. */
2982 ELEM (ctype, class_collection, , ch) |= bitw;
2986 /* Set default values if keyword was not present. */
2987 if ((ctype->class_done & BITw (tok_upper)) == 0)
2988 /* "If this keyword [upper] is not specified, the uppercase letters
2989 `A' through `Z', ..., shall automatically belong to this class,
2990 with implementation defined character values." [P1003.2, 2.5.2.1] */
2991 set_default (BITPOS (tok_upper), 'A', 'Z');
2993 if ((ctype->class_done & BITw (tok_lower)) == 0)
2994 /* "If this keyword [lower] is not specified, the lowercase letters
2995 `a' through `z', ..., shall automatically belong to this class,
2996 with implementation defined character values." [P1003.2, 2.5.2.1] */
2997 set_default (BITPOS (tok_lower), 'a', 'z');
2999 if ((ctype->class_done & BITw (tok_alpha)) == 0)
3001 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3002 class `lower' *must* be in class `alpha'. */
3003 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
3004 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
3006 for (cnt = 0; cnt < 256; ++cnt)
3007 if ((ctype->class256_collection[cnt] & mask) != 0)
3008 ctype->class256_collection[cnt] |= BIT (tok_alpha);
3010 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3011 if ((ctype->class_collection[cnt] & maskw) != 0)
3012 ctype->class_collection[cnt] |= BITw (tok_alpha);
3015 if ((ctype->class_done & BITw (tok_digit)) == 0)
3016 /* "If this keyword [digit] is not specified, the digits `0' through
3017 `9', ..., shall automatically belong to this class, with
3018 implementation-defined character values." [P1003.2, 2.5.2.1] */
3019 set_default (BITPOS (tok_digit), '0', '9');
3021 /* "Only characters specified for the `alpha' and `digit' keyword
3022 shall be specified. Characters specified for the keyword `alpha'
3023 and `digit' are automatically included in this class."
   Unlike the other classes this propagation into `alnum' is not
   guarded by a class_done test.  */
3025 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
3026 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
3028 for (cnt = 0; cnt < 256; ++cnt)
3029 if ((ctype->class256_collection[cnt] & mask) != 0)
3030 ctype->class256_collection[cnt] |= BIT (tok_alnum);
3032 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3033 if ((ctype->class_collection[cnt] & maskw) != 0)
3034 ctype->class_collection[cnt] |= BITw (tok_alnum);
3037 if ((ctype->class_done & BITw (tok_space)) == 0)
3038 /* "If this keyword [space] is not specified, the characters <space>,
3039 <form-feed>, <newline>, <carriage-return>, <tab>, and
3040 <vertical-tab>, ..., shall automatically belong to this class,
3041 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3043 struct charseq *seq;
/* <space>: try the names "space" and "SP", then the U00000020 form.  */
3045 seq = charmap_find_value (charmap, "space", 5);
3046 if (seq == NULL)
3047 seq = charmap_find_value (charmap, "SP", 2);
3048 if (seq == NULL)
3049 seq = charmap_find_value (charmap, "U00000020", 9);
3050 if (seq == NULL)
3052 if (!be_quiet)
3053 WITH_CUR_LOCALE (error (0, 0, _("\
3054 %s: character `%s' not defined while needed as default value"),
3055 "LC_CTYPE", "<space>"));
3057 else if (seq->nbytes != 1)
3058 WITH_CUR_LOCALE (error (0, 0, _("\
3059 %s: character `%s' in charmap not representable with one byte"),
3060 "LC_CTYPE", "<space>"));
3061 else
3062 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3064 /* No need to search. */
3065 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
/* <form-feed>.  */
3067 seq = charmap_find_value (charmap, "form-feed", 9);
3068 if (seq == NULL)
3069 seq = charmap_find_value (charmap, "U0000000C", 9);
3070 if (seq == NULL)
3072 if (!be_quiet)
3073 WITH_CUR_LOCALE (error (0, 0, _("\
3074 %s: character `%s' not defined while needed as default value"),
3075 "LC_CTYPE", "<form-feed>"));
3077 else if (seq->nbytes != 1)
3078 WITH_CUR_LOCALE (error (0, 0, _("\
3079 %s: character `%s' in charmap not representable with one byte"),
3080 "LC_CTYPE", "<form-feed>"));
3081 else
3082 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3084 /* No need to search. */
3085 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
/* <newline>.  */
3088 seq = charmap_find_value (charmap, "newline", 7);
3089 if (seq == NULL)
3090 seq = charmap_find_value (charmap, "U0000000A", 9);
3091 if (seq == NULL)
3093 if (!be_quiet)
3094 WITH_CUR_LOCALE (error (0, 0, _("\
3095 %s: character `%s' not defined while needed as default value"),
3096 "LC_CTYPE", "<newline>"));
3098 else if (seq->nbytes != 1)
3099 WITH_CUR_LOCALE (error (0, 0, _("\
3100 %s: character `%s' in charmap not representable with one byte"),
3101 "LC_CTYPE", "<newline>"));
3102 else
3103 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3105 /* No need to search. */
3106 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
/* <carriage-return>.  */
3109 seq = charmap_find_value (charmap, "carriage-return", 15);
3110 if (seq == NULL)
3111 seq = charmap_find_value (charmap, "U0000000D", 9);
3112 if (seq == NULL)
3114 if (!be_quiet)
3115 WITH_CUR_LOCALE (error (0, 0, _("\
3116 %s: character `%s' not defined while needed as default value"),
3117 "LC_CTYPE", "<carriage-return>"));
3119 else if (seq->nbytes != 1)
3120 WITH_CUR_LOCALE (error (0, 0, _("\
3121 %s: character `%s' in charmap not representable with one byte"),
3122 "LC_CTYPE", "<carriage-return>"));
3123 else
3124 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3126 /* No need to search. */
3127 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
/* <tab>.  */
3130 seq = charmap_find_value (charmap, "tab", 3);
3131 if (seq == NULL)
3132 seq = charmap_find_value (charmap, "U00000009", 9);
3133 if (seq == NULL)
3135 if (!be_quiet)
3136 WITH_CUR_LOCALE (error (0, 0, _("\
3137 %s: character `%s' not defined while needed as default value"),
3138 "LC_CTYPE", "<tab>"));
3140 else if (seq->nbytes != 1)
3141 WITH_CUR_LOCALE (error (0, 0, _("\
3142 %s: character `%s' in charmap not representable with one byte"),
3143 "LC_CTYPE", "<tab>"));
3144 else
3145 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3147 /* No need to search. */
3148 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
/* <vertical-tab>.  */
3151 seq = charmap_find_value (charmap, "vertical-tab", 12);
3152 if (seq == NULL)
3153 seq = charmap_find_value (charmap, "U0000000B", 9);
3154 if (seq == NULL)
3156 if (!be_quiet)
3157 WITH_CUR_LOCALE (error (0, 0, _("\
3158 %s: character `%s' not defined while needed as default value"),
3159 "LC_CTYPE", "<vertical-tab>"));
3161 else if (seq->nbytes != 1)
3162 WITH_CUR_LOCALE (error (0, 0, _("\
3163 %s: character `%s' in charmap not representable with one byte"),
3164 "LC_CTYPE", "<vertical-tab>"));
3165 else
3166 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3168 /* No need to search. */
3169 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3172 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3173 /* "If this keyword is not specified, the digits `0' to `9', the
3174 uppercase letters `A' through `F', and the lowercase letters `a'
3175 through `f', ..., shall automatically belong to this class, with
3176 implementation defined character values." [P1003.2, 2.5.2.1] */
3178 set_default (BITPOS (tok_xdigit), '0', '9');
3179 set_default (BITPOS (tok_xdigit), 'A', 'F');
3180 set_default (BITPOS (tok_xdigit), 'a', 'f');
3183 if ((ctype->class_done & BITw (tok_blank)) == 0)
3184 /* "If this keyword [blank] is unspecified, the characters <space> and
3185 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3187 struct charseq *seq;
3189 seq = charmap_find_value (charmap, "space", 5);
3190 if (seq == NULL)
3191 seq = charmap_find_value (charmap, "SP", 2);
3192 if (seq == NULL)
3193 seq = charmap_find_value (charmap, "U00000020", 9);
3194 if (seq == NULL)
3196 if (!be_quiet)
3197 WITH_CUR_LOCALE (error (0, 0, _("\
3198 %s: character `%s' not defined while needed as default value"),
3199 "LC_CTYPE", "<space>"));
3201 else if (seq->nbytes != 1)
3202 WITH_CUR_LOCALE (error (0, 0, _("\
3203 %s: character `%s' in charmap not representable with one byte"),
3204 "LC_CTYPE", "<space>"));
3205 else
3206 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3208 /* No need to search. */
3209 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3212 seq = charmap_find_value (charmap, "tab", 3);
3213 if (seq == NULL)
3214 seq = charmap_find_value (charmap, "U00000009", 9);
3215 if (seq == NULL)
3217 if (!be_quiet)
3218 WITH_CUR_LOCALE (error (0, 0, _("\
3219 %s: character `%s' not defined while needed as default value"),
3220 "LC_CTYPE", "<tab>"));
3222 else if (seq->nbytes != 1)
3223 WITH_CUR_LOCALE (error (0, 0, _("\
3224 %s: character `%s' in charmap not representable with one byte"),
3225 "LC_CTYPE", "<tab>"));
3226 else
3227 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3229 /* No need to search. */
3230 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3233 if ((ctype->class_done & BITw (tok_graph)) == 0)
3234 /* "If this keyword [graph] is not specified, characters specified for
3235 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3236 shall belong to this character class." [P1003.2, 2.5.2.1] */
3238 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3239 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3240 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3241 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3242 BITw (tok_punct);
3243 size_t cnt;
3245 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3246 if ((ctype->class_collection[cnt] & maskw) != 0)
3247 ctype->class_collection[cnt] |= BITw (tok_graph);
3249 for (cnt = 0; cnt < 256; ++cnt)
3250 if ((ctype->class256_collection[cnt] & mask) != 0)
3251 ctype->class256_collection[cnt] |= BIT (tok_graph);
3254 if ((ctype->class_done & BITw (tok_print)) == 0)
3255 /* "If this keyword [print] is not provided, characters specified for
3256 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3257 and the <space> character shall belong to this character class."
3258 [P1003.2, 2.5.2.1] */
3260 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3261 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3262 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3263 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3264 BITw (tok_punct);
3265 size_t cnt;
3266 struct charseq *seq;
3268 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3269 if ((ctype->class_collection[cnt] & maskw) != 0)
3270 ctype->class_collection[cnt] |= BITw (tok_print);
3272 for (cnt = 0; cnt < 256; ++cnt)
3273 if ((ctype->class256_collection[cnt] & mask) != 0)
3274 ctype->class256_collection[cnt] |= BIT (tok_print);
/* In addition to the classes above, `print' also gets <space>.  */
3277 seq = charmap_find_value (charmap, "space", 5);
3278 if (seq == NULL)
3279 seq = charmap_find_value (charmap, "SP", 2);
3280 if (seq == NULL)
3281 seq = charmap_find_value (charmap, "U00000020", 9);
3282 if (seq == NULL)
3284 if (!be_quiet)
3285 WITH_CUR_LOCALE (error (0, 0, _("\
3286 %s: character `%s' not defined while needed as default value"),
3287 "LC_CTYPE", "<space>"));
3289 else if (seq->nbytes != 1)
3290 WITH_CUR_LOCALE (error (0, 0, _("\
3291 %s: character `%s' in charmap not representable with one byte"),
3292 "LC_CTYPE", "<space>"));
3293 else
3294 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
3296 /* No need to search. */
3297 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
3300 if (ctype->tomap_done[0] == 0)
3301 /* "If this keyword [toupper] is not specified, the lowercase letters
3302 `a' through `z', and their corresponding uppercase letters `A' to
3303 `Z', ..., shall automatically be included, with implementation-
3304 defined character values." [P1003.2, 2.5.2.1] */
3306 char tmp[4];
3307 int ch;
/* TMP holds "<c>" for the error messages; &tmp[1] with length 1 is
   the bare character name handed to the charmap lookup.  */
3309 strcpy (tmp, "<?>");
3311 for (ch = 'a'; ch <= 'z'; ++ch)
3313 struct charseq *seq_from, *seq_to;
3315 tmp[1] = (char) ch;
3317 seq_from = charmap_find_value (charmap, &tmp[1], 1);
3318 if (seq_from == NULL)
3320 char buf[10];
3321 sprintf (buf, "U%08X", ch);
3322 seq_from = charmap_find_value (charmap, buf, 9);
3324 if (seq_from == NULL)
3326 if (!be_quiet)
3327 WITH_CUR_LOCALE (error (0, 0, _("\
3328 %s: character `%s' not defined while needed as default value"),
3329 "LC_CTYPE", tmp));
3331 else if (seq_from->nbytes != 1)
3333 if (!be_quiet)
3334 WITH_CUR_LOCALE (error (0, 0, _("\
3335 %s: character `%s' needed as default value not representable with one byte"),
3336 "LC_CTYPE", tmp));
3338 else
3340 /* This conversion is implementation defined. */
3341 tmp[1] = (char) (ch + ('A' - 'a'));
3342 seq_to = charmap_find_value (charmap, &tmp[1], 1);
3343 if (seq_to == NULL)
3345 char buf[10];
3346 sprintf (buf, "U%08X", ch + ('A' - 'a'));
3347 seq_to = charmap_find_value (charmap, buf, 9);
3349 if (seq_to == NULL)
3351 if (!be_quiet)
3352 WITH_CUR_LOCALE (error (0, 0, _("\
3353 %s: character `%s' not defined while needed as default value"),
3354 "LC_CTYPE", tmp));
3356 else if (seq_to->nbytes != 1)
3358 if (!be_quiet)
3359 WITH_CUR_LOCALE (error (0, 0, _("\
3360 %s: character `%s' needed as default value not representable with one byte"),
3361 "LC_CTYPE", tmp));
3363 else
3364 /* The index [0] is determined by the order of the
3365 `ctype_map_newP' calls in `ctype_startup'. */
3366 ctype->map256_collection[0][seq_from->bytes[0]]
3367 = seq_to->bytes[0];
3370 /* No need to search. */
3371 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3375 if (ctype->tomap_done[1] == 0)
3376 /* "If this keyword [tolower] is not specified, the mapping shall be
3377 the reverse mapping of the one specified to `toupper'." [P1003.2] */
/* Wide table: for every non-zero toupper entry, store charnames[cnt]
   under the mapped value.  NOTE(review): this presumes charnames[cnt]
   is the character whose toupper value sits at index cnt -- confirm
   against find_idx.  */
3379 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3380 if (ctype->map_collection[0][cnt] != 0)
3381 ELEM (ctype, map_collection, [1],
3382 ctype->map_collection[0][cnt])
3383 = ctype->charnames[cnt];
/* Byte table: invert every non-zero toupper entry.  */
3385 for (cnt = 0; cnt < 256; ++cnt)
3386 if (ctype->map256_collection[0][cnt] != 0)
3387 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
/* Make sure there are exactly ten output digits.  A partially filled
   list is reported; the missing slots are then looked up in the charmap
   under the names from `digits', `longnames' and `uninames', in that
   order.  */
3390 if (ctype->outdigits_act != 10)
3392 if (ctype->outdigits_act != 0)
3393 WITH_CUR_LOCALE (error (0, 0, _("\
3394 %s: field `%s' does not contain exactly ten entries"),
3395 "LC_CTYPE", "outdigit"));
3397 for (cnt = ctype->outdigits_act; cnt < 10; ++cnt)
3399 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3400 (char *) digits + cnt,
3403 if (ctype->mboutdigits[cnt] == NULL)
3404 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3405 longnames[cnt],
3406 strlen (longnames[cnt]));
3408 if (ctype->mboutdigits[cnt] == NULL)
3409 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3410 uninames[cnt], 9);
3412 if (ctype->mboutdigits[cnt] == NULL)
3414 /* Provide a replacement. */
3415 WITH_CUR_LOCALE (error (0, 0, _("\
3416 no output digits defined and none of the standard names in the charmap")));
/* Allocate a one-byte sequence from the charmap's obstack; the cast
   drops the `const' qualifier of CHARMAP to reach mem_pool.  */
3418 ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool,
3419 sizeof (struct charseq)
3420 + 1);
3422 /* This is better than nothing. */
3423 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3424 ctype->mboutdigits[cnt]->nbytes = 1;
3427 ctype->wcoutdigits[cnt] = L'0' + cnt;
3430 ctype->outdigits_act = 10;
3435 /* Initialize. Assumes t->p and t->q have already been set. */
3436 static inline void
3437 wctype_table_init (struct wctype_table *t)
3439 t->level1 = NULL;
3440 t->level1_alloc = t->level1_size = 0;
3441 t->level2 = NULL;
3442 t->level2_alloc = t->level2_size = 0;
3443 t->level3 = NULL;
3444 t->level3_alloc = t->level3_size = 0;
3447 /* Retrieve an entry. */
3448 static inline int
3449 wctype_table_get (struct wctype_table *t, uint32_t wc)
3451 uint32_t index1 = wc >> (t->q + t->p + 5);
3452 if (index1 < t->level1_size)
3454 uint32_t lookup1 = t->level1[index1];
3455 if (lookup1 != EMPTY)
3457 uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1))
3458 + (lookup1 << t->q);
3459 uint32_t lookup2 = t->level2[index2];
3460 if (lookup2 != EMPTY)
3462 uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1))
3463 + (lookup2 << t->p);
3464 uint32_t lookup3 = t->level3[index3];
3465 uint32_t index4 = wc & 0x1f;
3467 return (lookup3 >> index4) & 1;
3471 return 0;
3474 /* Add one entry. */
3475 static void
3476 wctype_table_add (struct wctype_table *t, uint32_t wc)
3478 uint32_t index1 = wc >> (t->q + t->p + 5);
3479 uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1);
3480 uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1);
3481 uint32_t index4 = wc & 0x1f;
3482 size_t i, i1, i2;
3484 if (index1 >= t->level1_size)
3486 if (index1 >= t->level1_alloc)
3488 size_t alloc = 2 * t->level1_alloc;
3489 if (alloc <= index1)
3490 alloc = index1 + 1;
3491 t->level1 = (uint32_t *) xrealloc ((char *) t->level1,
3492 alloc * sizeof (uint32_t));
3493 t->level1_alloc = alloc;
3495 while (index1 >= t->level1_size)
3496 t->level1[t->level1_size++] = EMPTY;
3499 if (t->level1[index1] == EMPTY)
3501 if (t->level2_size == t->level2_alloc)
3503 size_t alloc = 2 * t->level2_alloc + 1;
3504 t->level2 = (uint32_t *) xrealloc ((char *) t->level2,
3505 (alloc << t->q) * sizeof (uint32_t));
3506 t->level2_alloc = alloc;
3508 i1 = t->level2_size << t->q;
3509 i2 = (t->level2_size + 1) << t->q;
3510 for (i = i1; i < i2; i++)
3511 t->level2[i] = EMPTY;
3512 t->level1[index1] = t->level2_size++;
3515 index2 += t->level1[index1] << t->q;
3517 if (t->level2[index2] == EMPTY)
3519 if (t->level3_size == t->level3_alloc)
3521 size_t alloc = 2 * t->level3_alloc + 1;
3522 t->level3 = (uint32_t *) xrealloc ((char *) t->level3,
3523 (alloc << t->p) * sizeof (uint32_t));
3524 t->level3_alloc = alloc;
3526 i1 = t->level3_size << t->p;
3527 i2 = (t->level3_size + 1) << t->p;
3528 for (i = i1; i < i2; i++)
3529 t->level3[i] = 0;
3530 t->level2[index2] = t->level3_size++;
3533 index3 += t->level2[index2] << t->p;
3535 t->level3[index3] |= (uint32_t)1 << index4;
3538 /* Finalize and shrink. */
3539 static void
3540 add_locale_wctype_table (struct locale_file *file, struct wctype_table *t)
3542 size_t i, j, k;
3543 uint32_t reorder3[t->level3_size];
3544 uint32_t reorder2[t->level2_size];
3545 uint32_t level2_offset, level3_offset;
3547 /* Uniquify level3 blocks. */
3548 k = 0;
3549 for (j = 0; j < t->level3_size; j++)
3551 for (i = 0; i < k; i++)
3552 if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p],
3553 (1 << t->p) * sizeof (uint32_t)) == 0)
3554 break;
3555 /* Relocate block j to block i. */
3556 reorder3[j] = i;
3557 if (i == k)
3559 if (i != j)
3560 memcpy (&t->level3[i << t->p], &t->level3[j << t->p],
3561 (1 << t->p) * sizeof (uint32_t));
3562 k++;
3565 t->level3_size = k;
3567 for (i = 0; i < (t->level2_size << t->q); i++)
3568 if (t->level2[i] != EMPTY)
3569 t->level2[i] = reorder3[t->level2[i]];
3571 /* Uniquify level2 blocks. */
3572 k = 0;
3573 for (j = 0; j < t->level2_size; j++)
3575 for (i = 0; i < k; i++)
3576 if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q],
3577 (1 << t->q) * sizeof (uint32_t)) == 0)
3578 break;
3579 /* Relocate block j to block i. */
3580 reorder2[j] = i;
3581 if (i == k)
3583 if (i != j)
3584 memcpy (&t->level2[i << t->q], &t->level2[j << t->q],
3585 (1 << t->q) * sizeof (uint32_t));
3586 k++;
3589 t->level2_size = k;
3591 for (i = 0; i < t->level1_size; i++)
3592 if (t->level1[i] != EMPTY)
3593 t->level1[i] = reorder2[t->level1[i]];
3595 t->result_size =
3596 5 * sizeof (uint32_t)
3597 + t->level1_size * sizeof (uint32_t)
3598 + (t->level2_size << t->q) * sizeof (uint32_t)
3599 + (t->level3_size << t->p) * sizeof (uint32_t);
3601 level2_offset =
3602 5 * sizeof (uint32_t)
3603 + t->level1_size * sizeof (uint32_t);
3604 level3_offset =
3605 5 * sizeof (uint32_t)
3606 + t->level1_size * sizeof (uint32_t)
3607 + (t->level2_size << t->q) * sizeof (uint32_t);
3609 start_locale_structure (file);
3610 add_locale_uint32 (file, t->q + t->p + 5);
3611 add_locale_uint32 (file, t->level1_size);
3612 add_locale_uint32 (file, t->p + 5);
3613 add_locale_uint32 (file, (1 << t->q) - 1);
3614 add_locale_uint32 (file, (1 << t->p) - 1);
3616 for (i = 0; i < t->level1_size; i++)
3617 add_locale_uint32
3618 (file,
3619 t->level1[i] == EMPTY
3621 : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset);
3623 for (i = 0; i < (t->level2_size << t->q); i++)
3624 add_locale_uint32
3625 (file,
3626 t->level2[i] == EMPTY
3628 : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset);
3630 add_locale_uint32_array (file, t->level3, t->level3_size << t->p);
3631 end_locale_structure (file);
3633 if (t->level1_alloc > 0)
3634 free (t->level1);
3635 if (t->level2_alloc > 0)
3636 free (t->level2);
3637 if (t->level3_alloc > 0)
3638 free (t->level3);
3641 /* Flattens the included transliterations into a translit list.
3642 Inserts them in the list at `cursor', and returns the new cursor. */
3643 static struct translit_t **
3644 translit_flatten (struct locale_ctype_t *ctype,
3645 const struct charmap_t *charmap,
3646 struct translit_t **cursor)
3648 while (ctype->translit_include != NULL)
3650 const char *copy_locale = ctype->translit_include->copy_locale;
3651 const char *copy_repertoire = ctype->translit_include->copy_repertoire;
3652 struct localedef_t *other;
3654 /* Unchain the include statement. During the depth-first traversal
3655 we don't want to visit any locale more than once. */
3656 ctype->translit_include = ctype->translit_include->next;
3658 other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap);
3660 if (other == NULL || other->categories[LC_CTYPE].ctype == NULL)
3662 WITH_CUR_LOCALE (error (0, 0, _("\
3663 %s: transliteration data from locale `%s' not available"),
3664 "LC_CTYPE", copy_locale));
3666 else
3668 struct locale_ctype_t *other_ctype =
3669 other->categories[LC_CTYPE].ctype;
3671 cursor = translit_flatten (other_ctype, charmap, cursor);
3672 assert (other_ctype->translit_include == NULL);
3674 if (other_ctype->translit != NULL)
3676 /* Insert the other_ctype->translit list at *cursor. */
3677 struct translit_t *endp = other_ctype->translit;
3678 while (endp->next != NULL)
3679 endp = endp->next;
3681 endp->next = *cursor;
3682 *cursor = other_ctype->translit;
3684 /* Avoid any risk of circular lists. */
3685 other_ctype->translit = NULL;
3687 cursor = &endp->next;
3690 if (ctype->default_missing == NULL)
3691 ctype->default_missing = other_ctype->default_missing;
3695 return cursor;
3698 static void
3699 allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap,
3700 struct repertoire_t *repertoire)
3702 size_t idx, nr;
3703 const void *key;
3704 size_t len;
3705 void *vdata;
3706 void *curs;
3708 /* You wonder about this amount of memory? This is only because some
3709 users do not manage to address the array with unsigned values or
3710 data types with range >= 256. '\200' would result in the array
3711 index -128. To help these poor people we duplicate the entries for
3712 128 up to 255 below the entry for \0. */
3713 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t));
3714 ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t));
3715 ctype->class_b = (uint32_t **)
3716 xmalloc (ctype->nr_charclass * sizeof (uint32_t *));
3717 ctype->class_3level = (struct wctype_table *)
3718 xmalloc (ctype->nr_charclass * sizeof (struct wctype_table));
3720 /* This is the array accessed using the multibyte string elements. */
3721 for (idx = 0; idx < 256; ++idx)
3722 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3724 /* Mirror first 127 entries. We must take care that entry -1 is not
3725 mirrored because EOF == -1. */
3726 for (idx = 0; idx < 127; ++idx)
3727 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3729 /* The 32 bit array contains all characters < 0x100. */
3730 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3731 if (ctype->charnames[idx] < 0x100)
3732 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3734 for (nr = 0; nr < ctype->nr_charclass; nr++)
3736 ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3738 /* We only set CLASS_B for the bits in the ISO C classes, not
3739 the user defined classes. The number should not change but
3740 who knows. */
3741 #define LAST_ISO_C_BIT 11
3742 if (nr <= LAST_ISO_C_BIT)
3743 for (idx = 0; idx < 256; ++idx)
3744 if (ctype->class256_collection[idx] & _ISbit (nr))
3745 ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f);
3748 for (nr = 0; nr < ctype->nr_charclass; nr++)
3750 struct wctype_table *t;
3752 t = &ctype->class_3level[nr];
3753 t->p = 4; /* or: 5 */
3754 t->q = 7; /* or: 6 */
3755 wctype_table_init (t);
3757 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3758 if (ctype->class_collection[idx] & _ISwbit (nr))
3759 wctype_table_add (t, ctype->charnames[idx]);
3761 if (verbose)
3762 WITH_CUR_LOCALE (fprintf (stderr, _("\
3763 %s: table for class \"%s\": %lu bytes\n"),
3764 "LC_CTYPE", ctype->classnames[nr],
3765 (unsigned long int) t->result_size));
3768 /* Room for table of mappings. */
3769 ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3770 ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr
3771 * sizeof (uint32_t *));
3772 ctype->map_3level = (struct wctrans_table *)
3773 xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table));
3775 /* Fill in all mappings. */
3776 for (idx = 0; idx < 2; ++idx)
3778 unsigned int idx2;
3780 /* Allocate table. */
3781 ctype->map_b[idx] = (uint32_t *)
3782 xmalloc ((256 + 128) * sizeof (uint32_t));
3784 /* Copy values from collection. */
3785 for (idx2 = 0; idx2 < 256; ++idx2)
3786 ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3788 /* Mirror first 127 entries. We must take care not to map entry
3789 -1 because EOF == -1. */
3790 for (idx2 = 0; idx2 < 127; ++idx2)
3791 ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2];
3793 /* EOF must map to EOF. */
3794 ctype->map_b[idx][127] = EOF;
3797 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3799 unsigned int idx2;
3801 /* Allocate table. */
3802 ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3804 /* Copy values from collection. Default is identity mapping. */
3805 for (idx2 = 0; idx2 < 256; ++idx2)
3806 ctype->map32_b[idx][idx2] =
3807 (ctype->map_collection[idx][idx2] != 0
3808 ? ctype->map_collection[idx][idx2]
3809 : idx2);
3812 for (nr = 0; nr < ctype->map_collection_nr; nr++)
3814 struct wctrans_table *t;
3816 t = &ctype->map_3level[nr];
3817 t->p = 7;
3818 t->q = 9;
3819 wctrans_table_init (t);
3821 for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx)
3822 if (ctype->map_collection[nr][idx] != 0)
3823 wctrans_table_add (t, ctype->charnames[idx],
3824 ctype->map_collection[nr][idx]);
3826 if (verbose)
3827 WITH_CUR_LOCALE (fprintf (stderr, _("\
3828 %s: table for map \"%s\": %lu bytes\n"),
3829 "LC_CTYPE", ctype->mapnames[nr],
3830 (unsigned long int) t->result_size));
3833 /* Extra array for class and map names. */
3834 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3835 * sizeof (uint32_t));
3836 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3837 * sizeof (uint32_t));
3839 ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
3840 ctype->map_offset = ctype->class_offset + ctype->nr_charclass;
3842 /* Array for width information. Because the expected widths are very
3843 small (never larger than 2) we use only one single byte. This
3844 saves space.
3845 We put only printable characters in the table. wcwidth is specified
3846 to return -1 for non-printable characters. Doing the check here
3847 saves a run-time check.
3848 But we put L'\0' in the table. This again saves a run-time check. */
3850 struct wcwidth_table *t;
3852 t = &ctype->width;
3853 t->p = 7;
3854 t->q = 9;
3855 wcwidth_table_init (t);
3857 /* First set all the printable characters of the character set to
3858 the default width. */
3859 curs = NULL;
3860 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
3862 struct charseq *data = (struct charseq *) vdata;
3864 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
3865 data->ucs4 = repertoire_find_value (ctype->repertoire,
3866 data->name, len);
3868 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
3870 uint32_t *class_bits =
3871 find_idx (ctype, &ctype->class_collection, NULL,
3872 &ctype->class_collection_act, data->ucs4);
3874 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3875 wcwidth_table_add (t, data->ucs4, charmap->width_default);
3879 /* Now add the explicitly specified widths. */
3880 if (charmap->width_rules != NULL)
3882 size_t cnt;
3884 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3886 unsigned char bytes[charmap->mb_cur_max];
3887 int nbytes = charmap->width_rules[cnt].from->nbytes;
3889 /* We have the range of character for which the width is
3890 specified described using byte sequences of the multibyte
3891 charset. We have to convert this to UCS4 now. And we
3892 cannot simply convert the beginning and the end of the
3893 sequence, we have to iterate over the byte sequence and
3894 convert it for every single character. */
3895 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3897 while (nbytes < charmap->width_rules[cnt].to->nbytes
3898 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3899 nbytes) <= 0)
3901 /* Find the UCS value for `bytes'. */
3902 int inner;
3903 uint32_t wch;
3904 struct charseq *seq =
3905 charmap_find_symbol (charmap, (char *) bytes, nbytes);
3907 if (seq == NULL)
3908 wch = ILLEGAL_CHAR_VALUE;
3909 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3910 wch = seq->ucs4;
3911 else
3912 wch = repertoire_find_value (ctype->repertoire, seq->name,
3913 strlen (seq->name));
3915 if (wch != ILLEGAL_CHAR_VALUE)
3917 /* Store the value. */
3918 uint32_t *class_bits =
3919 find_idx (ctype, &ctype->class_collection, NULL,
3920 &ctype->class_collection_act, wch);
3922 if (class_bits != NULL && (*class_bits & BITw (tok_print)))
3923 wcwidth_table_add (t, wch,
3924 charmap->width_rules[cnt].width);
3927 /* "Increment" the bytes sequence. */
3928 inner = nbytes - 1;
3929 while (inner >= 0 && bytes[inner] == 0xff)
3930 --inner;
3932 if (inner < 0)
3934 /* We have to extend the byte sequence. */
3935 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3936 break;
3938 bytes[0] = 1;
3939 memset (&bytes[1], 0, nbytes);
3940 ++nbytes;
3942 else
3944 ++bytes[inner];
3945 while (++inner < nbytes)
3946 bytes[inner] = 0;
3952 /* Set the width of L'\0' to 0. */
3953 wcwidth_table_add (t, 0, 0);
3955 if (verbose)
3956 WITH_CUR_LOCALE (fprintf (stderr, _("%s: table for width: %lu bytes\n"),
3957 "LC_CTYPE", (unsigned long int) t->result_size));
3960 /* Set MB_CUR_MAX. */
3961 ctype->mb_cur_max = charmap->mb_cur_max;
3963 /* Now determine the table for the transliteration information.
3965 XXX It is not yet clear to me whether it is worth implementing a
3966 complicated algorithm which uses a hash table to locate the entries.
3967 For now I'll use a simple array which can be searched using binary
3968 search. */
3969 if (ctype->translit_include != NULL)
3970 /* Traverse the locales mentioned in the `include' statements in a
3971 depth-first way and fold in their transliteration information. */
3972 translit_flatten (ctype, charmap, &ctype->translit);
3974 if (ctype->translit != NULL)
3976 /* First count how many entries we have. This is the upper limit
3977 since some entries from the included files might be overwritten. */
3978 size_t number = 0;
3979 size_t cnt;
3980 struct translit_t *runp = ctype->translit;
3981 struct translit_t **sorted;
3982 size_t from_len, to_len;
3984 while (runp != NULL)
3986 ++number;
3987 runp = runp->next;
3990 /* Next we allocate an array large enough and fill in the values. */
3991 sorted = (struct translit_t **) alloca (number
3992 * sizeof (struct translit_t **));
3993 runp = ctype->translit;
3994 number = 0;
3997 /* Search for the place where to insert this string.
3998 XXX Better use a real sorting algorithm later. */
3999 size_t idx = 0;
4000 int replace = 0;
4002 while (idx < number)
4004 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
4005 (const wchar_t *) runp->from);
4006 if (res == 0)
4008 replace = 1;
4009 break;
4011 if (res > 0)
4012 break;
4013 ++idx;
4016 if (replace)
4017 sorted[idx] = runp;
4018 else
4020 memmove (&sorted[idx + 1], &sorted[idx],
4021 (number - idx) * sizeof (struct translit_t *));
4022 sorted[idx] = runp;
4023 ++number;
4026 runp = runp->next;
4028 while (runp != NULL);
4030 /* The next step is putting all the possible transliteration
4031 strings in one memory block so that we can write it out.
4032 We need several different blocks:
4033 - index to the from-string array
4034 - from-string array
4035 - index to the to-string array
4036 - to-string array.
4038 from_len = to_len = 0;
4039 for (cnt = 0; cnt < number; ++cnt)
4041 struct translit_to_t *srunp;
4042 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4043 srunp = sorted[cnt]->to;
4044 while (srunp != NULL)
4046 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
4047 srunp = srunp->next;
4049 /* Plus one for the extra NUL character marking the end of
4050 the list for the current entry. */
4051 ++to_len;
4054 /* We can allocate the arrays for the results. */
4055 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
4056 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
4057 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
4058 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
4060 from_len = 0;
4061 to_len = 0;
4062 for (cnt = 0; cnt < number; ++cnt)
4064 size_t len;
4065 struct translit_to_t *srunp;
4067 ctype->translit_from_idx[cnt] = from_len;
4068 ctype->translit_to_idx[cnt] = to_len;
4070 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4071 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
4072 (const wchar_t *) sorted[cnt]->from, len);
4073 from_len += len;
4075 ctype->translit_to_idx[cnt] = to_len;
4076 srunp = sorted[cnt]->to;
4077 while (srunp != NULL)
4079 len = wcslen ((const wchar_t *) srunp->str) + 1;
4080 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
4081 (const wchar_t *) srunp->str, len);
4082 to_len += len;
4083 srunp = srunp->next;
4085 ctype->translit_to_tbl[to_len++] = L'\0';
4088 /* Store the information about the length. */
4089 ctype->translit_idx_size = number;
4090 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
4091 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
4093 else
4095 /* Provide some dummy pointers since we have nothing to write out. */
4096 static uint32_t no_str = { 0 };
4098 ctype->translit_from_idx = &no_str;
4099 ctype->translit_from_tbl = &no_str;
4100 ctype->translit_to_tbl = &no_str;
4101 ctype->translit_idx_size = 0;
4102 ctype->translit_from_tbl_size = 0;
4103 ctype->translit_to_tbl_size = 0;