1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
37 #include "localeinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
42 #include "localedef.h"
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  */
49 #define _ISwspecial1 (1 << 29)
50 #define _ISwspecial2 (1 << 30)
51 #define _ISwspecial3 (1 << 31)
54 /* The bit used for representing a special class. */
55 #define BITPOS(class) ((class) - tok_upper)
56 #define BIT(class) (_ISbit (BITPOS (class)))
57 #define BITw(class) (_ISwbit (BITPOS (class)))
59 #define ELEM(ctype, collection, idx, value) \
60 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
61 &ctype->collection##_act idx, value)
64 /* To be compatible with former implementations we for now restrict
65 the number of bits for character classes to 16. When compatibility
66 is not necessary anymore increase the number to 32. */
67 #define char_class_t uint16_t
68 #define char_class32_t uint32_t
71 /* Type to describe a transliteration action. We have a possibly
72 multiple character from-string and a set of multiple character
73 to-strings. All are 32bit values since this is what is used in
74 the gconv functions. */
79 struct translit_to_t
*next
;
86 struct translit_to_t
*to
;
88 struct translit_t
*next
;
92 /* The real definition of the struct for the LC_CTYPE locale. */
99 struct repertoire_t
*repertoire
;
101 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
102 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
104 const char *classnames
[MAX_NR_CHARCLASS
];
105 uint32_t last_class_char
;
106 uint32_t class256_collection
[256];
107 uint32_t *class_collection
;
108 size_t class_collection_max
;
109 size_t class_collection_act
;
112 struct charseq
**mbdigits
;
119 struct charseq
*mboutdigits
[10];
120 uint32_t wcoutdigits
[10];
121 size_t outdigits_act
;
123 /* If the following number ever turns out to be too small simply
124 increase it. But I doubt it will. --drepper@gnu */
125 #define MAX_NR_CHARMAP 16
126 const char *mapnames
[MAX_NR_CHARMAP
];
127 uint32_t *map_collection
[MAX_NR_CHARMAP
];
128 uint32_t map256_collection
[2][256];
129 size_t map_collection_max
[MAX_NR_CHARMAP
];
130 size_t map_collection_act
[MAX_NR_CHARMAP
];
131 size_t map_collection_nr
;
133 int tomap_done
[MAX_NR_CHARMAP
];
135 /* Transliteration information. */
136 const char *translit_copy_locale
;
137 const char *translit_copy_repertoire
;
138 struct translit_t
*translit
;
140 /* The arrays for the binary representation. */
143 char_class_t
*ctype_b
;
144 char_class32_t
*ctype32_b
;
147 uint32_t *class_name_ptr
;
148 uint32_t *map_name_ptr
;
149 unsigned char *width
;
151 const char *codeset_name
;
152 uint32_t translit_hash_size
;
153 uint32_t translit_hash_layers
;
154 uint32_t *translit_from_idx
;
155 uint32_t *translit_from_tbl
;
156 uint32_t *translit_to_idx
;
157 uint32_t *translit_to_tbl
;
158 size_t translit_idx_size
;
159 size_t translit_from_tbl_size
;
160 size_t translit_to_tbl_size
;
162 struct obstack mem_pool
;
166 #define obstack_chunk_alloc xmalloc
167 #define obstack_chunk_free free
170 /* Prototypes for local functions. */
/* Forward declarations of the file-local helpers.  The parameter lists
   mirror the definitions further down in this file; in particular
   `find_idx' takes the character value as `uint32_t', exactly as its
   definition does.  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
                           struct charmap_t *charmap, int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype, const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name, struct charmap_t *charmap);
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
                           size_t *max, size_t *act, uint32_t idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             struct charmap_t *charmap,
                             struct repertoire_t *repertoire);
/* English names of the ten decimal digits, indexed by digit value.  */
static const char *const longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The ASCII decimal digits, indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
197 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
198 struct charmap_t
*charmap
, int ignore_content
)
201 struct locale_ctype_t
*ctype
;
205 /* Allocate the needed room. */
206 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
207 (struct locale_ctype_t
*) xcalloc (1, sizeof (struct locale_ctype_t
));
209 /* We have seen no names yet. */
210 ctype
->charnames_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
212 (unsigned int *) xmalloc (ctype
->charnames_max
213 * sizeof (unsigned int));
214 for (cnt
= 0; cnt
< 256; ++cnt
)
215 ctype
->charnames
[cnt
] = cnt
;
216 ctype
->charnames_act
= 256;
218 /* Fill character class information. */
219 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
220 /* The order of the following instructions determines the bit
222 ctype_class_new (lr
, ctype
, "upper");
223 ctype_class_new (lr
, ctype
, "lower");
224 ctype_class_new (lr
, ctype
, "alpha");
225 ctype_class_new (lr
, ctype
, "digit");
226 ctype_class_new (lr
, ctype
, "xdigit");
227 ctype_class_new (lr
, ctype
, "space");
228 ctype_class_new (lr
, ctype
, "print");
229 ctype_class_new (lr
, ctype
, "graph");
230 ctype_class_new (lr
, ctype
, "blank");
231 ctype_class_new (lr
, ctype
, "cntrl");
232 ctype_class_new (lr
, ctype
, "punct");
233 ctype_class_new (lr
, ctype
, "alnum");
234 /* The following are extensions from ISO 14652. */
235 ctype_class_new (lr
, ctype
, "left_to_right");
236 ctype_class_new (lr
, ctype
, "right_to_left");
237 ctype_class_new (lr
, ctype
, "num_terminator");
238 ctype_class_new (lr
, ctype
, "num_separator");
239 ctype_class_new (lr
, ctype
, "segment_separator");
240 ctype_class_new (lr
, ctype
, "block_separator");
241 ctype_class_new (lr
, ctype
, "direction_control");
242 ctype_class_new (lr
, ctype
, "sym_swap_layout");
243 ctype_class_new (lr
, ctype
, "char_shape_selector");
244 ctype_class_new (lr
, ctype
, "num_shape_selector");
245 ctype_class_new (lr
, ctype
, "non_spacing");
246 ctype_class_new (lr
, ctype
, "non_spacing_level3");
247 ctype_class_new (lr
, ctype
, "normal_connect");
248 ctype_class_new (lr
, ctype
, "r_connect");
249 ctype_class_new (lr
, ctype
, "no_connect");
250 ctype_class_new (lr
, ctype
, "no_connect-space");
251 ctype_class_new (lr
, ctype
, "vowel_connect");
253 ctype
->class_collection_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
254 ctype
->class_collection
255 = (uint32_t *) xcalloc (sizeof (unsigned long int),
256 ctype
->class_collection_max
);
257 ctype
->class_collection_act
= 256;
259 /* Fill character map information. */
260 ctype
->map_collection_nr
= 0;
261 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
262 ctype_map_new (lr
, ctype
, "toupper", charmap
);
263 ctype_map_new (lr
, ctype
, "tolower", charmap
);
264 ctype_map_new (lr
, ctype
, "tosymmetric", charmap
);
266 /* Fill first 256 entries in `toXXX' arrays. */
267 for (cnt
= 0; cnt
< 256; ++cnt
)
269 ctype
->map_collection
[0][cnt
] = cnt
;
270 ctype
->map_collection
[1][cnt
] = cnt
;
271 ctype
->map_collection
[2][cnt
] = cnt
;
272 ctype
->map256_collection
[0][cnt
] = cnt
;
273 ctype
->map256_collection
[1][cnt
] = cnt
;
276 obstack_init (&ctype
->mem_pool
);
282 ctype_finish (struct localedef_t
*locale
, struct charmap_t
*charmap
)
284 /* See POSIX.2, table 2-6 for the meaning of the following table. */
289 const char allow
[NCLASS
];
291 valid_table
[NCLASS
] =
293 /* The order is important. See token.h for more information.
294 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
295 { "upper", "--MX-XDDXXX-" },
296 { "lower", "--MX-XDDXXX-" },
297 { "alpha", "---X-XDDXXX-" },
298 { "digit", "XXX--XDDXXX-" },
299 { "xdigit", "-----XDDXXX-" },
300 { "space", "XXXXX------X" },
301 { "print", "---------X--" },
302 { "graph", "---------X--" },
303 { "blank", "XXXXXM-----X" },
304 { "cntrl", "XXXXX-XX--XX" },
305 { "punct", "XXXXX-DD-X-X" },
306 { "alnum", "-----XDDXXX-" }
310 uint32_t space_value
;
311 struct charseq
*space_seq
;
312 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
315 /* Now resolve copying and also handle completely missing definitions. */
318 /* First see whether we were supposed to copy. If yes, find the
319 actual definition. */
320 if (locale
->copy_name
[LC_CTYPE
] != NULL
)
322 /* Find the copying locale. This has to happen transitively since
323 the locale we are copying from might also copying another one. */
324 struct localedef_t
*from
= locale
;
327 from
= find_locale (LC_CTYPE
, from
->copy_name
[LC_CTYPE
],
328 from
->repertoire_name
, charmap
);
329 while (from
->categories
[LC_CTYPE
].ctype
== NULL
330 && from
->copy_name
[LC_CTYPE
] != NULL
);
332 ctype
= locale
->categories
[LC_CTYPE
].ctype
333 = from
->categories
[LC_CTYPE
].ctype
;
336 /* If there is still no definition issue an warning and create an
340 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
341 ctype_startup (NULL
, locale
, charmap
, 0);
342 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
346 /* Set default value for classes not specified. */
347 set_class_defaults (ctype
, charmap
, ctype
->repertoire
);
349 /* Check according to table. */
350 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
352 uint32_t tmp
= ctype
->class_collection
[cnt
];
356 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
357 if ((tmp
& _ISwbit (cls1
)) != 0)
358 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
359 if (valid_table
[cls1
].allow
[cls2
] != '-')
361 int eq
= (tmp
& _ISwbit (cls2
)) != 0;
362 switch (valid_table
[cls1
].allow
[cls2
])
367 uint32_t value
= ctype
->charnames
[cnt
];
371 character L'\\u%0*x' in class `%s' must be in class `%s'"),
372 value
> 0xffff ? 8 : 4, value
,
373 valid_table
[cls1
].name
,
374 valid_table
[cls2
].name
);
381 uint32_t value
= ctype
->charnames
[cnt
];
385 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
386 value
> 0xffff ? 8 : 4, value
,
387 valid_table
[cls1
].name
,
388 valid_table
[cls2
].name
);
393 ctype
->class_collection
[cnt
] |= _ISwbit (cls2
);
397 error (5, 0, _("internal error in %s, line %u"),
398 __FUNCTION__
, __LINE__
);
404 for (cnt
= 0; cnt
< 256; ++cnt
)
406 uint32_t tmp
= ctype
->class256_collection
[cnt
];
410 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
411 if ((tmp
& _ISbit (cls1
)) != 0)
412 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
413 if (valid_table
[cls1
].allow
[cls2
] != '-')
415 int eq
= (tmp
& _ISbit (cls2
)) != 0;
416 switch (valid_table
[cls1
].allow
[cls2
])
423 sprintf (buf
, "\\%o", cnt
);
427 character '%s' in class `%s' must be in class `%s'"),
428 buf
, valid_table
[cls1
].name
,
429 valid_table
[cls2
].name
);
438 sprintf (buf
, "\\%o", cnt
);
442 character '%s' in class `%s' must not be in class `%s'"),
443 buf
, valid_table
[cls1
].name
,
444 valid_table
[cls2
].name
);
449 ctype
->class256_collection
[cnt
] |= _ISbit (cls2
);
453 error (5, 0, _("internal error in %s, line %u"),
454 __FUNCTION__
, __LINE__
);
460 /* ... and now test <SP> as a special case. */
461 space_value
= repertoire_find_value (ctype
->repertoire
, "SP", 2);
462 if (space_value
== ILLEGAL_CHAR_VALUE
)
465 error (0, 0, _("character <SP> not defined in character map"));
467 else if (((cnt
= BITPOS (tok_space
),
468 (ELEM (ctype
, class_collection
, , space_value
)
469 & BITw (tok_space
)) == 0)
470 || (cnt
= BITPOS (tok_blank
),
471 (ELEM (ctype
, class_collection
, , space_value
)
472 & BITw (tok_blank
)) == 0)))
475 error (0, 0, _("<SP> character not in class `%s'"),
476 valid_table
[cnt
].name
);
478 else if (((cnt
= BITPOS (tok_punct
),
479 (ELEM (ctype
, class_collection
, , space_value
)
480 & BITw (tok_punct
)) != 0)
481 || (cnt
= BITPOS (tok_graph
),
482 (ELEM (ctype
, class_collection
, , space_value
)
487 error (0, 0, _("<SP> character must not be in class `%s'"),
488 valid_table
[cnt
].name
);
491 ELEM (ctype
, class_collection
, , space_value
) |= BITw (tok_print
);
493 space_seq
= charmap_find_value (charmap
, "SP", 2);
494 if (space_seq
== NULL
|| space_seq
->nbytes
!= 1)
497 error (0, 0, _("character <SP> not defined in character map"));
499 else if (((cnt
= BITPOS (tok_space
),
500 (ctype
->class256_collection
[space_seq
->bytes
[0]]
501 & BIT (tok_space
)) == 0)
502 || (cnt
= BITPOS (tok_blank
),
503 (ctype
->class256_collection
[space_seq
->bytes
[0]]
504 & BIT (tok_blank
)) == 0)))
507 error (0, 0, _("<SP> character not in class `%s'"),
508 valid_table
[cnt
].name
);
510 else if (((cnt
= BITPOS (tok_punct
),
511 (ctype
->class256_collection
[space_seq
->bytes
[0]]
512 & BIT (tok_punct
)) != 0)
513 || (cnt
= BITPOS (tok_graph
),
514 (ctype
->class256_collection
[space_seq
->bytes
[0]]
515 & BIT (tok_graph
)) != 0)))
518 error (0, 0, _("<SP> character must not be in class `%s'"),
519 valid_table
[cnt
].name
);
522 ctype
->class256_collection
[space_seq
->bytes
[0]] |= BIT (tok_print
);
524 /* Now that the tests are done make sure the name array contains all
525 characters which are handled in the WIDTH section of the
526 character set definition file. */
527 if (charmap
->width_rules
!= NULL
)
528 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
530 unsigned char bytes
[charmap
->mb_cur_max
];
531 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
533 /* We have the range of character for which the width is
534 specified described using byte sequences of the multibyte
535 charset. We have to convert this to UCS4 now. And we
536 cannot simply convert the beginning and the end of the
537 sequence, we have to iterate over the byte sequence and
538 convert it for every single character. */
539 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
541 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
542 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
545 /* Find the UCS value for `bytes'. */
546 uint32_t wch
= repertoire_find_value (ctype
->repertoire
, bytes
,
550 if (wch
!= ILLEGAL_CHAR_VALUE
)
551 /* We are only interested in the side-effects of the
552 `find_idx' call. It will add appropriate entries in
553 the name array if this is necessary. */
554 (void) find_idx (ctype
, NULL
, NULL
, NULL
, wch
);
556 /* "Increment" the bytes sequence. */
558 while (inner
>= 0 && bytes
[inner
] == 0xff)
563 /* We have to extend the byte sequence. */
564 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
568 memset (&bytes
[1], 0, nbytes
);
574 while (++inner
< nbytes
)
580 /* There must be a multiple of 10 digits. */
581 if (ctype
->mbdigits_act
% 10 != 0)
583 assert (ctype
->mbdigits_act
== ctype
->wcdigits_act
);
584 ctype
->wcdigits_act
-= ctype
->mbdigits_act
% 10;
585 ctype
->mbdigits_act
-= ctype
->mbdigits_act
% 10;
586 error (0, 0, _("`digit' category has not entries in groups of ten"));
589 /* Check the input digits. There must be a multiple of ten available.
590 In each group it could be that one or the other character is missing.
591 In this case the whole group must be removed. */
593 while (cnt
< ctype
->mbdigits_act
)
596 for (inner
= 0; inner
< 10; ++inner
)
597 if (ctype
->mbdigits
[cnt
+ inner
] == NULL
)
604 /* Remove the group. */
605 memmove (&ctype
->mbdigits
[cnt
], &ctype
->mbdigits
[cnt
+ 10],
606 ((ctype
->wcdigits_act
- cnt
- 10)
607 * sizeof (ctype
->mbdigits
[0])));
608 ctype
->mbdigits_act
-= 10;
612 /* If no input digits are given use the default. */
613 if (ctype
->mbdigits_act
== 0)
615 if (ctype
->mbdigits_max
== 0)
617 ctype
->mbdigits
= obstack_alloc (&charmap
->mem_pool
,
618 10 * sizeof (struct charseq
*));
619 ctype
->mbdigits_max
= 10;
622 for (cnt
= 0; cnt
< 10; ++cnt
)
624 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
626 if (ctype
->mbdigits
[cnt
] == NULL
)
628 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
630 strlen (longnames
[cnt
]));
631 if (ctype
->mbdigits
[cnt
] == NULL
)
633 /* Hum, this ain't good. */
635 no input digits defined and none of the standard names in the charmap"));
637 ctype
->mbdigits
[cnt
] = obstack_alloc (&charmap
->mem_pool
,
638 sizeof (struct charseq
) + 1);
640 /* This is better than nothing. */
641 ctype
->mbdigits
[cnt
]->bytes
[0] = digits
[cnt
];
642 ctype
->mbdigits
[cnt
]->nbytes
= 1;
647 ctype
->mbdigits_act
= 10;
650 /* Check the wide character input digits. There must be a multiple
651 of ten available. In each group it could be that one or the other
652 character is missing. In this case the whole group must be
655 while (cnt
< ctype
->wcdigits_act
)
658 for (inner
= 0; inner
< 10; ++inner
)
659 if (ctype
->wcdigits
[cnt
+ inner
] == ILLEGAL_CHAR_VALUE
)
666 /* Remove the group. */
667 memmove (&ctype
->wcdigits
[cnt
], &ctype
->wcdigits
[cnt
+ 10],
668 ((ctype
->wcdigits_act
- cnt
- 10)
669 * sizeof (ctype
->wcdigits
[0])));
670 ctype
->wcdigits_act
-= 10;
674 /* If no input digits are given use the default. */
675 if (ctype
->wcdigits_act
== 0)
677 if (ctype
->wcdigits_max
== 0)
679 ctype
->wcdigits
= obstack_alloc (&charmap
->mem_pool
,
680 10 * sizeof (uint32_t));
681 ctype
->wcdigits_max
= 10;
684 for (cnt
= 0; cnt
< 10; ++cnt
)
685 ctype
->wcdigits
[cnt
] = L
'0' + cnt
;
687 ctype
->mbdigits_act
= 10;
690 /* Check the outdigits. */
692 for (cnt
= 0; cnt
< 10; ++cnt
)
693 if (ctype
->mboutdigits
[cnt
] == NULL
)
695 static struct charseq replace
[2];
700 not all characters used in `outdigit' are available in the charmap"));
704 replace
[0].nbytes
= 1;
705 replace
[0].bytes
[0] = '?';
706 replace
[0].bytes
[1] = '\0';
707 ctype
->mboutdigits
[cnt
] = &replace
[0];
711 for (cnt
= 0; cnt
< 10; ++cnt
)
712 if (ctype
->wcoutdigits
[cnt
] == 0)
717 not all characters used in `outdigit' are available in the repertoire"));
721 ctype
->wcoutdigits
[cnt
] = L
'?';
727 ctype_output (struct localedef_t
*locale
, struct charmap_t
*charmap
,
728 const char *output_path
)
730 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
731 const size_t nelems
= (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)
732 + (ctype
->map_collection_nr
- 2));
733 struct iovec iov
[2 + nelems
+ ctype
->nr_charclass
734 + ctype
->map_collection_nr
];
735 struct locale_file data
;
736 uint32_t idx
[nelems
+ 1];
737 size_t elem
, cnt
, offset
, total
;
740 /* Now prepare the output: Find the sizes of the table we can use. */
741 allocate_arrays (ctype
, charmap
, ctype
->repertoire
);
743 data
.magic
= LIMAGIC (LC_CTYPE
);
745 iov
[0].iov_base
= (void *) &data
;
746 iov
[0].iov_len
= sizeof (data
);
748 iov
[1].iov_base
= (void *) idx
;
749 iov
[1].iov_len
= sizeof (idx
);
751 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
754 for (elem
= 0; elem
< nelems
; ++elem
)
756 if (elem
< _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
))
759 #define CTYPE_DATA(name, base, len) \
760 case _NL_ITEM_INDEX (name): \
761 iov[2 + elem + offset].iov_base = (base); \
762 iov[2 + elem + offset].iov_len = (len); \
763 if (elem + 1 < nelems) \
764 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
767 CTYPE_DATA (_NL_CTYPE_CLASS
,
769 (256 + 128) * sizeof (char_class_t
));
771 CTYPE_DATA (_NL_CTYPE_TOUPPER
,
773 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
774 * sizeof (uint32_t));
775 CTYPE_DATA (_NL_CTYPE_TOLOWER
,
777 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
778 * sizeof (uint32_t));
780 CTYPE_DATA (_NL_CTYPE_CLASS32
,
782 (ctype
->plane_size
* ctype
->plane_cnt
783 * sizeof (char_class32_t
)));
785 CTYPE_DATA (_NL_CTYPE_NAMES
,
786 ctype
->names
, (ctype
->plane_size
* ctype
->plane_cnt
787 * sizeof (uint32_t)));
789 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE
,
790 &ctype
->translit_hash_size
, sizeof (uint32_t));
791 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS
,
792 &ctype
->translit_hash_layers
, sizeof (uint32_t));
794 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX
,
795 ctype
->translit_from_idx
,
796 ctype
->translit_idx_size
);
798 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL
,
799 ctype
->translit_from_tbl
,
800 ctype
->translit_from_tbl_size
);
802 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX
,
803 ctype
->translit_to_idx
,
804 ctype
->translit_idx_size
);
806 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL
,
807 ctype
->translit_to_tbl
, ctype
->translit_to_tbl_size
);
809 CTYPE_DATA (_NL_CTYPE_HASH_SIZE
,
810 &ctype
->plane_size
, sizeof (uint32_t));
811 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS
,
812 &ctype
->plane_cnt
, sizeof (uint32_t));
814 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
815 /* The class name array. */
817 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
819 iov
[2 + elem
+ offset
].iov_base
820 = (void *) ctype
->classnames
[cnt
];
821 iov
[2 + elem
+ offset
].iov_len
822 = strlen (ctype
->classnames
[cnt
]) + 1;
823 total
+= iov
[2 + elem
+ offset
].iov_len
;
825 iov
[2 + elem
+ offset
].iov_base
= (void *) "\0\0\0";
826 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
827 total
+= 1 + (4 - ((total
+ 1) % 4));
829 idx
[elem
+ 1] = idx
[elem
] + total
;
832 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
833 /* The class name array. */
835 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
837 iov
[2 + elem
+ offset
].iov_base
838 = (void *) ctype
->mapnames
[cnt
];
839 iov
[2 + elem
+ offset
].iov_len
840 = strlen (ctype
->mapnames
[cnt
]) + 1;
841 total
+= iov
[2 + elem
+ offset
].iov_len
;
843 iov
[2 + elem
+ offset
].iov_base
= (void *) "\0\0\0";
844 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
845 total
+= 1 + (4 - ((total
+ 1) % 4));
847 idx
[elem
+ 1] = idx
[elem
] + total
;
850 CTYPE_DATA (_NL_CTYPE_WIDTH
,
851 ctype
->width
, ctype
->plane_size
* ctype
->plane_cnt
);
853 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
854 &ctype
->mb_cur_max
, sizeof (uint32_t));
856 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
857 total
= strlen (ctype
->codeset_name
) + 1;
859 iov
[2 + elem
+ offset
].iov_base
= (char *) ctype
->codeset_name
;
862 iov
[2 + elem
+ offset
].iov_base
= alloca ((total
+ 3) & ~3);
863 memset (mempcpy (iov
[2 + elem
+ offset
].iov_base
,
864 ctype
->codeset_name
, total
),
865 '\0', 4 - (total
& 3));
866 total
= (total
+ 3) & ~3;
868 iov
[2 + elem
+ offset
].iov_len
= total
;
869 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
872 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN
):
873 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
874 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
875 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
876 ctype
->mbdigits_act
/ 10;
877 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
880 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN
):
881 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
882 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
883 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
884 ctype
->wcdigits_act
/ 10;
885 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
888 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB
):
889 /* Compute the length of all possible characters. For INDIGITS
890 there might be more than one. We simply concatenate all of
891 them with a NUL byte following. The NUL byte wouldn't be
892 necessary but it makes it easier for the user. */
894 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_MB
;
895 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
896 total
+= ctype
->mbdigits
[cnt
]->nbytes
+ 1;
897 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
898 iov
[2 + elem
+ offset
].iov_len
= total
;
900 cp
= iov
[2 + elem
+ offset
].iov_base
;
901 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_MB
;
902 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
904 cp
= mempcpy (cp
, ctype
->mbdigits
[cnt
]->bytes
,
905 ctype
->mbdigits
[cnt
]->nbytes
);
908 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
911 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB
):
912 /* Compute the length of all possible characters. For INDIGITS
913 there might be more than one. We simply concatenate all of
914 them with a NUL byte following. The NUL byte wouldn't be
915 necessary but it makes it easier for the user. */
916 cnt
= elem
- _NL_CTYPE_OUTDIGIT0_MB
;
917 total
= ctype
->mboutdigits
[cnt
]->nbytes
+ 1;
918 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
919 iov
[2 + elem
+ offset
].iov_len
= total
;
921 *(char *) mempcpy (iov
[2 + elem
+ offset
].iov_base
,
922 ctype
->mbdigits
[cnt
]->bytes
,
923 ctype
->mbdigits
[cnt
]->nbytes
) = '\0';
924 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
927 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC
):
928 total
= ctype
->wcdigits_act
/ 10;
930 iov
[2 + elem
+ offset
].iov_base
=
931 (uint32_t *) alloca (total
* sizeof (uint32_t));
932 iov
[2 + elem
+ offset
].iov_len
= total
* sizeof (uint32_t);
934 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_WC
;
935 cnt
< ctype
->wcdigits_act
; cnt
+= 10)
936 ((uint32_t *) iov
[2 + elem
+ offset
].iov_base
)[cnt
/ 10]
937 = ctype
->wcdigits
[cnt
];
938 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
941 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC
):
942 cnt
= elem
- _NL_CTYPE_OUTDIGIT0_WC
;
943 iov
[2 + elem
+ offset
].iov_base
= &ctype
->wcoutdigits
[cnt
];
944 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
945 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
949 assert (! "unknown CTYPE element");
953 /* Handle extra maps. */
954 size_t nr
= (elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) + 2;
956 iov
[2 + elem
+ offset
].iov_base
= ctype
->map
[nr
];
957 iov
[2 + elem
+ offset
].iov_len
= ((ctype
->plane_size
958 * ctype
->plane_cnt
+ 128)
959 * sizeof (uint32_t));
961 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
965 assert (2 + elem
+ offset
== (nelems
+ ctype
->nr_charclass
966 + ctype
->map_collection_nr
+ 2));
968 write_locale_data (output_path
, "LC_CTYPE", 2 + elem
+ offset
, iov
);
972 /* Local functions. */
974 ctype_class_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
979 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
980 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
983 if (cnt
< ctype
->nr_charclass
)
985 lr_error (lr
, _("character class `%s' already defined"), name
);
989 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
990 /* Exit code 2 is prescribed in P1003.2b. */
992 implementation limit: no more than %d character classes allowed"),
995 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
1000 ctype_map_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1001 const char *name
, struct charmap_t
*charmap
)
1003 size_t max_chars
= 0;
1006 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1008 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
1011 if (max_chars
< ctype
->map_collection_max
[cnt
])
1012 max_chars
= ctype
->map_collection_max
[cnt
];
1015 if (cnt
< ctype
->map_collection_nr
)
1017 lr_error (lr
, _("character map `%s' already defined"), name
);
1021 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
1022 /* Exit code 2 is prescribed in P1003.2b. */
1024 implementation limit: no more than %d character maps allowed"),
1027 ctype
->mapnames
[cnt
] = name
;
1030 ctype
->map_collection_max
[cnt
] = charmap
->mb_cur_max
== 1 ? 256 : 512;
1032 ctype
->map_collection_max
[cnt
] = max_chars
;
1034 ctype
->map_collection
[cnt
] = (uint32_t *)
1035 xmalloc (sizeof (uint32_t) * ctype
->map_collection_max
[cnt
]);
1036 memset (ctype
->map_collection
[cnt
], '\0',
1037 sizeof (uint32_t) * ctype
->map_collection_max
[cnt
]);
1038 ctype
->map_collection_act
[cnt
] = 256;
1040 ++ctype
->map_collection_nr
;
1044 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1045 is possible if we only want to extend the name array. */
1047 find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
, size_t *max
,
1048 size_t *act
, uint32_t idx
)
1053 return table
== NULL
? NULL
: &(*table
)[idx
];
1055 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
1056 if (ctype
->charnames
[cnt
] == idx
)
1059 /* We have to distinguish two cases: the name is found or not. */
1060 if (cnt
== ctype
->charnames_act
)
1062 /* Extend the name array. */
1063 if (ctype
->charnames_act
== ctype
->charnames_max
)
1065 ctype
->charnames_max
*= 2;
1066 ctype
->charnames
= (unsigned int *)
1067 xrealloc (ctype
->charnames
,
1068 sizeof (unsigned int) * ctype
->charnames_max
);
1070 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
1074 /* We have done everything we are asked to do. */
1081 size_t old_max
= *max
;
1084 while (*max
<= cnt
);
1087 (uint32_t *) xrealloc (*table
, *max
* sizeof (unsigned long int));
1088 memset (&(*table
)[old_max
], '\0',
1089 (*max
- old_max
) * sizeof (uint32_t));
1095 return &(*table
)[cnt
];
1100 get_character (struct token
*now
, struct charmap_t
*charmap
,
1101 struct repertoire_t
*repertoire
,
1102 struct charseq
**seqp
, uint32_t *wchp
)
1104 if (now
->tok
== tok_bsymbol
)
1106 /* This will hopefully be the normal case. */
1107 *wchp
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1108 now
->val
.str
.lenmb
);
1109 *seqp
= charmap_find_value (charmap
, now
->val
.str
.startmb
,
1110 now
->val
.str
.lenmb
);
1112 else if (now
->tok
== tok_ucs4
)
1114 *seqp
= repertoire_find_seq (repertoire
, now
->val
.ucs4
);
1118 /* Compute the value in the charmap from the UCS value. */
1119 const char *symbol
= repertoire_find_symbol (repertoire
,
1125 *seqp
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1129 /* Insert a negative entry. */
1130 static const struct charseq negative
1131 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1132 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
, 4);
1133 *newp
= now
->val
.ucs4
;
1135 insert_entry (&repertoire
->seq_table
, newp
, 4,
1136 (void *) &negative
);
1139 (*seqp
)->ucs4
= now
->val
.ucs4
;
1141 else if ((*seqp
)->ucs4
!= now
->val
.ucs4
)
1144 *wchp
= now
->val
.ucs4
;
1146 else if (now
->tok
== tok_charcode
)
1148 /* We must map from the byte code to UCS4. */
1149 *seqp
= charmap_find_symbol (charmap
, now
->val
.str
.startmb
,
1150 now
->val
.str
.lenmb
);
1153 *wchp
= ILLEGAL_CHAR_VALUE
;
1156 if ((*seqp
)->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1157 (*seqp
)->ucs4
= repertoire_find_value (repertoire
, (*seqp
)->name
,
1158 strlen ((*seqp
)->name
));
1159 *wchp
= (*seqp
)->ucs4
;
1169 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1171 charclass_symbolic_ellipsis (struct linereader
*ldfile
,
1172 struct locale_ctype_t
*ctype
,
1173 struct charmap_t
*charmap
,
1174 struct repertoire_t
*repertoire
,
1176 const char *last_str
,
1177 unsigned long int class256_bit
,
1178 unsigned long int class_bit
, int base
,
1179 int ignore_content
, int handle_digits
)
1181 const char *nowstr
= now
->val
.str
.startmb
;
1182 char tmp
[now
->val
.str
.lenmb
+ 1];
1185 unsigned long int from
;
1186 unsigned long int to
;
1188 /* We have to compute the ellipsis values using the symbolic names. */
1189 assert (last_str
!= NULL
);
1191 if (strlen (last_str
) != now
->val
.str
.lenmb
)
1195 _("`%s' and `%.*s' are no valid names for symbolic range"),
1196 last_str
, now
->val
.str
.lenmb
, nowstr
);
1200 if (memcmp (last_str
, nowstr
, now
->val
.str
.lenmb
) == 0)
1201 /* Nothing to do, the names are the same. */
1204 for (cp
= last_str
; *cp
== *(nowstr
+ (cp
- last_str
)); ++cp
)
1208 from
= strtoul (cp
, &endp
, base
);
1209 if ((from
== UINT_MAX
&& errno
== ERANGE
) || *endp
!= '\0')
1212 to
= strtoul (nowstr
+ (cp
- last_str
), &endp
, base
);
1213 if ((to
== UINT_MAX
&& errno
== ERANGE
)
1214 || (endp
- nowstr
) != now
->val
.str
.lenmb
|| from
>= to
)
1217 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1218 if (!ignore_content
)
1220 now
->val
.str
.startmb
= tmp
;
1221 while (++from
<= to
)
1223 struct charseq
*seq
;
1226 sprintf (tmp
, (base
== 10 ? "%.*s%0*d" : "%.*s%0*X"), cp
- last_str
,
1227 last_str
, now
->val
.str
.lenmb
- (cp
- last_str
), from
);
1229 get_character (now
, charmap
, repertoire
, &seq
, &wch
);
1231 if (seq
!= NULL
&& seq
->nbytes
== 1)
1232 /* Yep, we can store information about this byte sequence. */
1233 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1235 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1236 /* We have the UCS4 position. */
1237 *find_idx (ctype
, &ctype
->class_collection
,
1238 &ctype
->class_collection_max
,
1239 &ctype
->class_collection_act
, wch
) |= class_bit
;
1241 if (handle_digits
== 1)
1243 /* We must store the digit values. */
1244 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1246 ctype
->mbdigits_max
*= 2;
1247 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1248 (ctype
->mbdigits_max
1249 * sizeof (char *)));
1250 ctype
->wcdigits_max
*= 2;
1251 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1252 (ctype
->wcdigits_max
1253 * sizeof (uint32_t)));
1256 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1257 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1259 else if (handle_digits
== 2)
1261 /* We must store the digit values. */
1262 if (ctype
->outdigits_act
>= 10)
1264 lr_error (ldfile
, _("\
1265 %s: field `%s' does not contain exactly ten entries"),
1266 "LC_CTYPE", "outdigit");
1270 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1271 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1272 ++ctype
->outdigits_act
;
1279 /* Ellipsis like in `<U1234>..<U2345>'. */
1281 charclass_ucs4_ellipsis (struct linereader
*ldfile
,
1282 struct locale_ctype_t
*ctype
,
1283 struct charmap_t
*charmap
,
1284 struct repertoire_t
*repertoire
,
1285 struct token
*now
, uint32_t last_wch
,
1286 unsigned long int class256_bit
,
1287 unsigned long int class_bit
, int ignore_content
,
1290 if (last_wch
> now
->val
.ucs4
)
1292 lr_error (ldfile
, _("\
1293 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1294 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, now
->val
.ucs4
,
1295 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, last_wch
);
1299 if (!ignore_content
)
1300 while (++last_wch
<= now
->val
.ucs4
)
1302 /* We have to find out whether there is a byte sequence corresponding
1303 to this UCS4 value. */
1304 struct charseq
*seq
= repertoire_find_seq (repertoire
, last_wch
);
1306 /* If this is the first time we look for this sequence, create a new entry. */
1310 /* Find the symbolic name for this UCS4 value. */
1311 const char *symbol
= repertoire_find_symbol (repertoire
, last_wch
);
1312 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
, 4);
1316 /* We have a name, now search the multibyte value. */
1317 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1321 /* We have to create a fake entry. */
1322 static const struct charseq negative
1323 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1324 seq
= (struct charseq
*) &negative
;
1327 seq
->ucs4
= last_wch
;
1329 insert_entry (&repertoire
->seq_table
, newp
, 4, seq
);
1332 /* Only a single-byte sequence that maps to exactly this UCS4 value can be recorded in the 256-entry table. */
1333 if (seq
->ucs4
== last_wch
&& seq
->nbytes
== 1)
1334 /* Yep, we can store information about this byte sequence. */
1335 ctype
->class256_collection
[(size_t) seq
->bytes
[0]]
1338 /* And of course we have the UCS4 position. */
1339 if (class_bit
!= 0 && class_bit
!= 0)
1340 *find_idx (ctype
, &ctype
->class_collection
,
1341 &ctype
->class_collection_max
,
1342 &ctype
->class_collection_act
, last_wch
) |= class_bit
;
1344 if (handle_digits
== 1)
1346 /* We must store the digit values. */
1347 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1349 ctype
->mbdigits_max
*= 2;
1350 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1351 (ctype
->mbdigits_max
1352 * sizeof (char *)));
1353 ctype
->wcdigits_max
*= 2;
1354 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1355 (ctype
->wcdigits_max
1356 * sizeof (uint32_t)));
1359 ctype
->mbdigits
[ctype
->mbdigits_act
++] = (seq
->ucs4
== last_wch
1361 ctype
->wcdigits
[ctype
->wcdigits_act
++] = last_wch
;
1363 else if (handle_digits
== 2)
1365 /* We must store the digit values. */
1366 if (ctype
->outdigits_act
>= 10)
1368 lr_error (ldfile
, _("\
1369 %s: field `%s' does not contain exactly ten entries"),
1370 "LC_CTYPE", "outdigit");
1374 ctype
->mboutdigits
[ctype
->outdigits_act
] = (seq
->ucs4
== last_wch
1376 ctype
->wcoutdigits
[ctype
->outdigits_act
] = last_wch
;
1377 ++ctype
->outdigits_act
;
1383 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1385 charclass_charcode_ellipsis (struct linereader
*ldfile
,
1386 struct locale_ctype_t
*ctype
,
1387 struct charmap_t
*charmap
,
1388 struct repertoire_t
*repertoire
,
1389 struct token
*now
, char *last_charcode
,
1390 uint32_t last_charcode_len
,
1391 unsigned long int class256_bit
,
1392 unsigned long int class_bit
, int ignore_content
,
1395 /* First check whether the to-value is larger. */
1396 if (now
->val
.charcode
.nbytes
!= last_charcode_len
)
1398 lr_error (ldfile
, _("\
1399 start end end character sequence of range must have the same length"));
1403 if (memcmp (last_charcode
, now
->val
.charcode
.bytes
, last_charcode_len
) > 0)
1405 lr_error (ldfile
, _("\
1406 to-value character sequence is smaller than from-value sequence"));
1410 if (!ignore_content
)
1414 /* Increment the byte sequence value. */
1415 struct charseq
*seq
;
1419 for (i
= last_charcode_len
- 1; i
>= 0; --i
)
1420 if (++last_charcode
[i
] != 0)
1423 if (last_charcode_len
== 1)
1424 /* Of course we have the charcode value. */
1425 ctype
->class256_collection
[(size_t) last_charcode
[0]]
1428 /* Find the symbolic name. */
1429 seq
= charmap_find_symbol (charmap
, last_charcode
,
1433 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1434 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1435 strlen (seq
->name
));
1438 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1439 *find_idx (ctype
, &ctype
->class_collection
,
1440 &ctype
->class_collection_max
,
1441 &ctype
->class_collection_act
, wch
) |= class_bit
;
1444 wch
= ILLEGAL_CHAR_VALUE
;
1446 if (handle_digits
== 1)
1448 /* We must store the digit values. */
1449 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1451 ctype
->mbdigits_max
*= 2;
1452 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1453 (ctype
->mbdigits_max
1454 * sizeof (char *)));
1455 ctype
->wcdigits_max
*= 2;
1456 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1457 (ctype
->wcdigits_max
1458 * sizeof (uint32_t)));
1461 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1462 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1463 seq
->nbytes
= last_charcode_len
;
1465 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1466 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1468 else if (handle_digits
== 2)
1470 struct charseq
*seq
;
1471 /* We must store the digit values. */
1472 if (ctype
->outdigits_act
>= 10)
1474 lr_error (ldfile
, _("\
1475 %s: field `%s' does not contain exactly ten entries"),
1476 "LC_CTYPE", "outdigit");
1480 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1481 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1482 seq
->nbytes
= last_charcode_len
;
1484 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1485 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1486 ++ctype
->outdigits_act
;
1489 while (memcmp (last_charcode
, now
->val
.charcode
.bytes
,
1490 last_charcode_len
) != 0);
1495 /* Read one wide string operand of a transliteration entry (used for the from-string and for each to-string). */
1497 read_widestring (struct linereader
*ldfile
, struct token
*now
,
1498 struct charmap_t
*charmap
, struct repertoire_t
*repertoire
)
1502 if (now
->tok
== tok_default_missing
)
1503 /* The special name "" will denote this case. */
1504 wstr
= (uint32_t *) L
"";
1505 else if (now
->tok
== tok_bsymbol
)
1507 /* Get the value from the repertoire. */
1508 wstr
= xmalloc (2 * sizeof (uint32_t));
1509 wstr
[0] = repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1510 now
->val
.str
.lenmb
);
1511 if (wstr
[0] == ILLEGAL_CHAR_VALUE
)
1512 /* We cannot proceed, we don't know the UCS4 value. */
1517 else if (now
->tok
== tok_ucs4
)
1519 wstr
= xmalloc (2 * sizeof (uint32_t));
1520 wstr
[0] = now
->val
.ucs4
;
1523 else if (now
->tok
== tok_charcode
)
1525 /* Argh, we have to convert to the symbol name first and then to the UCS4 value. */
1527 struct charseq
*seq
= charmap_find_symbol (charmap
,
1528 now
->val
.str
.startmb
,
1529 now
->val
.str
.lenmb
);
1531 /* Cannot find the UCS4 value. */
1534 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1535 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1536 strlen (seq
->name
));
1537 if (seq
->ucs4
== ILLEGAL_CHAR_VALUE
)
1538 /* We cannot proceed, we don't know the UCS4 value. */
1541 wstr
= xmalloc (2 * sizeof (uint32_t));
1542 wstr
[0] = seq
->ucs4
;
1545 else if (now
->tok
== tok_string
)
1547 wstr
= now
->val
.str
.startwc
;
1553 if (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1554 lr_ignore_rest (ldfile
, 0);
1555 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1556 return (uint32_t *) -1l;
1564 read_translit_entry (struct linereader
*ldfile
, struct locale_ctype_t
*ctype
,
1565 struct token
*now
, struct charmap_t
*charmap
,
1566 struct repertoire_t
*repertoire
)
1568 uint32_t *from_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1569 struct translit_t
*result
;
1570 struct translit_to_t
**top
;
1571 struct obstack
*ob
= &ctype
->mem_pool
;
1575 if (from_wstr
== NULL
)
1576 /* There is no valid from string. */
1579 result
= (struct translit_t
*) obstack_alloc (ob
,
1580 sizeof (struct translit_t
));
1581 result
->from
= from_wstr
;
1582 result
->next
= NULL
;
1592 /* Next we have one or more transliterations. They are
1593 separated by semicolons. */
1594 now
= lr_token (ldfile
, charmap
, repertoire
);
1596 if (!first
&& (now
->tok
== tok_semicolon
|| now
->tok
== tok_eol
))
1598 /* One string read. */
1599 const uint32_t zero
= 0;
1603 obstack_grow (ob
, &zero
, 4);
1604 to_wstr
= obstack_finish (ob
);
1606 *top
= obstack_alloc (ob
, sizeof (struct translit_to_t
));
1607 (*top
)->str
= to_wstr
;
1608 (*top
)->next
= NULL
;
1611 if (now
->tok
== tok_eol
)
1613 result
->next
= ctype
->translit
;
1614 ctype
->translit
= result
;
1619 top
= &(*top
)->next
;
1624 to_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1625 if (to_wstr
== (uint32_t *) -1l)
1627 /* An error occurred. */
1628 obstack_free (ob
, result
);
1632 if (to_wstr
== NULL
)
1635 /* This value is usable. */
1636 obstack_grow (ob
, to_wstr
, wcslen ((wchar_t *) to_wstr
) * 4);
1644 /* The parser for the LC_CTYPE section of the locale definition. */
1646 ctype_read (struct linereader
*ldfile
, struct localedef_t
*result
,
1647 struct charmap_t
*charmap
, const char *repertoire_name
,
1650 struct repertoire_t
*repertoire
= NULL
;
1651 struct locale_ctype_t
*ctype
;
1653 enum token_t nowtok
;
1655 struct charseq
*last_seq
;
1656 uint32_t last_wch
= 0;
1657 enum token_t last_token
;
1658 enum token_t ellipsis_token
;
1659 char last_charcode
[16];
1660 size_t last_charcode_len
= 0;
1661 const char *last_str
= NULL
;
1664 /* Get the repertoire we have to use. */
1665 if (repertoire_name
!= NULL
)
1666 repertoire
= repertoire_read (repertoire_name
);
1668 /* The rest of the line containing `LC_CTYPE' must be free. */
1669 lr_ignore_rest (ldfile
, 1);
1674 now
= lr_token (ldfile
, charmap
, NULL
);
1677 while (nowtok
== tok_eol
);
1679 /* If we see `copy' now we are almost done. */
1680 if (nowtok
== tok_copy
)
1682 handle_copy (ldfile
, charmap
, repertoire
, result
, tok_lc_ctype
, LC_CTYPE
,
1683 "LC_CTYPE", ignore_content
);
1687 /* Prepare the data structures. */
1688 ctype_startup (ldfile
, result
, charmap
, ignore_content
);
1689 ctype
= result
->categories
[LC_CTYPE
].ctype
;
1691 /* Remember the repertoire we use. */
1692 if (!ignore_content
)
1693 ctype
->repertoire
= repertoire
;
1697 unsigned long int class_bit
= 0;
1698 unsigned long int class256_bit
= 0;
1699 int handle_digits
= 0;
1701 /* Of course we don't proceed beyond the end of file. */
1702 if (nowtok
== tok_eof
)
1705 /* Ignore empty lines. */
1706 if (nowtok
== tok_eol
)
1708 now
= lr_token (ldfile
, charmap
, NULL
);
1716 now
= lr_token (ldfile
, charmap
, NULL
);
1717 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1719 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
1720 now
= lr_token (ldfile
, charmap
, NULL
);
1721 if (now
->tok
!= tok_semicolon
)
1723 now
= lr_token (ldfile
, charmap
, NULL
);
1725 if (now
->tok
!= tok_eol
)
1727 %s: syntax error in definition of new character class"), "LC_CTYPE");
1731 now
= lr_token (ldfile
, charmap
, NULL
);
1732 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1734 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
1735 now
= lr_token (ldfile
, charmap
, NULL
);
1736 if (now
->tok
!= tok_semicolon
)
1738 now
= lr_token (ldfile
, charmap
, NULL
);
1740 if (now
->tok
!= tok_eol
)
1742 %s: syntax error in definition of new character map"), "LC_CTYPE");
1746 /* Ignore the rest of the line if we don't need the input of this line. */
1750 lr_ignore_rest (ldfile
, 0);
1754 /* We simply forget the `class' keyword and use the following
1755 operand to determine the bit. */
1756 now
= lr_token (ldfile
, charmap
, NULL
);
1757 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1759 /* Must be one of the predefined class names. */
1760 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1761 if (strcmp (ctype
->classnames
[cnt
], now
->val
.str
.startmb
) == 0)
1763 if (cnt
>= ctype
->nr_charclass
)
1765 if (now
->val
.str
.lenmb
== 8
1766 && memcmp ("special1", now
->val
.str
.startmb
, 8) == 0)
1767 class_bit
= _ISwspecial1
;
1768 else if (now
->val
.str
.lenmb
== 8
1769 && memcmp ("special2", now
->val
.str
.startmb
, 8) == 0)
1770 class_bit
= _ISwspecial2
;
1771 else if (now
->val
.str
.lenmb
== 8
1772 && memcmp ("special3", now
->val
.str
.startmb
, 8) == 0)
1773 class_bit
= _ISwspecial3
;
1776 lr_error (ldfile
, _("\
1777 unknown character class `%s' in category `LC_CTYPE'"),
1778 now
->val
.str
.startmb
);
1779 free (now
->val
.str
.startmb
);
1781 lr_ignore_rest (ldfile
, 0);
1786 class_bit
= _ISwbit (cnt
);
1788 free (now
->val
.str
.startmb
);
1790 else if (now
->tok
== tok_digit
)
1791 goto handle_tok_digit
;
1792 else if (now
->tok
< tok_upper
|| now
->tok
> tok_blank
)
1796 class_bit
= BITw (now
->tok
);
1797 class256_bit
= BIT (now
->tok
);
1800 /* The next character must be a semicolon. */
1801 now
= lr_token (ldfile
, charmap
, NULL
);
1802 if (now
->tok
!= tok_semicolon
)
1804 goto read_charclass
;
1817 /* Ignore the rest of the line if we don't need the input of this line. */
1821 lr_ignore_rest (ldfile
, 0);
1825 class_bit
= BITw (now
->tok
);
1826 class256_bit
= BIT (now
->tok
);
1829 ctype
->class_done
|= class_bit
;
1830 last_token
= tok_none
;
1831 ellipsis_token
= tok_none
;
1832 now
= lr_token (ldfile
, charmap
, NULL
);
1833 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1836 struct charseq
*seq
;
1838 if (now
->tok
!= tok_bsymbol
)
1839 /* XXX Cannot be handled yet. We will have support
1840 for tok_ucs4 soon. */
1843 if (ellipsis_token
== tok_none
)
1845 if (get_character (now
, charmap
, repertoire
, &seq
, &wch
))
1848 if (!ignore_content
&& seq
!= NULL
&& seq
->nbytes
== 1)
1849 /* Yep, we can store information about this byte sequence. */
1851 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1853 if (!ignore_content
&& wch
!= ILLEGAL_CHAR_VALUE
1855 /* We have the UCS4 position. */
1856 *find_idx (ctype
, &ctype
->class_collection
,
1857 &ctype
->class_collection_max
,
1858 &ctype
->class_collection_act
, wch
) |= class_bit
;
1860 last_token
= now
->tok
;
1861 /* Terminate the string. */
1862 now
->val
.str
.startmb
[now
->val
.str
.lenmb
] = '\0';
1863 last_str
= now
->val
.str
.startmb
;
1866 memcpy (last_charcode
, now
->val
.charcode
.bytes
, 16);
1867 last_charcode_len
= now
->val
.charcode
.nbytes
;
1869 if (!ignore_content
&& handle_digits
== 1)
1871 /* We must store the digit values. */
1872 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1874 ctype
->mbdigits_max
+= 10;
1875 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1876 (ctype
->mbdigits_max
1877 * sizeof (char *)));
1878 ctype
->wcdigits_max
+= 10;
1879 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1880 (ctype
->wcdigits_max
1881 * sizeof (uint32_t)));
1884 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1885 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1887 else if (!ignore_content
&& handle_digits
== 2)
1889 /* We must store the digit values. */
1890 if (ctype
->outdigits_act
>= 10)
1892 lr_error (ldfile
, _("\
1893 %s: field `%s' does not contain exactly ten entries"),
1894 "LC_CTYPE", "outdigit");
1898 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1899 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1900 ++ctype
->outdigits_act
;
1905 /* Now it gets complicated. We have to resolve the
1906 ellipsis problem. First we must distinguish between
1907 the different kinds of ellipsis and this must match the
1908 tokens we have seen. */
1909 assert (last_token
!= tok_none
);
1911 if (last_token
!= now
->tok
)
1913 lr_error (ldfile
, _("\
1914 ellipsis range must be marked by two operands of same type"));
1915 lr_ignore_rest (ldfile
, 0);
1919 if (last_token
== tok_bsymbol
)
1921 if (ellipsis_token
== tok_ellipsis3
)
1922 lr_error (ldfile
, _("with symbolic name range values \
1923 the absolute ellipsis `...' must not be used"));
1925 charclass_symbolic_ellipsis (ldfile
, ctype
, charmap
,
1926 repertoire
, now
, last_str
,
1927 class256_bit
, class_bit
,
1934 else if (last_token
== tok_ucs4
)
1936 if (ellipsis_token
!= tok_ellipsis2
)
1937 lr_error (ldfile
, _("\
1938 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1940 charclass_ucs4_ellipsis (ldfile
, ctype
, charmap
,
1941 repertoire
, now
, last_wch
,
1942 class256_bit
, class_bit
,
1943 ignore_content
, handle_digits
);
1947 assert (last_token
== tok_charcode
);
1949 if (ellipsis_token
!= tok_ellipsis3
)
1950 lr_error (ldfile
, _("\
1951 with character code range values one must use the absolute ellipsis `...'"));
1953 charclass_charcode_ellipsis (ldfile
, ctype
, charmap
,
1957 class256_bit
, class_bit
,
1962 /* Now we have used the last value. */
1963 last_token
= tok_none
;
1966 /* Next we expect a semicolon or the end of the line. */
1967 now
= lr_token (ldfile
, charmap
, NULL
);
1968 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
1971 if (last_token
!= tok_none
1972 && now
->tok
>= tok_ellipsis2
&& now
->tok
<= tok_ellipsis4
)
1974 ellipsis_token
= now
->tok
;
1975 now
= lr_token (ldfile
, charmap
, NULL
);
1979 if (now
->tok
!= tok_semicolon
)
1982 /* And get the next character. */
1983 now
= lr_token (ldfile
, charmap
, NULL
);
1985 ellipsis_token
= tok_none
;
1990 /* Ignore the rest of the line if we don't need the input of this line. */
1994 lr_ignore_rest (ldfile
, 0);
1999 class_bit
= _ISwdigit
;
2000 class256_bit
= _ISdigit
;
2002 goto read_charclass
;
2005 /* Ignore the rest of the line if we don't need the input of this line. */
2009 lr_ignore_rest (ldfile
, 0);
2013 if (ctype
->outdigits_act
!= 0)
2014 lr_error (ldfile
, _("\
2015 %s: field `%s' declared more than once"),
2016 "LC_CTYPE", "outdigit");
2020 goto read_charclass
;
2023 /* Ignore the rest of the line if we don't need the input of this line. */
2027 lr_ignore_rest (ldfile
, 0);
2035 /* Ignore the rest of the line if we don't need the input of this line. */
2039 lr_ignore_rest (ldfile
, 0);
2047 /* Ignore the rest of the line if we don't need the input of this line. */
2051 lr_ignore_rest (ldfile
, 0);
2055 /* We simply forget the `map' keyword and use the following
2056 operand to determine the mapping. */
2057 now
= lr_token (ldfile
, charmap
, NULL
);
2058 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2062 for (cnt
= 2; cnt
< ctype
->map_collection_nr
; ++cnt
)
2063 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2066 if (cnt
< ctype
->map_collection_nr
)
2070 lr_error (ldfile
, _("unknown map `%s'"),
2071 now
->val
.str
.startmb
);
2072 lr_ignore_rest (ldfile
, 0);
2076 else if (now
->tok
< tok_toupper
|| now
->tok
> tok_tolower
)
2079 mapidx
= now
->tok
- tok_toupper
;
2081 now
= lr_token (ldfile
, charmap
, NULL
);
2082 /* This had better be a semicolon. */
2083 if (now
->tok
!= tok_semicolon
)
2087 /* Test whether this mapping was already defined. */
2088 if (ctype
->tomap_done
[mapidx
])
2090 lr_error (ldfile
, _("duplicated definition for mapping `%s'"),
2091 ctype
->mapnames
[mapidx
]);
2092 lr_ignore_rest (ldfile
, 0);
2095 ctype
->tomap_done
[mapidx
] = 1;
2097 now
= lr_token (ldfile
, charmap
, NULL
);
2098 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2100 struct charseq
*from_seq
;
2102 struct charseq
*to_seq
;
2105 /* Every pair starts with an opening brace. */
2106 if (now
->tok
!= tok_open_brace
)
2109 /* Next comes the from-value. */
2110 now
= lr_token (ldfile
, charmap
, NULL
);
2111 if (get_character (now
, charmap
, repertoire
, &from_seq
,
2115 /* The next is a comma. */
2116 now
= lr_token (ldfile
, charmap
, NULL
);
2117 if (now
->tok
!= tok_comma
)
2120 /* And the other value. */
2121 now
= lr_token (ldfile
, charmap
, NULL
);
2122 if (get_character (now
, charmap
, repertoire
, &to_seq
,
2126 /* And the last thing is the closing brace. */
2127 now
= lr_token (ldfile
, charmap
, NULL
);
2128 if (now
->tok
!= tok_close_brace
)
2131 if (!ignore_content
)
2133 if (mapidx
< 2 && from_seq
!= NULL
&& to_seq
!= NULL
2134 && from_seq
->nbytes
== 1 && to_seq
->nbytes
== 1)
2135 /* We can use this value. */
2136 ctype
->map256_collection
[mapidx
][from_seq
->bytes
[0]]
2139 if (from_wch
!= ILLEGAL_CHAR_VALUE
2140 && to_wch
!= ILLEGAL_CHAR_VALUE
)
2141 /* Both correct values. */
2142 *find_idx (ctype
, &ctype
->map_collection
[mapidx
],
2143 &ctype
->map_collection_max
[mapidx
],
2144 &ctype
->map_collection_act
[mapidx
],
2148 /* Now comes a semicolon or the end of the line/file. */
2149 now
= lr_token (ldfile
, charmap
, NULL
);
2150 if (now
->tok
== tok_semicolon
)
2151 now
= lr_token (ldfile
, charmap
, NULL
);
2155 case tok_translit_start
:
2156 /* Ignore the rest of the line if we don't need the input of this line. */
2160 lr_ignore_rest (ldfile
, 0);
2164 /* The rest of the line had better be empty. */
2165 lr_ignore_rest (ldfile
, 1);
2167 /* We count here the number of allocated entries in the `translit'
2171 /* We proceed until we see the `translit_end' token. */
2172 while (now
= lr_token (ldfile
, charmap
, repertoire
),
2173 now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
)
2175 if (now
->tok
== tok_eol
)
2176 /* Ignore empty lines. */
2179 if (now
->tok
== tok_translit_end
)
2181 lr_ignore_rest (ldfile
, 0);
2185 if (now
->tok
== tok_include
)
2187 /* We have to include locale. */
2188 const char *locale_name
;
2189 const char *repertoire_name
;
2191 now
= lr_token (ldfile
, charmap
, NULL
);
2192 /* This should be a string or an identifier. In any
2193 case something to name a locale. */
2194 if (now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2197 lr_error (ldfile
, _("%s: syntax error"), "LC_CTYPE");
2198 lr_ignore_rest (ldfile
, 0);
2201 locale_name
= now
->val
.str
.startmb
;
2203 /* Next should be a semicolon. */
2204 now
= lr_token (ldfile
, charmap
, NULL
);
2205 if (now
->tok
!= tok_semicolon
)
2206 goto translit_syntax
;
2208 /* Now the repertoire name. */
2209 now
= lr_token (ldfile
, charmap
, NULL
);
2210 if ((now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2211 || now
->val
.str
.startmb
== NULL
)
2212 goto translit_syntax
;
2213 repertoire_name
= now
->val
.str
.startmb
;
2215 /* We must not have more than one `include'. */
2216 if (ctype
->translit_copy_locale
!= NULL
)
2218 lr_error (ldfile
, _("\
2219 %s: only one `include' instruction allowed"), "LC_CTYPE");
2220 lr_ignore_rest (ldfile
, 0);
2224 ctype
->translit_copy_locale
= locale_name
;
2225 ctype
->translit_copy_repertoire
= repertoire_name
;
2227 /* The rest of the line must be empty. */
2228 lr_ignore_rest (ldfile
, 1);
2232 read_translit_entry (ldfile
, ctype
, now
, charmap
, repertoire
);
2237 /* Ignore the rest of the line if we don't need the input of this line. */
2241 lr_ignore_rest (ldfile
, 0);
2245 /* This could mean one of several things. First test whether
2246 it's a character class name. */
2247 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2248 if (strcmp (now
->val
.str
.startmb
, ctype
->classnames
[cnt
]) == 0)
2250 if (cnt
< ctype
->nr_charclass
)
2252 class_bit
= _ISwbit (cnt
);
2253 class256_bit
= cnt
<= 11 ? _ISbit (cnt
) : 0;
2254 free (now
->val
.str
.startmb
);
2255 goto read_charclass
;
2257 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
2258 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2260 if (cnt
< ctype
->map_collection_nr
)
2263 free (now
->val
.str
.startmb
);
2266 if (strcmp (now
->val
.str
.startmb
, "special1") == 0)
2268 class_bit
= _ISwspecial1
;
2269 free (now
->val
.str
.startmb
);
2270 goto read_charclass
;
2272 if (strcmp (now
->val
.str
.startmb
, "special2") == 0)
2274 class_bit
= _ISwspecial2
;
2275 free (now
->val
.str
.startmb
);
2276 goto read_charclass
;
2278 if (strcmp (now
->val
.str
.startmb
, "special3") == 0)
2280 class_bit
= _ISwspecial3
;
2281 free (now
->val
.str
.startmb
);
2282 goto read_charclass
;
2284 if (strcmp (now
->val
.str
.startmb
, "tosymmetric") == 0)
2292 /* Next we assume `LC_CTYPE'. */
2293 now
= lr_token (ldfile
, charmap
, NULL
);
2294 if (now
->tok
== tok_eof
)
2296 if (now
->tok
== tok_eol
)
2297 lr_error (ldfile
, _("%s: incomplete `END' line"),
2299 else if (now
->tok
!= tok_lc_ctype
)
2300 lr_error (ldfile
, _("\
2301 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2302 lr_ignore_rest (ldfile
, now
->tok
== tok_lc_ctype
);
2307 if (now
->tok
!= tok_eof
)
2308 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2311 /* Prepare for the next round. */
2312 now
= lr_token (ldfile
, charmap
, NULL
);
2316 /* When we come here we reached the end of the file. */
2317 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2322 set_class_defaults (struct locale_ctype_t
*ctype
, struct charmap_t
*charmap
,
2323 struct repertoire_t
*repertoire
)
2327 /* This function defines the default values for the classes and conversions
2328 according to POSIX.2 2.5.2.1.
2329 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2330 Don't move them unless you know what you are doing! */
2332 void set_default (int bitpos
, int from
, int to
)
2336 int bit
= _ISbit (bitpos
);
2337 int bitw
= _ISwbit (bitpos
);
2338 /* Define string. */
2341 for (ch
= from
; ch
<= to
; ++ch
)
2344 struct charseq
*seq
;
2347 value
= repertoire_find_value (repertoire
, tmp
, 1);
2348 if (value
== ILLEGAL_CHAR_VALUE
)
2352 %s: character `%s' not defined in repertoire while needed as default value"),
2356 ELEM (ctype
, class_collection
, , value
) |= bitw
;
2358 seq
= charmap_find_value (charmap
, tmp
, 1);
2363 %s: character `%s' not defined in charmap while needed as default value"),
2366 else if (seq
->nbytes
!= 1)
2368 %s: character `%s' in charmap not representable with one byte"),
2371 ctype
->class256_collection
[seq
->bytes
[0]] |= bit
;
2375 /* Set default values if keyword was not present. */
2376 if ((ctype
->class_done
& BITw (tok_upper
)) == 0)
2377 /* "If this keyword [upper] is not specified, the uppercase letters
2378 `A' through `Z', ..., shall automatically belong to this class,
2379 with implementation defined character values." [P1003.2, 2.5.2.1] */
2380 set_default (BITPOS (tok_upper
), 'A', 'Z');
2382 if ((ctype
->class_done
& BITw (tok_lower
)) == 0)
2383 /* "If this keyword [lower] is not specified, the lowercase letters
2384 `a' through `z', ..., shall automatically belong to this class,
2385 with implementation defined character values." [P1003.2, 2.5.2.1] */
2386 set_default (BITPOS (tok_lower
), 'a', 'z');
2388 if ((ctype
->class_done
& BITw (tok_alpha
)) == 0)
2390 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2391 class `lower' *must* be in class `alpha'. */
2392 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
2393 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
);
2395 for (cnt
= 0; cnt
< 256; ++cnt
)
2396 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2397 ctype
->class256_collection
[cnt
] |= BIT (tok_alpha
);
2399 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2400 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
2401 ctype
->class_collection
[cnt
] |= BITw (tok_alpha
);
2404 if ((ctype
->class_done
& BITw (tok_digit
)) == 0)
2405 /* "If this keyword [digit] is not specified, the digits `0' through
2406 `9', ..., shall automatically belong to this class, with
2407 implementation-defined character values." [P1003.2, 2.5.2.1] */
2408 set_default (BITPOS (tok_digit
), '0', '9');
2410 /* "Only characters specified for the `alpha' and `digit' keyword
2411 shall be specified. Characters specified for the keyword `alpha'
2412 and `digit' are automatically included in this class." */
2414 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
2415 unsigned long int maskw
= BITw (tok_alpha
) | BITw (tok_digit
);
2417 for (cnt
= 0; cnt
< 256; ++cnt
)
2418 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2419 ctype
->class256_collection
[cnt
] |= BIT (tok_alnum
);
2421 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2422 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
2423 ctype
->class_collection
[cnt
] |= BITw (tok_alnum
);
2426 if ((ctype
->class_done
& BITw (tok_space
)) == 0)
2427 /* "If this keyword [space] is not specified, the characters <space>,
2428 <form-feed>, <newline>, <carriage-return>, <tab>, and
2429 <vertical-tab>, ..., shall automatically belong to this class,
2430 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2433 struct charseq
*seq
;
2435 value
= repertoire_find_value (repertoire
, "space", 5);
2436 if (value
== ILLEGAL_CHAR_VALUE
)
2440 %s: character `%s' not defined while needed as default value"),
2441 "LC_CTYPE", "<space>");
2444 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2446 seq
= charmap_find_value (charmap
, "space", 5);
2451 %s: character `%s' not defined while needed as default value"),
2452 "LC_CTYPE", "<space>");
2454 else if (seq
->nbytes
!= 1)
2456 %s: character `%s' in charmap not representable with one byte"),
2457 "LC_CTYPE", "<space>");
2459 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2462 value
= repertoire_find_value (repertoire
, "form-feed", 9);
2463 if (value
== ILLEGAL_CHAR_VALUE
)
2467 %s: character `%s' not defined while needed as default value"),
2468 "LC_CTYPE", "<form-feed>");
2471 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2473 seq
= charmap_find_value (charmap
, "form-feed", 9);
2478 %s: character `%s' not defined while needed as default value"),
2479 "LC_CTYPE", "<form-feed>");
2481 else if (seq
->nbytes
!= 1)
2483 %s: character `%s' in charmap not representable with one byte"),
2484 "LC_CTYPE", "<form-feed>");
2486 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2489 value
= repertoire_find_value (repertoire
, "newline", 7);
2490 if (value
== ILLEGAL_CHAR_VALUE
)
2494 %s: character `%s' not defined while needed as default value"),
2495 "LC_CTYPE", "<newline>");
2498 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2500 seq
= charmap_find_value (charmap
, "newline", 7);
2505 character `%s' not defined while needed as default value"),
2508 else if (seq
->nbytes
!= 1)
2510 %s: character `%s' in charmap not representable with one byte"),
2511 "LC_CTYPE", "<newline>");
2513 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2516 value
= repertoire_find_value (repertoire
, "carriage-return", 15);
2517 if (value
== ILLEGAL_CHAR_VALUE
)
2521 %s: character `%s' not defined while needed as default value"),
2522 "LC_CTYPE", "<carriage-return>");
2525 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2527 seq
= charmap_find_value (charmap
, "carriage-return", 15);
2532 %s: character `%s' not defined while needed as default value"),
2533 "LC_CTYPE", "<carriage-return>");
2535 else if (seq
->nbytes
!= 1)
2537 %s: character `%s' in charmap not representable with one byte"),
2538 "LC_CTYPE", "<carriage-return>");
2540 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2543 value
= repertoire_find_value (repertoire
, "tab", 3);
2544 if (value
== ILLEGAL_CHAR_VALUE
)
2548 %s: character `%s' not defined while needed as default value"),
2549 "LC_CTYPE", "<tab>");
2552 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2554 seq
= charmap_find_value (charmap
, "tab", 3);
2559 %s: character `%s' not defined while needed as default value"),
2560 "LC_CTYPE", "<tab>");
2562 else if (seq
->nbytes
!= 1)
2564 %s: character `%s' in charmap not representable with one byte"),
2565 "LC_CTYPE", "<tab>");
2567 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2570 value
= repertoire_find_value (repertoire
, "vertical-tab", 12);
2571 if (value
== ILLEGAL_CHAR_VALUE
)
2575 %s: character `%s' not defined while needed as default value"),
2576 "LC_CTYPE", "<vertical-tab>");
2579 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2581 seq
= charmap_find_value (charmap
, "vertical-tab", 12);
2586 %s: character `%s' not defined while needed as default value"),
2587 "LC_CTYPE", "<vertical-tab>");
2589 else if (seq
->nbytes
!= 1)
2591 %s: character `%s' in charmap not representable with one byte"),
2592 "LC_CTYPE", "<vertical-tab>");
2594 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2597 if ((ctype
->class_done
& BITw (tok_xdigit
)) == 0)
2598 /* "If this keyword is not specified, the digits `0' to `9', the
2599 uppercase letters `A' through `F', and the lowercase letters `a'
2600 through `f', ..., shall automatically belong to this class, with
2601 implementation defined character values." [P1003.2, 2.5.2.1] */
2603 set_default (BITPOS (tok_xdigit
), '0', '9');
2604 set_default (BITPOS (tok_xdigit
), 'A', 'F');
2605 set_default (BITPOS (tok_xdigit
), 'a', 'f');
2608 if ((ctype
->class_done
& BITw (tok_blank
)) == 0)
2609 /* "If this keyword [blank] is unspecified, the characters <space> and
2610 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2613 struct charseq
*seq
;
2615 value
= repertoire_find_value (repertoire
, "space", 5);
2616 if (value
== ILLEGAL_CHAR_VALUE
)
2620 %s: character `%s' not defined while needed as default value"),
2621 "LC_CTYPE", "<space>");
2624 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
2626 seq
= charmap_find_value (charmap
, "space", 5);
2631 %s: character `%s' not defined while needed as default value"),
2632 "LC_CTYPE", "<space>");
2634 else if (seq
->nbytes
!= 1)
2636 %s: character `%s' in charmap not representable with one byte"),
2637 "LC_CTYPE", "<space>");
2639 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
2642 value
= repertoire_find_value (repertoire
, "tab", 3);
2643 if (value
== ILLEGAL_CHAR_VALUE
)
2647 %s: character `%s' not defined while needed as default value"),
2648 "LC_CTYPE", "<tab>");
2651 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
2653 seq
= charmap_find_value (charmap
, "tab", 3);
2658 %s: character `%s' not defined while needed as default value"),
2659 "LC_CTYPE", "<tab>");
2661 else if (seq
->nbytes
!= 1)
2663 %s: character `%s' in charmap not representable with one byte"),
2664 "LC_CTYPE", "<tab>");
2666 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
2669 if ((ctype
->class_done
& BITw (tok_graph
)) == 0)
2670 /* "If this keyword [graph] is not specified, characters specified for
2671 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2672 shall belong to this character class." [P1003.2, 2.5.2.1] */
2674 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
2675 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
2678 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2679 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
2680 ctype
->class_collection
[cnt
] |= BIT (tok_graph
);
2682 for (cnt
= 0; cnt
< 256; ++cnt
)
2683 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2684 ctype
->class256_collection
[cnt
] |= BIT (tok_graph
);
2687 if ((ctype
->class_done
& BITw (tok_print
)) == 0)
2688 /* "If this keyword [print] is not provided, characters specified for
2689 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2690 and the <space> character shall belong to this character class."
2691 [P1003.2, 2.5.2.1] */
2693 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
2694 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
2697 struct charseq
*seq
;
2699 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2700 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
2701 ctype
->class_collection
[cnt
] |= BIT (tok_print
);
2703 for (cnt
= 0; cnt
< 256; ++cnt
)
2704 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2705 ctype
->class256_collection
[cnt
] |= BIT (tok_print
);
2708 space
= repertoire_find_value (repertoire
, "space", 5);
2709 if (space
== ILLEGAL_CHAR_VALUE
)
2713 %s: character `%s' not defined while needed as default value"),
2714 "LC_CTYPE", "<space>");
2717 ELEM (ctype
, class_collection
, , space
) |= BIT (tok_print
);
2719 seq
= charmap_find_value (charmap
, "space", 5);
2724 %s: character `%s' not defined while needed as default value"),
2725 "LC_CTYPE", "<space>");
2727 else if (seq
->nbytes
!= 1)
2729 %s: character `%s' in charmap not representable with one byte"),
2730 "LC_CTYPE", "<space>");
2732 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_print
);
2735 if (ctype
->tomap_done
[0] == 0)
2736 /* "If this keyword [toupper] is not specified, the lowercase letters
2737 `a' through `z', and their corresponding uppercase letters `A' to
2738 `Z', ..., shall automatically be included, with implementation-
2739 defined character values." [P1003.2, 2.5.2.1] */
2744 strcpy (tmp
, "<?>");
2746 for (ch
= 'a'; ch
<= 'z'; ++ch
)
2748 uint32_t value_from
, value_to
;
2749 struct charseq
*seq_from
, *seq_to
;
2753 value_from
= repertoire_find_value (repertoire
, &tmp
[1], 1);
2754 if (value_from
== ILLEGAL_CHAR_VALUE
)
2758 %s: character `%s' not defined while needed as default value"),
2763 /* This conversion is implementation defined. */
2764 tmp
[1] = (char) (ch
+ ('A' - 'a'));
2765 value_to
= repertoire_find_value (repertoire
, &tmp
[1], 1);
2766 if (value_to
== ILLEGAL_CHAR_VALUE
)
2770 %s: character `%s' not defined while needed as default value"),
2774 /* The index [0] is determined by the order of the
2775 `ctype_map_newP' calls in `ctype_startup'. */
2776 ELEM (ctype
, map_collection
, [0], value_from
) = value_to
;
2779 seq_from
= charmap_find_value (charmap
, &tmp
[1], 1);
2780 if (seq_from
== NULL
)
2784 %s: character `%s' not defined while needed as default value"),
2787 else if (seq_from
->nbytes
!= 1)
2791 %s: character `%s' needed as default value not representable with one byte"),
2796 /* This conversion is implementation defined. */
2797 tmp
[1] = (char) (ch
+ ('A' - 'a'));
2798 seq_to
= charmap_find_value (charmap
, &tmp
[1], 1);
2803 %s: character `%s' not defined while needed as default value"),
2806 else if (seq_to
->nbytes
!= 1)
2810 %s: character `%s' needed as default value not representable with one byte"),
2814 /* The index [0] is determined by the order of the
2815 `ctype_map_newP' calls in `ctype_startup'. */
2816 ctype
->map256_collection
[0][seq_from
->bytes
[0]]
2822 if (ctype
->tomap_done
[1] == 0)
2823 /* "If this keyword [tolower] is not specified, the mapping shall be
2824 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2826 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
2827 if (ctype
->map_collection
[0][cnt
] != 0)
2828 ELEM (ctype
, map_collection
, [1],
2829 ctype
->map_collection
[0][cnt
])
2830 = ctype
->charnames
[cnt
];
2832 for (cnt
= 0; cnt
< 256; ++cnt
)
2833 if (ctype
->map256_collection
[0][cnt
] != 0)
2834 ctype
->map_collection
[1][ctype
->map_collection
[0][cnt
]]
2835 = ctype
->charnames
[cnt
];
2838 if (ctype
->outdigits_act
== 0)
2840 for (cnt
= 0; cnt
< 10; ++cnt
)
2842 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
2845 if (ctype
->mboutdigits
[cnt
] == NULL
)
2847 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
2849 strlen (longnames
[cnt
]));
2851 if (ctype
->mboutdigits
[cnt
] == NULL
)
2853 /* Provide a replacement. */
2855 no output digits defined and none of the standard names in the charmap"));
2857 ctype
->mboutdigits
[cnt
] = obstack_alloc (&charmap
->mem_pool
,
2858 sizeof (struct charseq
) + 1);
2860 /* This is better than nothing. */
2861 ctype
->mboutdigits
[cnt
]->bytes
[0] = digits
[cnt
];
2862 ctype
->mboutdigits
[cnt
]->nbytes
= 1;
2866 ctype
->wcoutdigits
[cnt
] = repertoire_find_value (repertoire
,
2869 if (ctype
->wcoutdigits
[cnt
] == ILLEGAL_CHAR_VALUE
)
2871 ctype
->wcoutdigits
[cnt
] = repertoire_find_value (repertoire
,
2873 strlen (longnames
[cnt
]));
2875 if (ctype
->wcoutdigits
[cnt
] == ILLEGAL_CHAR_VALUE
)
2877 /* Provide a replacement. */
2879 no output digits defined and none of the standard names in the repertoire"));
2881 /* This is better than nothing. */
2882 ctype
->wcoutdigits
[cnt
] = (uint32_t) digits
[cnt
];
2887 ctype
->outdigits_act
= 10;
2893 allocate_arrays (struct locale_ctype_t
*ctype
, struct charmap_t
*charmap
,
2894 struct repertoire_t
*repertoire
)
2898 /* First we have to decide how we organize the arrays. It is easy
2899 for a one-byte character set. But a multi-byte character set
2900 cannot be stored flat because the chars might be sparsely used.
2901 So we determine an optimal hashing function for the used
2904 We use a very trivial hashing function to store the sparse
2905 table. CH % TABSIZE is used as an index. To solve multiple hits
2906 we have N planes. This guarantees a fixed search time for a
2907 character [N / 2]. In the following code we determine the minimum
2908 value for TABSIZE * N, where TABSIZE >= 256. */
2909 size_t min_total
= UINT_MAX
;
2910 size_t act_size
= 256;
2914 Computing table size for character classes might take a while..."),
2917 while (act_size
< min_total
)
2919 size_t cnt
[act_size
];
2920 size_t act_planes
= 1;
2922 memset (cnt
, '\0', sizeof cnt
);
2924 for (idx
= 0; idx
< 256; ++idx
)
2927 for (idx
= 0; idx
< ctype
->charnames_act
; ++idx
)
2928 if (ctype
->charnames
[idx
] >= 256)
2930 size_t nr
= ctype
->charnames
[idx
] % act_size
;
2932 if (++cnt
[nr
] > act_planes
)
2934 act_planes
= cnt
[nr
];
2935 if (act_size
* act_planes
>= min_total
)
2940 if (act_size
* act_planes
< min_total
)
2942 min_total
= act_size
* act_planes
;
2943 ctype
->plane_size
= act_size
;
2944 ctype
->plane_cnt
= act_planes
;
2951 fputs (_(" done\n"), stderr
);
2954 ctype
->names
= (uint32_t *) xcalloc (ctype
->plane_size
2958 for (idx
= 1; idx
< 256; ++idx
)
2959 ctype
->names
[idx
] = idx
;
2961 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
2962 ctype
->names
[0] = 1;
2964 for (idx
= 256; idx
< ctype
->charnames_act
; ++idx
)
2966 size_t nr
= (ctype
->charnames
[idx
] % ctype
->plane_size
);
2969 while (ctype
->names
[nr
+ depth
* ctype
->plane_size
])
2971 assert (depth
< ctype
->plane_cnt
);
2973 ctype
->names
[nr
+ depth
* ctype
->plane_size
] = ctype
->charnames
[idx
];
2975 /* Now for faster access remember the index in the NAMES_B array. */
2976 ctype
->charnames
[idx
] = nr
+ depth
* ctype
->plane_size
;
2978 ctype
->names
[0] = 0;
2981 /* You wonder about this amount of memory? This is only because some
2982 users do not manage to address the array with unsigned values or
2983 data types with range >= 256. '\200' would result in the array
2984 index -128. To help these poor people we duplicate the entries for
2985 128 up to 255 below the entry for \0. */
2986 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128,
2987 sizeof (char_class_t
));
2988 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (ctype
->plane_size
2990 sizeof (char_class32_t
));
2992 /* This is the array accessed using the multibyte string elements. */
2993 for (idx
= 0; idx
< 256; ++idx
)
2994 ctype
->ctype_b
[128 + idx
] = ctype
->class256_collection
[idx
];
2996 /* Mirror first 127 entries. We must take care that entry -1 is not
2997 mirrored because EOF == -1. */
2998 for (idx
= 0; idx
< 127; ++idx
)
2999 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
3001 /* The 32 bit array contains all characters. */
3002 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3003 ctype
->ctype32_b
[ctype
->charnames
[idx
]] = ctype
->class_collection
[idx
];
3005 /* Room for table of mappings. */
3006 ctype
->map
= (uint32_t **) xmalloc (ctype
->map_collection_nr
3007 * sizeof (uint32_t *));
3009 /* Fill in all mappings. */
3010 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
3014 /* Allocate table. */
3015 ctype
->map
[idx
] = (uint32_t *) xmalloc ((ctype
->plane_size
3016 * ctype
->plane_cnt
+ 128)
3017 * sizeof (uint32_t));
3019 /* Copy default value (identity mapping). */
3020 memcpy (&ctype
->map
[idx
][128], ctype
->names
,
3021 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (uint32_t));
3023 /* Copy values from collection. */
3024 for (idx2
= 0; idx2
< 256; ++idx2
)
3025 ctype
->map
[idx
][128 + idx2
] = ctype
->map256_collection
[idx
][idx2
];
3027 /* Mirror first 127 entries. We must take care not to map entry
3028 -1 because EOF == -1. */
3029 for (idx2
= 0; idx2
< 127; ++idx2
)
3030 ctype
->map
[idx
][idx2
] = ctype
->map
[idx
][256 + idx2
];
3032 /* EOF must map to EOF. */
3033 ctype
->map
[idx
][127] = EOF
;
3035 /* The 32 bit map collection. */
3036 for (idx2
= 0; idx2
< ctype
->map_collection_act
[idx
]; ++idx2
)
3037 if (ctype
->map_collection
[idx
][idx2
] != 0)
3038 ctype
->map
[idx
][128 + ctype
->charnames
[idx2
]]
3039 = ctype
->map_collection
[idx
][idx2
];
3042 /* Extra array for class and map names. */
3043 ctype
->class_name_ptr
= (uint32_t *) xmalloc (ctype
->nr_charclass
3044 * sizeof (uint32_t));
3045 ctype
->map_name_ptr
= (uint32_t *) xmalloc (ctype
->map_collection_nr
3046 * sizeof (uint32_t));
3048 /* Array for width information. Because the expected widths are very
3049 small we use only a single byte. This saves space and we need
3050 not provide the information twice with both endiannesses. */
3051 ctype
->width
= (unsigned char *) xmalloc (ctype
->plane_size
3052 * ctype
->plane_cnt
);
3053 /* Initialize with default width value. */
3054 memset (ctype
->width
, charmap
->width_default
,
3055 ctype
->plane_size
* ctype
->plane_cnt
);
3056 if (charmap
->width_rules
!= NULL
)
3060 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
3062 unsigned char bytes
[charmap
->mb_cur_max
];
3063 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
3065 /* We have the range of character for which the width is
3066 specified described using byte sequences of the multibyte
3067 charset. We have to convert this to UCS4 now. And we
3068 cannot simply convert the beginning and the end of the
3069 sequence, we have to iterate over the byte sequence and
3070 convert it for every single character. */
3071 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
3073 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
3074 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
3077 /* Find the UCS value for `bytes'. */
3078 uint32_t wch
= repertoire_find_value (ctype
->repertoire
, bytes
,
3082 if (wch
!= ILLEGAL_CHAR_VALUE
)
3084 /* Store the value. */
3085 size_t nr
= idx
% ctype
->plane_size
;
3088 while (ctype
->names
[nr
+ depth
* ctype
->plane_size
] != nr
)
3090 assert (depth
< ctype
->plane_cnt
);
3092 ctype
->width
[nr
+ depth
* ctype
->plane_size
]
3093 = charmap
->width_rules
[cnt
].width
;
3096 /* "Increment" the bytes sequence. */
3098 while (inner
>= 0 && bytes
[inner
] == 0xff)
3103 /* We have to extend the byte sequence. */
3104 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
3108 memset (&bytes
[1], 0, nbytes
);
3114 while (++inner
< nbytes
)
3121 /* Set MB_CUR_MAX. */
3122 ctype
->mb_cur_max
= charmap
->mb_cur_max
;
3124 /* We need the name of the currently used 8-bit character set to
3125 make correct conversion between this 8-bit representation and the
3126 ISO 10646 character set used internally for wide characters. */
3127 ctype
->codeset_name
= charmap
->code_set_name
;
3129 /* Now determine the table for the transliteration information.
3131 XXX It is not yet clear to me whether it is worth implementing a
3132 complicated algorithm which uses a hash table to locate the entries.
3133 For now I'll use a simple array which can be searched using binary
3135 if (ctype
->translit_copy_locale
!= NULL
)
3137 /* Fold in the transliteration information from the locale mentioned
3138 in the `include' statement. */
3139 struct locale_ctype_t
*here
= ctype
;
3143 struct localedef_t
*other
= find_locale (LC_CTYPE
,
3144 here
->translit_copy_locale
,
3145 repertoire
->name
, charmap
);
3150 %s: transliteration data from locale `%s' not available"),
3151 "LC_CTYPE", here
->translit_copy_locale
);
3155 here
= other
->categories
[LC_CTYPE
].ctype
;
3157 /* Enqueue the information if necessary. */
3158 if (here
->translit
!= NULL
)
3160 struct translit_t
*endp
= here
->translit
;
3161 while (endp
->next
!= NULL
)
3164 endp
->next
= ctype
->translit
;
3165 ctype
->translit
= here
->translit
;
3168 while (here
->translit_copy_locale
!= NULL
);
3171 if (ctype
->translit
!= NULL
)
3173 /* First count how many entries we have. This is the upper limit
3174 since some entries from the included files might be overwritten. */
3177 struct translit_t
*runp
= ctype
->translit
;
3178 struct translit_t
**sorted
;
3179 size_t from_len
, to_len
;
3181 while (runp
!= NULL
)
3187 /* Next we allocate an array large enough and fill in the values. */
3188 sorted
= (struct translit_t
**) alloca (number
3189 * sizeof (struct translit_t
**));
3190 runp
= ctype
->translit
;
3194 /* Search for the place where to insert this string.
3195 XXX Better use a real sorting algorithm later. */
3199 while (idx
< number
)
3201 int res
= wcscmp ((const wchar_t *) sorted
[idx
]->from
,
3202 (const wchar_t *) runp
->from
);
3217 memmove (&sorted
[idx
+ 1], &sorted
[idx
],
3218 (number
- idx
) * sizeof (struct translit_t
*));
3225 while (runp
!= NULL
);
3227 /* The next step is putting all the possible transliteration
3228 strings in one memory block so that we can write it out.
3229 We need several different blocks:
3230 - index to the from-string array
3232 - index to the to-string array
3234 And all of this must be available for both endianness variants.
3236 from_len
= to_len
= 0;
3237 for (cnt
= 0; cnt
< number
; ++cnt
)
3239 struct translit_to_t
*srunp
;
3240 from_len
+= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
3241 srunp
= sorted
[cnt
]->to
;
3242 while (srunp
!= NULL
)
3244 to_len
+= wcslen ((const wchar_t *) srunp
->str
) + 1;
3245 srunp
= srunp
->next
;
3247 /* Plus one for the extra NUL character marking the end of
3248 the list for the current entry. */
3252 /* We can allocate the arrays for the results. */
3253 ctype
->translit_from_idx
= xmalloc (number
* sizeof (uint32_t));
3254 ctype
->translit_from_tbl
= xmalloc (from_len
* sizeof (uint32_t));
3255 ctype
->translit_to_idx
= xmalloc (number
* sizeof (uint32_t));
3256 ctype
->translit_to_tbl
= xmalloc (to_len
* sizeof (uint32_t));
3260 for (cnt
= 0; cnt
< number
; ++cnt
)
3263 struct translit_to_t
*srunp
;
3265 ctype
->translit_from_idx
[cnt
] = from_len
;
3266 ctype
->translit_to_idx
[cnt
] = to_len
;
3268 len
= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
3269 wmemcpy ((wchar_t *) &ctype
->translit_from_tbl
[from_len
],
3270 (const wchar_t *) sorted
[cnt
]->from
, len
);
3273 ctype
->translit_to_idx
[cnt
] = to_len
;
3274 srunp
= sorted
[cnt
]->to
;
3275 while (srunp
!= NULL
)
3277 len
= wcslen ((const wchar_t *) srunp
->str
) + 1;
3278 wmemcpy ((wchar_t *) &ctype
->translit_to_tbl
[to_len
],
3279 (const wchar_t *) srunp
->str
, len
);
3281 srunp
= srunp
->next
;
3283 ctype
->translit_to_tbl
[to_len
++] = L
'\0';
3286 /* Store the information about the length. */
3287 ctype
->translit_idx_size
= number
* sizeof (uint32_t);
3288 ctype
->translit_from_tbl_size
= from_len
* sizeof (uint32_t);
3289 ctype
->translit_to_tbl_size
= to_len
* sizeof (uint32_t);
3293 /* Provide some dummy pointers since we have nothing to write out. */
3294 static uint32_t no_str
= { 0 };
3296 ctype
->translit_from_idx
= &no_str
;
3297 ctype
->translit_from_tbl
= &no_str
;
3298 ctype
->translit_to_tbl
= &no_str
;
3299 ctype
->translit_idx_size
= 0;
3300 ctype
->translit_from_tbl_size
= 0;
3301 ctype
->translit_to_tbl_size
= 0;