1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
37 #include "localeinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
42 #include "localedef.h"
47 #ifdef PREDEFINED_CLASSES
48 /* These are the extra bits not in wctype.h since these are not preallocated
50 # define _ISwspecial1 (1 << 29)
51 # define _ISwspecial2 (1 << 30)
52 # define _ISwspecial3 (1 << 31)
56 /* The bit used for representing a special class. */
57 #define BITPOS(class) ((class) - tok_upper)
58 #define BIT(class) (_ISbit (BITPOS (class)))
59 #define BITw(class) (_ISwbit (BITPOS (class)))
/* Expand to the dereferenced slot that find_idx returns for VALUE in
   CTYPE's COLLECTION table, so the result can be read or assigned to.
   IDX is pasted verbatim after the table, `_max' and `_act' member names;
   the uses in this file pass it empty for the single class table.  */
61 #define ELEM(ctype, collection, idx, value) \
62 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
63 &ctype->collection##_act idx, value)
66 /* To be compatible with former implementations we for now restrict
67 the number of bits for character classes to 16. When compatibility
68 is not necessary anymore increase the number to 32. */
69 #define char_class_t uint16_t
70 #define char_class32_t uint32_t
73 /* Type to describe a transliteration action. We have a possibly
74 multiple character from-string and a set of multiple character
75 to-strings. All are 32bit values since this is what is used in
76 the gconv functions. */
81 struct translit_to_t
*next
;
88 struct translit_to_t
*to
;
90 struct translit_t
*next
;
94 /* The real definition of the struct for the LC_CTYPE locale. */
101 struct repertoire_t
*repertoire
;
103 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
104 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
106 const char *classnames
[MAX_NR_CHARCLASS
];
107 uint32_t last_class_char
;
108 uint32_t class256_collection
[256];
109 uint32_t *class_collection
;
110 size_t class_collection_max
;
111 size_t class_collection_act
;
114 struct charseq
**mbdigits
;
121 struct charseq
*mboutdigits
[10];
122 uint32_t wcoutdigits
[10];
123 size_t outdigits_act
;
125 /* If the following number ever turns out to be too small simply
126 increase it. But I doubt it will. --drepper@gnu */
127 #define MAX_NR_CHARMAP 16
128 const char *mapnames
[MAX_NR_CHARMAP
];
129 uint32_t *map_collection
[MAX_NR_CHARMAP
];
130 uint32_t map256_collection
[2][256];
131 size_t map_collection_max
[MAX_NR_CHARMAP
];
132 size_t map_collection_act
[MAX_NR_CHARMAP
];
133 size_t map_collection_nr
;
135 int tomap_done
[MAX_NR_CHARMAP
];
137 /* Transliteration information. */
138 const char *translit_copy_locale
;
139 const char *translit_copy_repertoire
;
140 struct translit_t
*translit
;
142 /* The arrays for the binary representation. */
145 char_class_t
*ctype_b
;
146 char_class32_t
*ctype32_b
;
149 uint32_t *class_name_ptr
;
150 uint32_t *map_name_ptr
;
151 unsigned char *width
;
153 const char *codeset_name
;
154 uint32_t translit_hash_size
;
155 uint32_t translit_hash_layers
;
156 uint32_t *translit_from_idx
;
157 uint32_t *translit_from_tbl
;
158 uint32_t *translit_to_idx
;
159 uint32_t *translit_to_tbl
;
160 size_t translit_idx_size
;
161 size_t translit_from_tbl_size
;
162 size_t translit_to_tbl_size
;
164 struct obstack mem_pool
;
168 #define obstack_chunk_alloc xmalloc
169 #define obstack_chunk_free free
172 /* Prototypes for local functions. */
173 static void ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
174 struct charmap_t
*charmap
, int ignore_content
);
175 static void ctype_class_new (struct linereader
*lr
,
176 struct locale_ctype_t
*ctype
, const char *name
);
177 static void ctype_map_new (struct linereader
*lr
,
178 struct locale_ctype_t
*ctype
,
179 const char *name
, struct charmap_t
*charmap
);
180 static uint32_t *find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
,
181 size_t *max
, size_t *act
, unsigned int idx
);
182 static void set_class_defaults (struct locale_ctype_t
*ctype
,
183 struct charmap_t
*charmap
,
184 struct repertoire_t
*repertoire
);
185 static void allocate_arrays (struct locale_ctype_t
*ctype
,
186 struct charmap_t
*charmap
,
187 struct repertoire_t
*repertoire
);
190 static const char *longnames
[] =
192 "zero", "one", "two", "three", "four",
193 "five", "six", "seven", "eight", "nine"
195 static const unsigned char digits
[] = "0123456789";
199 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
200 struct charmap_t
*charmap
, int ignore_content
)
203 struct locale_ctype_t
*ctype
;
207 /* Allocate the needed room. */
208 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
209 (struct locale_ctype_t
*) xcalloc (1, sizeof (struct locale_ctype_t
));
211 /* We have seen no names yet. */
212 ctype
->charnames_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
214 (unsigned int *) xmalloc (ctype
->charnames_max
215 * sizeof (unsigned int));
216 for (cnt
= 0; cnt
< 256; ++cnt
)
217 ctype
->charnames
[cnt
] = cnt
;
218 ctype
->charnames_act
= 256;
220 /* Fill character class information. */
221 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
222 /* The order of the following instructions determines the bit
224 ctype_class_new (lr
, ctype
, "upper");
225 ctype_class_new (lr
, ctype
, "lower");
226 ctype_class_new (lr
, ctype
, "alpha");
227 ctype_class_new (lr
, ctype
, "digit");
228 ctype_class_new (lr
, ctype
, "xdigit");
229 ctype_class_new (lr
, ctype
, "space");
230 ctype_class_new (lr
, ctype
, "print");
231 ctype_class_new (lr
, ctype
, "graph");
232 ctype_class_new (lr
, ctype
, "blank");
233 ctype_class_new (lr
, ctype
, "cntrl");
234 ctype_class_new (lr
, ctype
, "punct");
235 ctype_class_new (lr
, ctype
, "alnum");
236 #ifdef PREDEFINED_CLASSES
237 /* The following are extensions from ISO 14652. */
238 ctype_class_new (lr
, ctype
, "left_to_right");
239 ctype_class_new (lr
, ctype
, "right_to_left");
240 ctype_class_new (lr
, ctype
, "num_terminator");
241 ctype_class_new (lr
, ctype
, "num_separator");
242 ctype_class_new (lr
, ctype
, "segment_separator");
243 ctype_class_new (lr
, ctype
, "block_separator");
244 ctype_class_new (lr
, ctype
, "direction_control");
245 ctype_class_new (lr
, ctype
, "sym_swap_layout");
246 ctype_class_new (lr
, ctype
, "char_shape_selector");
247 ctype_class_new (lr
, ctype
, "num_shape_selector");
248 ctype_class_new (lr
, ctype
, "non_spacing");
249 ctype_class_new (lr
, ctype
, "non_spacing_level3");
250 ctype_class_new (lr
, ctype
, "normal_connect");
251 ctype_class_new (lr
, ctype
, "r_connect");
252 ctype_class_new (lr
, ctype
, "no_connect");
253 ctype_class_new (lr
, ctype
, "no_connect-space");
254 ctype_class_new (lr
, ctype
, "vowel_connect");
257 ctype
->class_collection_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
258 ctype
->class_collection
259 = (uint32_t *) xcalloc (sizeof (unsigned long int),
260 ctype
->class_collection_max
);
261 ctype
->class_collection_act
= 256;
263 /* Fill character map information. */
264 ctype
->map_collection_nr
= 0;
265 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
266 ctype_map_new (lr
, ctype
, "toupper", charmap
);
267 ctype_map_new (lr
, ctype
, "tolower", charmap
);
268 #ifdef PREDEFINED_CLASSES
269 ctype_map_new (lr
, ctype
, "tosymmetric", charmap
);
272 /* Fill first 256 entries in `toXXX' arrays. */
273 for (cnt
= 0; cnt
< 256; ++cnt
)
275 ctype
->map_collection
[0][cnt
] = cnt
;
276 ctype
->map_collection
[1][cnt
] = cnt
;
277 #ifdef PREDEFINED_CLASSES
278 ctype
->map_collection
[2][cnt
] = cnt
;
280 ctype
->map256_collection
[0][cnt
] = cnt
;
281 ctype
->map256_collection
[1][cnt
] = cnt
;
284 obstack_init (&ctype
->mem_pool
);
290 ctype_finish (struct localedef_t
*locale
, struct charmap_t
*charmap
)
292 /* See POSIX.2, table 2-6 for the meaning of the following table. */
297 const char allow
[NCLASS
];
299 valid_table
[NCLASS
] =
301 /* The order is important. See token.h for more information.
302 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
303 { "upper", "--MX-XDDXXX-" },
304 { "lower", "--MX-XDDXXX-" },
305 { "alpha", "---X-XDDXXX-" },
306 { "digit", "XXX--XDDXXX-" },
307 { "xdigit", "-----XDDXXX-" },
308 { "space", "XXXXX------X" },
309 { "print", "---------X--" },
310 { "graph", "---------X--" },
311 { "blank", "XXXXXM-----X" },
312 { "cntrl", "XXXXX-XX--XX" },
313 { "punct", "XXXXX-DD-X-X" },
314 { "alnum", "-----XDDXXX-" }
318 uint32_t space_value
;
319 struct charseq
*space_seq
;
320 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
323 /* Now resolve copying and also handle completely missing definitions. */
326 /* First see whether we were supposed to copy. If yes, find the
327 actual definition. */
328 if (locale
->copy_name
[LC_CTYPE
] != NULL
)
330 /* Find the copying locale. This has to happen transitively since
331 the locale we are copying from might also be copying another one. */
332 struct localedef_t
*from
= locale
;
335 from
= find_locale (LC_CTYPE
, from
->copy_name
[LC_CTYPE
],
336 from
->repertoire_name
, charmap
);
337 while (from
->categories
[LC_CTYPE
].ctype
== NULL
338 && from
->copy_name
[LC_CTYPE
] != NULL
);
340 ctype
= locale
->categories
[LC_CTYPE
].ctype
341 = from
->categories
[LC_CTYPE
].ctype
;
344 /* If there is still no definition issue a warning and create an
348 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
349 ctype_startup (NULL
, locale
, charmap
, 0);
350 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
354 /* Set default value for classes not specified. */
355 set_class_defaults (ctype
, charmap
, ctype
->repertoire
);
357 /* Check according to table. */
358 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
360 uint32_t tmp
= ctype
->class_collection
[cnt
];
364 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
365 if ((tmp
& _ISwbit (cls1
)) != 0)
366 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
367 if (valid_table
[cls1
].allow
[cls2
] != '-')
369 int eq
= (tmp
& _ISwbit (cls2
)) != 0;
370 switch (valid_table
[cls1
].allow
[cls2
])
375 uint32_t value
= ctype
->charnames
[cnt
];
379 character L'\\u%0*x' in class `%s' must be in class `%s'"),
380 value
> 0xffff ? 8 : 4, value
,
381 valid_table
[cls1
].name
,
382 valid_table
[cls2
].name
);
389 uint32_t value
= ctype
->charnames
[cnt
];
393 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
394 value
> 0xffff ? 8 : 4, value
,
395 valid_table
[cls1
].name
,
396 valid_table
[cls2
].name
);
401 ctype
->class_collection
[cnt
] |= _ISwbit (cls2
);
405 error (5, 0, _("internal error in %s, line %u"),
406 __FUNCTION__
, __LINE__
);
412 for (cnt
= 0; cnt
< 256; ++cnt
)
414 uint32_t tmp
= ctype
->class256_collection
[cnt
];
418 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
419 if ((tmp
& _ISbit (cls1
)) != 0)
420 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
421 if (valid_table
[cls1
].allow
[cls2
] != '-')
423 int eq
= (tmp
& _ISbit (cls2
)) != 0;
424 switch (valid_table
[cls1
].allow
[cls2
])
431 sprintf (buf
, "\\%o", cnt
);
435 character '%s' in class `%s' must be in class `%s'"),
436 buf
, valid_table
[cls1
].name
,
437 valid_table
[cls2
].name
);
446 sprintf (buf
, "\\%o", cnt
);
450 character '%s' in class `%s' must not be in class `%s'"),
451 buf
, valid_table
[cls1
].name
,
452 valid_table
[cls2
].name
);
457 ctype
->class256_collection
[cnt
] |= _ISbit (cls2
);
461 error (5, 0, _("internal error in %s, line %u"),
462 __FUNCTION__
, __LINE__
);
468 /* ... and now test <SP> as a special case. */
469 space_value
= repertoire_find_value (ctype
->repertoire
, "SP", 2);
470 if (space_value
== ILLEGAL_CHAR_VALUE
)
473 error (0, 0, _("character <SP> not defined in character map"));
475 else if (((cnt
= BITPOS (tok_space
),
476 (ELEM (ctype
, class_collection
, , space_value
)
477 & BITw (tok_space
)) == 0)
478 || (cnt
= BITPOS (tok_blank
),
479 (ELEM (ctype
, class_collection
, , space_value
)
480 & BITw (tok_blank
)) == 0)))
483 error (0, 0, _("<SP> character not in class `%s'"),
484 valid_table
[cnt
].name
);
486 else if (((cnt
= BITPOS (tok_punct
),
487 (ELEM (ctype
, class_collection
, , space_value
)
488 & BITw (tok_punct
)) != 0)
489 || (cnt
= BITPOS (tok_graph
),
490 (ELEM (ctype
, class_collection
, , space_value
)
495 error (0, 0, _("<SP> character must not be in class `%s'"),
496 valid_table
[cnt
].name
);
499 ELEM (ctype
, class_collection
, , space_value
) |= BITw (tok_print
);
501 space_seq
= charmap_find_value (charmap
, "SP", 2);
502 if (space_seq
== NULL
|| space_seq
->nbytes
!= 1)
505 error (0, 0, _("character <SP> not defined in character map"));
507 else if (((cnt
= BITPOS (tok_space
),
508 (ctype
->class256_collection
[space_seq
->bytes
[0]]
509 & BIT (tok_space
)) == 0)
510 || (cnt
= BITPOS (tok_blank
),
511 (ctype
->class256_collection
[space_seq
->bytes
[0]]
512 & BIT (tok_blank
)) == 0)))
515 error (0, 0, _("<SP> character not in class `%s'"),
516 valid_table
[cnt
].name
);
518 else if (((cnt
= BITPOS (tok_punct
),
519 (ctype
->class256_collection
[space_seq
->bytes
[0]]
520 & BIT (tok_punct
)) != 0)
521 || (cnt
= BITPOS (tok_graph
),
522 (ctype
->class256_collection
[space_seq
->bytes
[0]]
523 & BIT (tok_graph
)) != 0)))
526 error (0, 0, _("<SP> character must not be in class `%s'"),
527 valid_table
[cnt
].name
);
530 ctype
->class256_collection
[space_seq
->bytes
[0]] |= BIT (tok_print
);
532 /* Now that the tests are done make sure the name array contains all
533 characters which are handled in the WIDTH section of the
534 character set definition file. */
535 if (charmap
->width_rules
!= NULL
)
536 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
538 unsigned char bytes
[charmap
->mb_cur_max
];
539 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
541 /* We have the range of character for which the width is
542 specified described using byte sequences of the multibyte
543 charset. We have to convert this to UCS4 now. And we
544 cannot simply convert the beginning and the end of the
545 sequence, we have to iterate over the byte sequence and
546 convert it for every single character. */
547 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
549 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
550 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
553 /* Find the UCS value for `bytes'. */
554 uint32_t wch
= repertoire_find_value (ctype
->repertoire
, bytes
,
558 if (wch
!= ILLEGAL_CHAR_VALUE
)
559 /* We are only interested in the side-effects of the
560 `find_idx' call. It will add appropriate entries in
561 the name array if this is necessary. */
562 (void) find_idx (ctype
, NULL
, NULL
, NULL
, wch
);
564 /* "Increment" the bytes sequence. */
566 while (inner
>= 0 && bytes
[inner
] == 0xff)
571 /* We have to extend the byte sequence. */
572 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
576 memset (&bytes
[1], 0, nbytes
);
582 while (++inner
< nbytes
)
588 /* There must be a multiple of 10 digits. */
589 if (ctype
->mbdigits_act
% 10 != 0)
591 assert (ctype
->mbdigits_act
== ctype
->wcdigits_act
);
592 ctype
->wcdigits_act
-= ctype
->mbdigits_act
% 10;
593 ctype
->mbdigits_act
-= ctype
->mbdigits_act
% 10;
594 error (0, 0, _("`digit' category has not entries in groups of ten"));
597 /* Check the input digits. There must be a multiple of ten available.
598 In each group it could be that one or the other character is missing.
599 In this case the whole group must be removed. */
601 while (cnt
< ctype
->mbdigits_act
)
604 for (inner
= 0; inner
< 10; ++inner
)
605 if (ctype
->mbdigits
[cnt
+ inner
] == NULL
)
612 /* Remove the group. */
613 memmove (&ctype
->mbdigits
[cnt
], &ctype
->mbdigits
[cnt
+ 10],
614 ((ctype
->wcdigits_act
- cnt
- 10)
615 * sizeof (ctype
->mbdigits
[0])));
616 ctype
->mbdigits_act
-= 10;
620 /* If no input digits are given use the default. */
621 if (ctype
->mbdigits_act
== 0)
623 if (ctype
->mbdigits_max
== 0)
625 ctype
->mbdigits
= obstack_alloc (&charmap
->mem_pool
,
626 10 * sizeof (struct charseq
*));
627 ctype
->mbdigits_max
= 10;
630 for (cnt
= 0; cnt
< 10; ++cnt
)
632 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
634 if (ctype
->mbdigits
[cnt
] == NULL
)
636 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
638 strlen (longnames
[cnt
]));
639 if (ctype
->mbdigits
[cnt
] == NULL
)
641 /* Hum, this ain't good. */
643 no input digits defined and none of the standard names in the charmap"));
645 ctype
->mbdigits
[cnt
] = obstack_alloc (&charmap
->mem_pool
,
646 sizeof (struct charseq
) + 1);
648 /* This is better than nothing. */
649 ctype
->mbdigits
[cnt
]->bytes
[0] = digits
[cnt
];
650 ctype
->mbdigits
[cnt
]->nbytes
= 1;
655 ctype
->mbdigits_act
= 10;
658 /* Check the wide character input digits. There must be a multiple
659 of ten available. In each group it could be that one or the other
660 character is missing. In this case the whole group must be
663 while (cnt
< ctype
->wcdigits_act
)
666 for (inner
= 0; inner
< 10; ++inner
)
667 if (ctype
->wcdigits
[cnt
+ inner
] == ILLEGAL_CHAR_VALUE
)
674 /* Remove the group. */
675 memmove (&ctype
->wcdigits
[cnt
], &ctype
->wcdigits
[cnt
+ 10],
676 ((ctype
->wcdigits_act
- cnt
- 10)
677 * sizeof (ctype
->wcdigits
[0])));
678 ctype
->wcdigits_act
-= 10;
682 /* If no input digits are given use the default. */
683 if (ctype
->wcdigits_act
== 0)
685 if (ctype
->wcdigits_max
== 0)
687 ctype
->wcdigits
= obstack_alloc (&charmap
->mem_pool
,
688 10 * sizeof (uint32_t));
689 ctype
->wcdigits_max
= 10;
692 for (cnt
= 0; cnt
< 10; ++cnt
)
693 ctype
->wcdigits
[cnt
] = L
'0' + cnt
;
695 ctype
->mbdigits_act
= 10;
698 /* Check the outdigits. */
700 for (cnt
= 0; cnt
< 10; ++cnt
)
701 if (ctype
->mboutdigits
[cnt
] == NULL
)
703 static struct charseq replace
[2];
708 not all characters used in `outdigit' are available in the charmap"));
712 replace
[0].nbytes
= 1;
713 replace
[0].bytes
[0] = '?';
714 replace
[0].bytes
[1] = '\0';
715 ctype
->mboutdigits
[cnt
] = &replace
[0];
719 for (cnt
= 0; cnt
< 10; ++cnt
)
720 if (ctype
->wcoutdigits
[cnt
] == 0)
725 not all characters used in `outdigit' are available in the repertoire"));
729 ctype
->wcoutdigits
[cnt
] = L
'?';
735 ctype_output (struct localedef_t
*locale
, struct charmap_t
*charmap
,
736 const char *output_path
)
738 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
739 const size_t nelems
= (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)
740 + (ctype
->map_collection_nr
- 2));
741 struct iovec iov
[2 + nelems
+ ctype
->nr_charclass
742 + ctype
->map_collection_nr
];
743 struct locale_file data
;
744 uint32_t idx
[nelems
+ 1];
745 size_t elem
, cnt
, offset
, total
;
748 /* Now prepare the output: Find the sizes of the table we can use. */
749 allocate_arrays (ctype
, charmap
, ctype
->repertoire
);
751 data
.magic
= LIMAGIC (LC_CTYPE
);
753 iov
[0].iov_base
= (void *) &data
;
754 iov
[0].iov_len
= sizeof (data
);
756 iov
[1].iov_base
= (void *) idx
;
757 iov
[1].iov_len
= sizeof (idx
);
759 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
762 for (elem
= 0; elem
< nelems
; ++elem
)
764 if (elem
< _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
))
767 #define CTYPE_DATA(name, base, len) \
768 case _NL_ITEM_INDEX (name): \
769 iov[2 + elem + offset].iov_base = (base); \
770 iov[2 + elem + offset].iov_len = (len); \
771 if (elem + 1 < nelems) \
772 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
775 CTYPE_DATA (_NL_CTYPE_CLASS
,
777 (256 + 128) * sizeof (char_class_t
));
779 CTYPE_DATA (_NL_CTYPE_TOUPPER
,
781 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
782 * sizeof (uint32_t));
783 CTYPE_DATA (_NL_CTYPE_TOLOWER
,
785 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
786 * sizeof (uint32_t));
788 CTYPE_DATA (_NL_CTYPE_CLASS32
,
790 (ctype
->plane_size
* ctype
->plane_cnt
791 * sizeof (char_class32_t
)));
793 CTYPE_DATA (_NL_CTYPE_NAMES
,
794 ctype
->names
, (ctype
->plane_size
* ctype
->plane_cnt
795 * sizeof (uint32_t)));
797 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE
,
798 &ctype
->translit_hash_size
, sizeof (uint32_t));
799 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS
,
800 &ctype
->translit_hash_layers
, sizeof (uint32_t));
802 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX
,
803 ctype
->translit_from_idx
,
804 ctype
->translit_idx_size
);
806 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL
,
807 ctype
->translit_from_tbl
,
808 ctype
->translit_from_tbl_size
);
810 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX
,
811 ctype
->translit_to_idx
,
812 ctype
->translit_idx_size
);
814 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL
,
815 ctype
->translit_to_tbl
, ctype
->translit_to_tbl_size
);
817 CTYPE_DATA (_NL_CTYPE_HASH_SIZE
,
818 &ctype
->plane_size
, sizeof (uint32_t));
819 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS
,
820 &ctype
->plane_cnt
, sizeof (uint32_t));
822 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
823 /* The class name array. */
825 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
827 iov
[2 + elem
+ offset
].iov_base
828 = (void *) ctype
->classnames
[cnt
];
829 iov
[2 + elem
+ offset
].iov_len
830 = strlen (ctype
->classnames
[cnt
]) + 1;
831 total
+= iov
[2 + elem
+ offset
].iov_len
;
833 iov
[2 + elem
+ offset
].iov_base
= (void *) "\0\0\0";
834 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
835 total
+= 1 + (4 - ((total
+ 1) % 4));
837 idx
[elem
+ 1] = idx
[elem
] + total
;
840 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
841 /* The map name array. */
843 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
845 iov
[2 + elem
+ offset
].iov_base
846 = (void *) ctype
->mapnames
[cnt
];
847 iov
[2 + elem
+ offset
].iov_len
848 = strlen (ctype
->mapnames
[cnt
]) + 1;
849 total
+= iov
[2 + elem
+ offset
].iov_len
;
851 iov
[2 + elem
+ offset
].iov_base
= (void *) "\0\0\0";
852 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
853 total
+= 1 + (4 - ((total
+ 1) % 4));
855 idx
[elem
+ 1] = idx
[elem
] + total
;
858 CTYPE_DATA (_NL_CTYPE_WIDTH
,
859 ctype
->width
, ctype
->plane_size
* ctype
->plane_cnt
);
861 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
862 &ctype
->mb_cur_max
, sizeof (uint32_t));
864 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
865 total
= strlen (ctype
->codeset_name
) + 1;
867 iov
[2 + elem
+ offset
].iov_base
= (char *) ctype
->codeset_name
;
870 iov
[2 + elem
+ offset
].iov_base
= alloca ((total
+ 3) & ~3);
871 memset (mempcpy (iov
[2 + elem
+ offset
].iov_base
,
872 ctype
->codeset_name
, total
),
873 '\0', 4 - (total
& 3));
874 total
= (total
+ 3) & ~3;
876 iov
[2 + elem
+ offset
].iov_len
= total
;
877 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
880 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN
):
881 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
882 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
883 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
884 ctype
->mbdigits_act
/ 10;
885 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
888 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN
):
889 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
890 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
891 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
892 ctype
->wcdigits_act
/ 10;
893 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
896 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB
):
897 /* Compute the length of all possible characters. For INDIGITS
898 there might be more than one. We simply concatenate all of
899 them with a NUL byte following. The NUL byte wouldn't be
900 necessary but it makes it easier for the user. */
902 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_MB
;
903 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
904 total
+= ctype
->mbdigits
[cnt
]->nbytes
+ 1;
905 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
906 iov
[2 + elem
+ offset
].iov_len
= total
;
908 cp
= iov
[2 + elem
+ offset
].iov_base
;
909 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_MB
;
910 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
912 cp
= mempcpy (cp
, ctype
->mbdigits
[cnt
]->bytes
,
913 ctype
->mbdigits
[cnt
]->nbytes
);
916 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
919 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB
):
920 /* Compute the length of all possible characters. For INDIGITS
921 there might be more than one. We simply concatenate all of
922 them with a NUL byte following. The NUL byte wouldn't be
923 necessary but it makes it easier for the user. */
924 cnt
= elem
- _NL_CTYPE_OUTDIGIT0_MB
;
925 total
= ctype
->mboutdigits
[cnt
]->nbytes
+ 1;
926 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
927 iov
[2 + elem
+ offset
].iov_len
= total
;
929 *(char *) mempcpy (iov
[2 + elem
+ offset
].iov_base
,
930 ctype
->mbdigits
[cnt
]->bytes
,
931 ctype
->mbdigits
[cnt
]->nbytes
) = '\0';
932 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
935 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC
):
936 total
= ctype
->wcdigits_act
/ 10;
938 iov
[2 + elem
+ offset
].iov_base
=
939 (uint32_t *) alloca (total
* sizeof (uint32_t));
940 iov
[2 + elem
+ offset
].iov_len
= total
* sizeof (uint32_t);
942 for (cnt
= elem
- _NL_CTYPE_INDIGITS0_WC
;
943 cnt
< ctype
->wcdigits_act
; cnt
+= 10)
944 ((uint32_t *) iov
[2 + elem
+ offset
].iov_base
)[cnt
/ 10]
945 = ctype
->wcdigits
[cnt
];
946 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
949 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC
):
950 cnt
= elem
- _NL_CTYPE_OUTDIGIT0_WC
;
951 iov
[2 + elem
+ offset
].iov_base
= &ctype
->wcoutdigits
[cnt
];
952 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
953 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
957 assert (! "unknown CTYPE element");
961 /* Handle extra maps. */
962 size_t nr
= (elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) + 2;
964 iov
[2 + elem
+ offset
].iov_base
= ctype
->map
[nr
];
965 iov
[2 + elem
+ offset
].iov_len
= ((ctype
->plane_size
966 * ctype
->plane_cnt
+ 128)
967 * sizeof (uint32_t));
969 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
973 assert (2 + elem
+ offset
== (nelems
+ ctype
->nr_charclass
974 + ctype
->map_collection_nr
+ 2));
976 write_locale_data (output_path
, "LC_CTYPE", 2 + elem
+ offset
, iov
);
980 /* Local functions. */
982 ctype_class_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
987 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
988 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
991 if (cnt
< ctype
->nr_charclass
)
993 lr_error (lr
, _("character class `%s' already defined"), name
);
997 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
998 /* Exit code 2 is prescribed in P1003.2b. */
1000 implementation limit: no more than %d character classes allowed"),
1003 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
1008 ctype_map_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1009 const char *name
, struct charmap_t
*charmap
)
1011 size_t max_chars
= 0;
1014 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1016 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
1019 if (max_chars
< ctype
->map_collection_max
[cnt
])
1020 max_chars
= ctype
->map_collection_max
[cnt
];
1023 if (cnt
< ctype
->map_collection_nr
)
1025 lr_error (lr
, _("character map `%s' already defined"), name
);
1029 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
1030 /* Exit code 2 is prescribed in P1003.2b. */
1032 implementation limit: no more than %d character maps allowed"),
1035 ctype
->mapnames
[cnt
] = name
;
1038 ctype
->map_collection_max
[cnt
] = charmap
->mb_cur_max
== 1 ? 256 : 512;
1040 ctype
->map_collection_max
[cnt
] = max_chars
;
1042 ctype
->map_collection
[cnt
] = (uint32_t *)
1043 xmalloc (sizeof (uint32_t) * ctype
->map_collection_max
[cnt
]);
1044 memset (ctype
->map_collection
[cnt
], '\0',
1045 sizeof (uint32_t) * ctype
->map_collection_max
[cnt
]);
1046 ctype
->map_collection_act
[cnt
] = 256;
1048 ++ctype
->map_collection_nr
;
1052 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1053 is possible if we only want to extend the name array. */
1055 find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
, size_t *max
,
1056 size_t *act
, uint32_t idx
)
1061 return table
== NULL
? NULL
: &(*table
)[idx
];
1063 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
1064 if (ctype
->charnames
[cnt
] == idx
)
1067 /* We have to distinguish two cases: the name is found or not. */
1068 if (cnt
== ctype
->charnames_act
)
1070 /* Extend the name array. */
1071 if (ctype
->charnames_act
== ctype
->charnames_max
)
1073 ctype
->charnames_max
*= 2;
1074 ctype
->charnames
= (unsigned int *)
1075 xrealloc (ctype
->charnames
,
1076 sizeof (unsigned int) * ctype
->charnames_max
);
1078 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
1082 /* We have done everything we are asked to do. */
1089 size_t old_max
= *max
;
1092 while (*max
<= cnt
);
1095 (uint32_t *) xrealloc (*table
, *max
* sizeof (unsigned long int));
1096 memset (&(*table
)[old_max
], '\0',
1097 (*max
- old_max
) * sizeof (uint32_t));
1103 return &(*table
)[cnt
];
1108 get_character (struct token
*now
, struct charmap_t
*charmap
,
1109 struct repertoire_t
*repertoire
,
1110 struct charseq
**seqp
, uint32_t *wchp
)
1112 if (now
->tok
== tok_bsymbol
)
1114 /* This will hopefully be the normal case. */
1115 *wchp
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1116 now
->val
.str
.lenmb
);
1117 *seqp
= charmap_find_value (charmap
, now
->val
.str
.startmb
,
1118 now
->val
.str
.lenmb
);
1120 else if (now
->tok
== tok_ucs4
)
1122 *seqp
= repertoire_find_seq (repertoire
, now
->val
.ucs4
);
1126 /* Compute the value in the charmap from the UCS value. */
1127 const char *symbol
= repertoire_find_symbol (repertoire
,
1133 *seqp
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1137 /* Insert a negative entry. */
1138 static const struct charseq negative
1139 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1140 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
, 4);
1141 *newp
= now
->val
.ucs4
;
1143 insert_entry (&repertoire
->seq_table
, newp
, 4,
1144 (void *) &negative
);
1147 (*seqp
)->ucs4
= now
->val
.ucs4
;
1149 else if ((*seqp
)->ucs4
!= now
->val
.ucs4
)
1152 *wchp
= now
->val
.ucs4
;
1154 else if (now
->tok
== tok_charcode
)
1156 /* We must map from the byte code to UCS4. */
1157 *seqp
= charmap_find_symbol (charmap
, now
->val
.str
.startmb
,
1158 now
->val
.str
.lenmb
);
1161 *wchp
= ILLEGAL_CHAR_VALUE
;
1164 if ((*seqp
)->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1165 (*seqp
)->ucs4
= repertoire_find_value (repertoire
, (*seqp
)->name
,
1166 strlen ((*seqp
)->name
));
1167 *wchp
= (*seqp
)->ucs4
;
1177 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1179 charclass_symbolic_ellipsis (struct linereader
*ldfile
,
1180 struct locale_ctype_t
*ctype
,
1181 struct charmap_t
*charmap
,
1182 struct repertoire_t
*repertoire
,
1184 const char *last_str
,
1185 unsigned long int class256_bit
,
1186 unsigned long int class_bit
, int base
,
1187 int ignore_content
, int handle_digits
)
1189 const char *nowstr
= now
->val
.str
.startmb
;
1190 char tmp
[now
->val
.str
.lenmb
+ 1];
1193 unsigned long int from
;
1194 unsigned long int to
;
1196 /* We have to compute the ellipsis values using the symbolic names. */
1197 assert (last_str
!= NULL
);
1199 if (strlen (last_str
) != now
->val
.str
.lenmb
)
1203 _("`%s' and `%.*s' are no valid names for symbolic range"),
1204 last_str
, now
->val
.str
.lenmb
, nowstr
);
1208 if (memcmp (last_str
, nowstr
, now
->val
.str
.lenmb
) == 0)
1209 /* Nothing to do, the names are the same. */
1212 for (cp
= last_str
; *cp
== *(nowstr
+ (cp
- last_str
)); ++cp
)
1216 from
= strtoul (cp
, &endp
, base
);
1217 if ((from
== UINT_MAX
&& errno
== ERANGE
) || *endp
!= '\0')
1220 to
= strtoul (nowstr
+ (cp
- last_str
), &endp
, base
);
1221 if ((to
== UINT_MAX
&& errno
== ERANGE
)
1222 || (endp
- nowstr
) != now
->val
.str
.lenmb
|| from
>= to
)
1225 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1226 if (!ignore_content
)
1228 now
->val
.str
.startmb
= tmp
;
1229 while (++from
<= to
)
1231 struct charseq
*seq
;
1234 sprintf (tmp
, (base
== 10 ? "%.*s%0*d" : "%.*s%0*X"), cp
- last_str
,
1235 last_str
, now
->val
.str
.lenmb
- (cp
- last_str
), from
);
1237 get_character (now
, charmap
, repertoire
, &seq
, &wch
);
1239 if (seq
!= NULL
&& seq
->nbytes
== 1)
1240 /* Yep, we can store information about this byte sequence. */
1241 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1243 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1244 /* We have the UCS4 position. */
1245 *find_idx (ctype
, &ctype
->class_collection
,
1246 &ctype
->class_collection_max
,
1247 &ctype
->class_collection_act
, wch
) |= class_bit
;
1249 if (handle_digits
== 1)
1251 /* We must store the digit values. */
1252 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1254 ctype
->mbdigits_max
*= 2;
1255 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1256 (ctype
->mbdigits_max
1257 * sizeof (char *)));
1258 ctype
->wcdigits_max
*= 2;
1259 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1260 (ctype
->wcdigits_max
1261 * sizeof (uint32_t)));
1264 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1265 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1267 else if (handle_digits
== 2)
1269 /* We must store the digit values. */
1270 if (ctype
->outdigits_act
>= 10)
1272 lr_error (ldfile
, _("\
1273 %s: field `%s' does not contain exactly ten entries"),
1274 "LC_CTYPE", "outdigit");
1278 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1279 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1280 ++ctype
->outdigits_act
;
1287 /* Ellipsis like in `<U1234>..<U2345>'. */
1289 charclass_ucs4_ellipsis (struct linereader
*ldfile
,
1290 struct locale_ctype_t
*ctype
,
1291 struct charmap_t
*charmap
,
1292 struct repertoire_t
*repertoire
,
1293 struct token
*now
, uint32_t last_wch
,
1294 unsigned long int class256_bit
,
1295 unsigned long int class_bit
, int ignore_content
,
1298 if (last_wch
> now
->val
.ucs4
)
1300 lr_error (ldfile
, _("\
1301 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1302 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, now
->val
.ucs4
,
1303 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, last_wch
);
1307 if (!ignore_content
)
1308 while (++last_wch
<= now
->val
.ucs4
)
1310 /* We have to find out whether there is a byte sequence corresponding
1311 to this UCS4 value. */
1312 struct charseq
*seq
= repertoire_find_seq (repertoire
, last_wch
);
1314 /* If this is the first time we look for this sequence create a new
1318 /* Find the symbolic name for this UCS4 value. */
1319 const char *symbol
= repertoire_find_symbol (repertoire
, last_wch
);
1320 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
, 4);
1324 /* We have a name, now search the multibyte value. */
1325 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1329 /* We have to create a fake entry. */
1330 static const struct charseq negative
1331 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1332 seq
= (struct charseq
*) &negative
;
1335 seq
->ucs4
= last_wch
;
1337 insert_entry (&repertoire
->seq_table
, newp
, 4, seq
);
1340 /* We have a name, now search the multibyte value. */
1341 if (seq
->ucs4
== last_wch
&& seq
->nbytes
== 1)
1342 /* Yep, we can store information about this byte sequence. */
1343 ctype
->class256_collection
[(size_t) seq
->bytes
[0]]
1346 /* And of course we have the UCS4 position. */
1347 if (class_bit
!= 0 && class_bit
!= 0)
1348 *find_idx (ctype
, &ctype
->class_collection
,
1349 &ctype
->class_collection_max
,
1350 &ctype
->class_collection_act
, last_wch
) |= class_bit
;
1352 if (handle_digits
== 1)
1354 /* We must store the digit values. */
1355 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1357 ctype
->mbdigits_max
*= 2;
1358 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1359 (ctype
->mbdigits_max
1360 * sizeof (char *)));
1361 ctype
->wcdigits_max
*= 2;
1362 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1363 (ctype
->wcdigits_max
1364 * sizeof (uint32_t)));
1367 ctype
->mbdigits
[ctype
->mbdigits_act
++] = (seq
->ucs4
== last_wch
1369 ctype
->wcdigits
[ctype
->wcdigits_act
++] = last_wch
;
1371 else if (handle_digits
== 2)
1373 /* We must store the digit values. */
1374 if (ctype
->outdigits_act
>= 10)
1376 lr_error (ldfile
, _("\
1377 %s: field `%s' does not contain exactly ten entries"),
1378 "LC_CTYPE", "outdigit");
1382 ctype
->mboutdigits
[ctype
->outdigits_act
] = (seq
->ucs4
== last_wch
1384 ctype
->wcoutdigits
[ctype
->outdigits_act
] = last_wch
;
1385 ++ctype
->outdigits_act
;
1391 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1393 charclass_charcode_ellipsis (struct linereader
*ldfile
,
1394 struct locale_ctype_t
*ctype
,
1395 struct charmap_t
*charmap
,
1396 struct repertoire_t
*repertoire
,
1397 struct token
*now
, char *last_charcode
,
1398 uint32_t last_charcode_len
,
1399 unsigned long int class256_bit
,
1400 unsigned long int class_bit
, int ignore_content
,
1403 /* First check whether the to-value is larger. */
1404 if (now
->val
.charcode
.nbytes
!= last_charcode_len
)
1406 lr_error (ldfile
, _("\
1407 start end end character sequence of range must have the same length"));
1411 if (memcmp (last_charcode
, now
->val
.charcode
.bytes
, last_charcode_len
) > 0)
1413 lr_error (ldfile
, _("\
1414 to-value character sequence is smaller than from-value sequence"));
1418 if (!ignore_content
)
1422 /* Increment the byte sequence value. */
1423 struct charseq
*seq
;
1427 for (i
= last_charcode_len
- 1; i
>= 0; --i
)
1428 if (++last_charcode
[i
] != 0)
1431 if (last_charcode_len
== 1)
1432 /* Of course we have the charcode value. */
1433 ctype
->class256_collection
[(size_t) last_charcode
[0]]
1436 /* Find the symbolic name. */
1437 seq
= charmap_find_symbol (charmap
, last_charcode
,
1441 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1442 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1443 strlen (seq
->name
));
1446 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1447 *find_idx (ctype
, &ctype
->class_collection
,
1448 &ctype
->class_collection_max
,
1449 &ctype
->class_collection_act
, wch
) |= class_bit
;
1452 wch
= ILLEGAL_CHAR_VALUE
;
1454 if (handle_digits
== 1)
1456 /* We must store the digit values. */
1457 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1459 ctype
->mbdigits_max
*= 2;
1460 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1461 (ctype
->mbdigits_max
1462 * sizeof (char *)));
1463 ctype
->wcdigits_max
*= 2;
1464 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1465 (ctype
->wcdigits_max
1466 * sizeof (uint32_t)));
1469 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1470 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1471 seq
->nbytes
= last_charcode_len
;
1473 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1474 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1476 else if (handle_digits
== 2)
1478 struct charseq
*seq
;
1479 /* We must store the digit values. */
1480 if (ctype
->outdigits_act
>= 10)
1482 lr_error (ldfile
, _("\
1483 %s: field `%s' does not contain exactly ten entries"),
1484 "LC_CTYPE", "outdigit");
1488 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1489 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1490 seq
->nbytes
= last_charcode_len
;
1492 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1493 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1494 ++ctype
->outdigits_act
;
1497 while (memcmp (last_charcode
, now
->val
.charcode
.bytes
,
1498 last_charcode_len
) != 0);
1503 /* Read one transliteration entry. */
1505 read_widestring (struct linereader
*ldfile
, struct token
*now
,
1506 struct charmap_t
*charmap
, struct repertoire_t
*repertoire
)
1510 if (now
->tok
== tok_default_missing
)
1511 /* The special name "" will denote this case. */
1512 wstr
= (uint32_t *) L
"";
1513 else if (now
->tok
== tok_bsymbol
)
1515 /* Get the value from the repertoire. */
1516 wstr
= xmalloc (2 * sizeof (uint32_t));
1517 wstr
[0] = repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1518 now
->val
.str
.lenmb
);
1519 if (wstr
[0] == ILLEGAL_CHAR_VALUE
)
1520 /* We cannot proceed, we don't know the UCS4 value. */
1525 else if (now
->tok
== tok_ucs4
)
1527 wstr
= xmalloc (2 * sizeof (uint32_t));
1528 wstr
[0] = now
->val
.ucs4
;
1531 else if (now
->tok
== tok_charcode
)
1533 /* Argh, we have to convert to the symbol name first and then to the
1535 struct charseq
*seq
= charmap_find_symbol (charmap
,
1536 now
->val
.str
.startmb
,
1537 now
->val
.str
.lenmb
);
1539 /* Cannot find the UCS4 value. */
1542 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1543 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1544 strlen (seq
->name
));
1545 if (seq
->ucs4
== ILLEGAL_CHAR_VALUE
)
1546 /* We cannot proceed, we don't know the UCS4 value. */
1549 wstr
= xmalloc (2 * sizeof (uint32_t));
1550 wstr
[0] = seq
->ucs4
;
1553 else if (now
->tok
== tok_string
)
1555 wstr
= now
->val
.str
.startwc
;
1561 if (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1562 lr_ignore_rest (ldfile
, 0);
1563 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1564 return (uint32_t *) -1l;
1572 read_translit_entry (struct linereader
*ldfile
, struct locale_ctype_t
*ctype
,
1573 struct token
*now
, struct charmap_t
*charmap
,
1574 struct repertoire_t
*repertoire
)
1576 uint32_t *from_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1577 struct translit_t
*result
;
1578 struct translit_to_t
**top
;
1579 struct obstack
*ob
= &ctype
->mem_pool
;
1583 if (from_wstr
== NULL
)
1584 /* There is no valid from string. */
1587 result
= (struct translit_t
*) obstack_alloc (ob
,
1588 sizeof (struct translit_t
));
1589 result
->from
= from_wstr
;
1590 result
->next
= NULL
;
1600 /* Next we have one or more transliterations. They are
1601 separated by semicolons. */
1602 now
= lr_token (ldfile
, charmap
, repertoire
);
1604 if (!first
&& (now
->tok
== tok_semicolon
|| now
->tok
== tok_eol
))
1606 /* One string read. */
1607 const uint32_t zero
= 0;
1611 obstack_grow (ob
, &zero
, 4);
1612 to_wstr
= obstack_finish (ob
);
1614 *top
= obstack_alloc (ob
, sizeof (struct translit_to_t
));
1615 (*top
)->str
= to_wstr
;
1616 (*top
)->next
= NULL
;
1619 if (now
->tok
== tok_eol
)
1621 result
->next
= ctype
->translit
;
1622 ctype
->translit
= result
;
1627 top
= &(*top
)->next
;
1632 to_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1633 if (to_wstr
== (uint32_t *) -1l)
1635 /* An error occurred. */
1636 obstack_free (ob
, result
);
1640 if (to_wstr
== NULL
)
1643 /* This value is usable. */
1644 obstack_grow (ob
, to_wstr
, wcslen ((wchar_t *) to_wstr
) * 4);
1652 /* The parser for the LC_CTYPE section of the locale definition. */
1654 ctype_read (struct linereader
*ldfile
, struct localedef_t
*result
,
1655 struct charmap_t
*charmap
, const char *repertoire_name
,
1658 struct repertoire_t
*repertoire
= NULL
;
1659 struct locale_ctype_t
*ctype
;
1661 enum token_t nowtok
;
1663 struct charseq
*last_seq
;
1664 uint32_t last_wch
= 0;
1665 enum token_t last_token
;
1666 enum token_t ellipsis_token
;
1667 char last_charcode
[16];
1668 size_t last_charcode_len
= 0;
1669 const char *last_str
= NULL
;
1672 /* Get the repertoire we have to use. */
1673 if (repertoire_name
!= NULL
)
1674 repertoire
= repertoire_read (repertoire_name
);
1676 /* The rest of the line containing `LC_CTYPE' must be free. */
1677 lr_ignore_rest (ldfile
, 1);
1682 now
= lr_token (ldfile
, charmap
, NULL
);
1685 while (nowtok
== tok_eol
);
1687 /* If we see `copy' now we are almost done. */
1688 if (nowtok
== tok_copy
)
1690 handle_copy (ldfile
, charmap
, repertoire
, result
, tok_lc_ctype
, LC_CTYPE
,
1691 "LC_CTYPE", ignore_content
);
1695 /* Prepare the data structures. */
1696 ctype_startup (ldfile
, result
, charmap
, ignore_content
);
1697 ctype
= result
->categories
[LC_CTYPE
].ctype
;
1699 /* Remember the repertoire we use. */
1700 if (!ignore_content
)
1701 ctype
->repertoire
= repertoire
;
1705 unsigned long int class_bit
= 0;
1706 unsigned long int class256_bit
= 0;
1707 int handle_digits
= 0;
1709 /* Of course we don't proceed beyond the end of file. */
1710 if (nowtok
== tok_eof
)
1713 /* Ignore empty lines. */
1714 if (nowtok
== tok_eol
)
1716 now
= lr_token (ldfile
, charmap
, NULL
);
1724 now
= lr_token (ldfile
, charmap
, NULL
);
1725 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1727 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
1728 now
= lr_token (ldfile
, charmap
, NULL
);
1729 if (now
->tok
!= tok_semicolon
)
1731 now
= lr_token (ldfile
, charmap
, NULL
);
1733 if (now
->tok
!= tok_eol
)
1735 %s: syntax error in definition of new character class"), "LC_CTYPE");
1739 now
= lr_token (ldfile
, charmap
, NULL
);
1740 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1742 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
1743 now
= lr_token (ldfile
, charmap
, NULL
);
1744 if (now
->tok
!= tok_semicolon
)
1746 now
= lr_token (ldfile
, charmap
, NULL
);
1748 if (now
->tok
!= tok_eol
)
1750 %s: syntax error in definition of new character map"), "LC_CTYPE");
1754 /* Ignore the rest of the line if we don't need the input of this line. */
1758 lr_ignore_rest (ldfile
, 0);
1762 /* We simply forget the `class' keyword and use the following
1763 operand to determine the bit. */
1764 now
= lr_token (ldfile
, charmap
, NULL
);
1765 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
1767 /* Must be one of the predefined class names. */
1768 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1769 if (strcmp (ctype
->classnames
[cnt
], now
->val
.str
.startmb
) == 0)
1771 if (cnt
>= ctype
->nr_charclass
)
1773 #ifdef PREDEFINED_CLASSES
1774 if (now
->val
.str
.lenmb
== 8
1775 && memcmp ("special1", now
->val
.str
.startmb
, 8) == 0)
1776 class_bit
= _ISwspecial1
;
1777 else if (now
->val
.str
.lenmb
== 8
1778 && memcmp ("special2", now
->val
.str
.startmb
, 8) == 0)
1779 class_bit
= _ISwspecial2
;
1780 else if (now
->val
.str
.lenmb
== 8
1781 && memcmp ("special3", now
->val
.str
.startmb
, 8) == 0)
1782 class_bit
= _ISwspecial3
;
1786 lr_error (ldfile
, _("\
1787 unknown character class `%s' in category `LC_CTYPE'"),
1788 now
->val
.str
.startmb
);
1789 free (now
->val
.str
.startmb
);
1791 lr_ignore_rest (ldfile
, 0);
1796 class_bit
= _ISwbit (cnt
);
1798 free (now
->val
.str
.startmb
);
1800 else if (now
->tok
== tok_digit
)
1801 goto handle_tok_digit
;
1802 else if (now
->tok
< tok_upper
|| now
->tok
> tok_blank
)
1806 class_bit
= BITw (now
->tok
);
1807 class256_bit
= BIT (now
->tok
);
1810 /* The next character must be a semicolon. */
1811 now
= lr_token (ldfile
, charmap
, NULL
);
1812 if (now
->tok
!= tok_semicolon
)
1814 goto read_charclass
;
1827 /* Ignore the rest of the line if we don't need the input of this line. */
1831 lr_ignore_rest (ldfile
, 0);
1835 class_bit
= BITw (now
->tok
);
1836 class256_bit
= BIT (now
->tok
);
1839 ctype
->class_done
|= class_bit
;
1840 last_token
= tok_none
;
1841 ellipsis_token
= tok_none
;
1842 now
= lr_token (ldfile
, charmap
, NULL
);
1843 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1846 struct charseq
*seq
;
1848 if (ellipsis_token
== tok_none
)
1850 if (get_character (now
, charmap
, repertoire
, &seq
, &wch
))
1853 if (!ignore_content
&& seq
!= NULL
&& seq
->nbytes
== 1)
1854 /* Yep, we can store information about this byte sequence. */
1856 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1858 if (!ignore_content
&& wch
!= ILLEGAL_CHAR_VALUE
1860 /* We have the UCS4 position. */
1861 *find_idx (ctype
, &ctype
->class_collection
,
1862 &ctype
->class_collection_max
,
1863 &ctype
->class_collection_act
, wch
) |= class_bit
;
1865 last_token
= now
->tok
;
1866 /* Terminate the string. */
1867 if (last_token
== tok_bsymbol
)
1869 now
->val
.str
.startmb
[now
->val
.str
.lenmb
] = '\0';
1870 last_str
= now
->val
.str
.startmb
;
1876 memcpy (last_charcode
, now
->val
.charcode
.bytes
, 16);
1877 last_charcode_len
= now
->val
.charcode
.nbytes
;
1879 if (!ignore_content
&& handle_digits
== 1)
1881 /* We must store the digit values. */
1882 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1884 ctype
->mbdigits_max
+= 10;
1885 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1886 (ctype
->mbdigits_max
1887 * sizeof (char *)));
1888 ctype
->wcdigits_max
+= 10;
1889 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1890 (ctype
->wcdigits_max
1891 * sizeof (uint32_t)));
1894 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1895 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1897 else if (!ignore_content
&& handle_digits
== 2)
1899 /* We must store the digit values. */
1900 if (ctype
->outdigits_act
>= 10)
1902 lr_error (ldfile
, _("\
1903 %s: field `%s' does not contain exactly ten entries"),
1904 "LC_CTYPE", "outdigit");
1908 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1909 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1910 ++ctype
->outdigits_act
;
1915 /* Now it gets complicated. We have to resolve the
1916 ellipsis problem. First we must distinguish between
1917 the different kind of ellipsis and this must match the
1918 tokens we have seen. */
1919 assert (last_token
!= tok_none
);
1921 if (last_token
!= now
->tok
)
1923 lr_error (ldfile
, _("\
1924 ellipsis range must be marked by two operands of same type"));
1925 lr_ignore_rest (ldfile
, 0);
1929 if (last_token
== tok_bsymbol
)
1931 if (ellipsis_token
== tok_ellipsis3
)
1932 lr_error (ldfile
, _("with symbolic name range values \
1933 the absolute ellipsis `...' must not be used"));
1935 charclass_symbolic_ellipsis (ldfile
, ctype
, charmap
,
1936 repertoire
, now
, last_str
,
1937 class256_bit
, class_bit
,
1944 else if (last_token
== tok_ucs4
)
1946 if (ellipsis_token
!= tok_ellipsis2
)
1947 lr_error (ldfile
, _("\
1948 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1950 charclass_ucs4_ellipsis (ldfile
, ctype
, charmap
,
1951 repertoire
, now
, last_wch
,
1952 class256_bit
, class_bit
,
1953 ignore_content
, handle_digits
);
1957 assert (last_token
== tok_charcode
);
1959 if (ellipsis_token
!= tok_ellipsis3
)
1960 lr_error (ldfile
, _("\
1961 with character code range values one must use the absolute ellipsis `...'"));
1963 charclass_charcode_ellipsis (ldfile
, ctype
, charmap
,
1967 class256_bit
, class_bit
,
1972 /* Now we have used the last value. */
1973 last_token
= tok_none
;
1976 /* Next we expect a semicolon or the end of the line. */
1977 now
= lr_token (ldfile
, charmap
, NULL
);
1978 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
1981 if (last_token
!= tok_none
1982 && now
->tok
>= tok_ellipsis2
&& now
->tok
<= tok_ellipsis4
)
1984 ellipsis_token
= now
->tok
;
1985 now
= lr_token (ldfile
, charmap
, NULL
);
1989 if (now
->tok
!= tok_semicolon
)
1992 /* And get the next character. */
1993 now
= lr_token (ldfile
, charmap
, NULL
);
1995 ellipsis_token
= tok_none
;
2000 /* Ignore the rest of the line if we don't need the input of this line. */
2004 lr_ignore_rest (ldfile
, 0);
2009 class_bit
= _ISwdigit
;
2010 class256_bit
= _ISdigit
;
2012 goto read_charclass
;
2015 /* Ignore the rest of the line if we don't need the input of this line. */
2019 lr_ignore_rest (ldfile
, 0);
2023 if (ctype
->outdigits_act
!= 0)
2024 lr_error (ldfile
, _("\
2025 %s: field `%s' declared more than once"),
2026 "LC_CTYPE", "outdigit");
2030 goto read_charclass
;
2033 /* Ignore the rest of the line if we don't need the input of this line. */
2037 lr_ignore_rest (ldfile
, 0);
2045 /* Ignore the rest of the line if we don't need the input of this line. */
2049 lr_ignore_rest (ldfile
, 0);
2057 /* Ignore the rest of the line if we don't need the input of this line. */
2061 lr_ignore_rest (ldfile
, 0);
2065 /* We simply forget the `map' keyword and use the following
2066 operand to determine the mapping. */
2067 now
= lr_token (ldfile
, charmap
, NULL
);
2068 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2072 for (cnt
= 2; cnt
< ctype
->map_collection_nr
; ++cnt
)
2073 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2076 if (cnt
< ctype
->map_collection_nr
)
2080 lr_error (ldfile
, _("unknown map `%s'"),
2081 now
->val
.str
.startmb
);
2082 lr_ignore_rest (ldfile
, 0);
2086 else if (now
->tok
< tok_toupper
|| now
->tok
> tok_tolower
)
2089 mapidx
= now
->tok
- tok_toupper
;
2091 now
= lr_token (ldfile
, charmap
, NULL
);
2092 /* This better should be a semicolon. */
2093 if (now
->tok
!= tok_semicolon
)
2097 /* Test whether this mapping was already defined. */
2098 if (ctype
->tomap_done
[mapidx
])
2100 lr_error (ldfile
, _("duplicated definition for mapping `%s'"),
2101 ctype
->mapnames
[mapidx
]);
2102 lr_ignore_rest (ldfile
, 0);
2105 ctype
->tomap_done
[mapidx
] = 1;
2107 now
= lr_token (ldfile
, charmap
, NULL
);
2108 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2110 struct charseq
*from_seq
;
2112 struct charseq
*to_seq
;
2115 /* Every pair starts with an opening brace. */
2116 if (now
->tok
!= tok_open_brace
)
2119 /* Next comes the from-value. */
2120 now
= lr_token (ldfile
, charmap
, NULL
);
2121 if (get_character (now
, charmap
, repertoire
, &from_seq
,
2125 /* The next is a comma. */
2126 now
= lr_token (ldfile
, charmap
, NULL
);
2127 if (now
->tok
!= tok_comma
)
2130 /* And the other value. */
2131 now
= lr_token (ldfile
, charmap
, NULL
);
2132 if (get_character (now
, charmap
, repertoire
, &to_seq
,
2136 /* And the last thing is the closing brace. */
2137 now
= lr_token (ldfile
, charmap
, NULL
);
2138 if (now
->tok
!= tok_close_brace
)
2141 if (!ignore_content
)
2143 if (mapidx
< 2 && from_seq
!= NULL
&& to_seq
!= NULL
2144 && from_seq
->nbytes
== 1 && to_seq
->nbytes
== 1)
2145 /* We can use this value. */
2146 ctype
->map256_collection
[mapidx
][from_seq
->bytes
[0]]
2149 if (from_wch
!= ILLEGAL_CHAR_VALUE
2150 && to_wch
!= ILLEGAL_CHAR_VALUE
)
2151 /* Both correct values. */
2152 *find_idx (ctype
, &ctype
->map_collection
[mapidx
],
2153 &ctype
->map_collection_max
[mapidx
],
2154 &ctype
->map_collection_act
[mapidx
],
2158 /* Now comes a semicolon or the end of the line/file. */
2159 now
= lr_token (ldfile
, charmap
, NULL
);
2160 if (now
->tok
== tok_semicolon
)
2161 now
= lr_token (ldfile
, charmap
, NULL
);
2165 case tok_translit_start
:
2166 /* Ignore the rest of the line if we don't need the input of this line. */
2170 lr_ignore_rest (ldfile
, 0);
2174 /* The rest of the line better should be empty. */
2175 lr_ignore_rest (ldfile
, 1);
2177 /* We count here the number of allocated entries in the `translit'
2181 /* We proceed until we see the `translit_end' token. */
2182 while (now
= lr_token (ldfile
, charmap
, repertoire
),
2183 now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
)
2185 if (now
->tok
== tok_eol
)
2186 /* Ignore empty lines. */
2189 if (now
->tok
== tok_translit_end
)
2191 lr_ignore_rest (ldfile
, 0);
2195 if (now
->tok
== tok_include
)
2197 /* We have to include locale. */
2198 const char *locale_name
;
2199 const char *repertoire_name
;
2201 now
= lr_token (ldfile
, charmap
, NULL
);
2202 /* This should be a string or an identifier. In any
2203 case something to name a locale. */
2204 if (now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2207 lr_error (ldfile
, _("%s: syntax error"), "LC_CTYPE");
2208 lr_ignore_rest (ldfile
, 0);
2211 locale_name
= now
->val
.str
.startmb
;
2213 /* Next should be a semicolon. */
2214 now
= lr_token (ldfile
, charmap
, NULL
);
2215 if (now
->tok
!= tok_semicolon
)
2216 goto translit_syntax
;
2218 /* Now the repertoire name. */
2219 now
= lr_token (ldfile
, charmap
, NULL
);
2220 if ((now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2221 || now
->val
.str
.startmb
== NULL
)
2222 goto translit_syntax
;
2223 repertoire_name
= now
->val
.str
.startmb
;
2225 /* We must not have more than one `include'. */
2226 if (ctype
->translit_copy_locale
!= NULL
)
2228 lr_error (ldfile
, _("\
2229 %s: only one `include' instruction allowed"), "LC_CTYPE");
2230 lr_ignore_rest (ldfile
, 0);
2234 ctype
->translit_copy_locale
= locale_name
;
2235 ctype
->translit_copy_repertoire
= repertoire_name
;
2237 /* The rest of the line must be empty. */
2238 lr_ignore_rest (ldfile
, 1);
2242 read_translit_entry (ldfile
, ctype
, now
, charmap
, repertoire
);
2247 /* Ignore the rest of the line if we don't need the input of this line. */
2251 lr_ignore_rest (ldfile
, 0);
2255 /* This could mean one of several things. First test whether
2256 it's a character class name. */
2257 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2258 if (strcmp (now
->val
.str
.startmb
, ctype
->classnames
[cnt
]) == 0)
2260 if (cnt
< ctype
->nr_charclass
)
2262 class_bit
= _ISwbit (cnt
);
2263 class256_bit
= cnt
<= 11 ? _ISbit (cnt
) : 0;
2264 free (now
->val
.str
.startmb
);
2265 goto read_charclass
;
2267 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
2268 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2270 if (cnt
< ctype
->map_collection_nr
)
2273 free (now
->val
.str
.startmb
);
2276 #ifdef PREDEFINED_CLASSES
2277 if (strcmp (now
->val
.str
.startmb
, "special1") == 0)
2279 class_bit
= _ISwspecial1
;
2280 free (now
->val
.str
.startmb
);
2281 goto read_charclass
;
2283 if (strcmp (now
->val
.str
.startmb
, "special2") == 0)
2285 class_bit
= _ISwspecial2
;
2286 free (now
->val
.str
.startmb
);
2287 goto read_charclass
;
2289 if (strcmp (now
->val
.str
.startmb
, "special3") == 0)
2291 class_bit
= _ISwspecial3
;
2292 free (now
->val
.str
.startmb
);
2293 goto read_charclass
;
2295 if (strcmp (now
->val
.str
.startmb
, "tosymmetric") == 0)
2304 /* Next we assume `LC_CTYPE'. */
2305 now
= lr_token (ldfile
, charmap
, NULL
);
2306 if (now
->tok
== tok_eof
)
2308 if (now
->tok
== tok_eol
)
2309 lr_error (ldfile
, _("%s: incomplete `END' line"),
2311 else if (now
->tok
!= tok_lc_ctype
)
2312 lr_error (ldfile
, _("\
2313 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2314 lr_ignore_rest (ldfile
, now
->tok
== tok_lc_ctype
);
2319 if (now
->tok
!= tok_eof
)
2320 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2323 /* Prepare for the next round. */
2324 now
= lr_token (ldfile
, charmap
, NULL
);
2328 /* When we come here we reached the end of the file. */
2329 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2334 set_class_defaults (struct locale_ctype_t
*ctype
, struct charmap_t
*charmap
,
2335 struct repertoire_t
*repertoire
)
2339 /* This function defines the default values for the classes and conversions
2340 according to POSIX.2 2.5.2.1.
2341 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2342 Don't move them unless you know what you do! */
2344 void set_default (int bitpos
, int from
, int to
)
2348 int bit
= _ISbit (bitpos
);
2349 int bitw
= _ISwbit (bitpos
);
2350 /* Define string. */
2353 for (ch
= from
; ch
<= to
; ++ch
)
2356 struct charseq
*seq
;
2359 value
= repertoire_find_value (repertoire
, tmp
, 1);
2360 if (value
== ILLEGAL_CHAR_VALUE
)
2364 %s: character `%s' not defined in repertoire while needed as default value"),
2368 ELEM (ctype
, class_collection
, , value
) |= bitw
;
2370 seq
= charmap_find_value (charmap
, tmp
, 1);
2375 %s: character `%s' not defined in charmap while needed as default value"),
2378 else if (seq
->nbytes
!= 1)
2380 %s: character `%s' in charmap not representable with one byte"),
2383 ctype
->class256_collection
[seq
->bytes
[0]] |= bit
;
2387 /* Set default values if keyword was not present. */
2388 if ((ctype
->class_done
& BITw (tok_upper
)) == 0)
2389 /* "If this keyword [upper] is not specified, the uppercase letters
2390 `A' through `Z', ..., shall automatically belong to this class,
2391 with implementation defined character values." [P1003.2, 2.5.2.1] */
2392 set_default (BITPOS (tok_upper
), 'A', 'Z');
2394 if ((ctype
->class_done
& BITw (tok_lower
)) == 0)
2395 /* "If this keyword [lower] is not specified, the lowercase letters
2396 `a' through `z', ..., shall automatically belong to this class,
2397 with implementation defined character values." [P1003.2, 2.5.2.1] */
2398 set_default (BITPOS (tok_lower
), 'a', 'z');
2400 if ((ctype
->class_done
& BITw (tok_alpha
)) == 0)
2402 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2403 class `lower' *must* be in class `alpha'. */
2404 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
2405 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
);
2407 for (cnt
= 0; cnt
< 256; ++cnt
)
2408 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2409 ctype
->class256_collection
[cnt
] |= BIT (tok_alpha
);
2411 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2412 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
2413 ctype
->class_collection
[cnt
] |= BITw (tok_alpha
);
2416 if ((ctype
->class_done
& BITw (tok_digit
)) == 0)
2417 /* "If this keyword [digit] is not specified, the digits `0' through
2418 `9', ..., shall automatically belong to this class, with
2419 implementation-defined character values." [P1003.2, 2.5.2.1] */
2420 set_default (BITPOS (tok_digit
), '0', '9');
2422 /* "Only characters specified for the `alpha' and `digit' keyword
2423 shall be specified. Characters specified for the keyword `alpha'
2424 and `digit' are automatically included in this class. */
2426 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
2427 unsigned long int maskw
= BITw (tok_alpha
) | BITw (tok_digit
);
2429 for (cnt
= 0; cnt
< 256; ++cnt
)
2430 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2431 ctype
->class256_collection
[cnt
] |= BIT (tok_alnum
);
2433 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2434 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
2435 ctype
->class_collection
[cnt
] |= BITw (tok_alnum
);
2438 if ((ctype
->class_done
& BITw (tok_space
)) == 0)
2439 /* "If this keyword [space] is not specified, the characters <space>,
2440 <form-feed>, <newline>, <carriage-return>, <tab>, and
2441 <vertical-tab>, ..., shall automatically belong to this class,
2442 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2445 struct charseq
*seq
;
2447 value
= repertoire_find_value (repertoire
, "space", 5);
2448 if (value
== ILLEGAL_CHAR_VALUE
)
2452 %s: character `%s' not defined while needed as default value"),
2453 "LC_CTYPE", "<space>");
2456 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2458 seq
= charmap_find_value (charmap
, "space", 5);
2463 %s: character `%s' not defined while needed as default value"),
2464 "LC_CTYPE", "<space>");
2466 else if (seq
->nbytes
!= 1)
2468 %s: character `%s' in charmap not representable with one byte"),
2469 "LC_CTYPE", "<space>");
2471 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2474 value
= repertoire_find_value (repertoire
, "form-feed", 9);
2475 if (value
== ILLEGAL_CHAR_VALUE
)
2479 %s: character `%s' not defined while needed as default value"),
2480 "LC_CTYPE", "<form-feed>");
2483 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2485 seq
= charmap_find_value (charmap
, "form-feed", 9);
2490 %s: character `%s' not defined while needed as default value"),
2491 "LC_CTYPE", "<form-feed>");
2493 else if (seq
->nbytes
!= 1)
2495 %s: character `%s' in charmap not representable with one byte"),
2496 "LC_CTYPE", "<form-feed>");
2498 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2501 value
= repertoire_find_value (repertoire
, "newline", 7);
2502 if (value
== ILLEGAL_CHAR_VALUE
)
2506 %s: character `%s' not defined while needed as default value"),
2507 "LC_CTYPE", "<newline>");
2510 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2512 seq
= charmap_find_value (charmap
, "newline", 7);
2517 character `%s' not defined while needed as default value"),
2520 else if (seq
->nbytes
!= 1)
2522 %s: character `%s' in charmap not representable with one byte"),
2523 "LC_CTYPE", "<newline>");
2525 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2528 value
= repertoire_find_value (repertoire
, "carriage-return", 15);
2529 if (value
== ILLEGAL_CHAR_VALUE
)
2533 %s: character `%s' not defined while needed as default value"),
2534 "LC_CTYPE", "<carriage-return>");
2537 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2539 seq
= charmap_find_value (charmap
, "carriage-return", 15);
2544 %s: character `%s' not defined while needed as default value"),
2545 "LC_CTYPE", "<carriage-return>");
2547 else if (seq
->nbytes
!= 1)
2549 %s: character `%s' in charmap not representable with one byte"),
2550 "LC_CTYPE", "<carriage-return>");
2552 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2555 value
= repertoire_find_value (repertoire
, "tab", 3);
2556 if (value
== ILLEGAL_CHAR_VALUE
)
2560 %s: character `%s' not defined while needed as default value"),
2561 "LC_CTYPE", "<tab>");
2564 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2566 seq
= charmap_find_value (charmap
, "tab", 3);
2571 %s: character `%s' not defined while needed as default value"),
2572 "LC_CTYPE", "<tab>");
2574 else if (seq
->nbytes
!= 1)
2576 %s: character `%s' in charmap not representable with one byte"),
2577 "LC_CTYPE", "<tab>");
2579 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2582 value
= repertoire_find_value (repertoire
, "vertical-tab", 12);
2583 if (value
== ILLEGAL_CHAR_VALUE
)
2587 %s: character `%s' not defined while needed as default value"),
2588 "LC_CTYPE", "<vertical-tab>");
2591 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
2593 seq
= charmap_find_value (charmap
, "vertical-tab", 12);
2598 %s: character `%s' not defined while needed as default value"),
2599 "LC_CTYPE", "<vertical-tab>");
2601 else if (seq
->nbytes
!= 1)
2603 %s: character `%s' in charmap not representable with one byte"),
2604 "LC_CTYPE", "<vertical-tab>");
2606 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
2609 if ((ctype
->class_done
& BITw (tok_xdigit
)) == 0)
2610 /* "If this keyword is not specified, the digits `0' to `9', the
2611 uppercase letters `A' through `F', and the lowercase letters `a'
2612 through `f', ..., shall automatically belong to this class, with
2613 implementation defined character values." [P1003.2, 2.5.2.1] */
2615 set_default (BITPOS (tok_xdigit
), '0', '9');
2616 set_default (BITPOS (tok_xdigit
), 'A', 'F');
2617 set_default (BITPOS (tok_xdigit
), 'a', 'f');
2620 if ((ctype
->class_done
& BITw (tok_blank
)) == 0)
2621 /* "If this keyword [blank] is unspecified, the characters <space> and
2622 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2625 struct charseq
*seq
;
2627 value
= repertoire_find_value (repertoire
, "space", 5);
2628 if (value
== ILLEGAL_CHAR_VALUE
)
2632 %s: character `%s' not defined while needed as default value"),
2633 "LC_CTYPE", "<space>");
2636 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
2638 seq
= charmap_find_value (charmap
, "space", 5);
2643 %s: character `%s' not defined while needed as default value"),
2644 "LC_CTYPE", "<space>");
2646 else if (seq
->nbytes
!= 1)
2648 %s: character `%s' in charmap not representable with one byte"),
2649 "LC_CTYPE", "<space>");
2651 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
2654 value
= repertoire_find_value (repertoire
, "tab", 3);
2655 if (value
== ILLEGAL_CHAR_VALUE
)
2659 %s: character `%s' not defined while needed as default value"),
2660 "LC_CTYPE", "<tab>");
2663 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
2665 seq
= charmap_find_value (charmap
, "tab", 3);
2670 %s: character `%s' not defined while needed as default value"),
2671 "LC_CTYPE", "<tab>");
2673 else if (seq
->nbytes
!= 1)
2675 %s: character `%s' in charmap not representable with one byte"),
2676 "LC_CTYPE", "<tab>");
2678 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
2681 if ((ctype
->class_done
& BITw (tok_graph
)) == 0)
2682 /* "If this keyword [graph] is not specified, characters specified for
2683 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2684 shall belong to this character class." [P1003.2, 2.5.2.1] */
2686 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
2687 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
2690 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2691 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
2692 ctype
->class_collection
[cnt
] |= BIT (tok_graph
);
2694 for (cnt
= 0; cnt
< 256; ++cnt
)
2695 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2696 ctype
->class256_collection
[cnt
] |= BIT (tok_graph
);
2699 if ((ctype
->class_done
& BITw (tok_print
)) == 0)
2700 /* "If this keyword [print] is not provided, characters specified for
2701 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2702 and the <space> character shall belong to this character class."
2703 [P1003.2, 2.5.2.1] */
2705 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
2706 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
2709 struct charseq
*seq
;
2711 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
2712 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
2713 ctype
->class_collection
[cnt
] |= BIT (tok_print
);
2715 for (cnt
= 0; cnt
< 256; ++cnt
)
2716 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
2717 ctype
->class256_collection
[cnt
] |= BIT (tok_print
);
2720 space
= repertoire_find_value (repertoire
, "space", 5);
2721 if (space
== ILLEGAL_CHAR_VALUE
)
2725 %s: character `%s' not defined while needed as default value"),
2726 "LC_CTYPE", "<space>");
2729 ELEM (ctype
, class_collection
, , space
) |= BIT (tok_print
);
2731 seq
= charmap_find_value (charmap
, "space", 5);
2736 %s: character `%s' not defined while needed as default value"),
2737 "LC_CTYPE", "<space>");
2739 else if (seq
->nbytes
!= 1)
2741 %s: character `%s' in charmap not representable with one byte"),
2742 "LC_CTYPE", "<space>");
2744 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_print
);
2747 if (ctype
->tomap_done
[0] == 0)
2748 /* "If this keyword [toupper] is not specified, the lowercase letters
2749 `a' through `z', and their corresponding uppercase letters `A' to
2750 `Z', ..., shall automatically be included, with implementation-
2751 defined character values." [P1003.2, 2.5.2.1] */
2756 strcpy (tmp
, "<?>");
2758 for (ch
= 'a'; ch
<= 'z'; ++ch
)
2760 uint32_t value_from
, value_to
;
2761 struct charseq
*seq_from
, *seq_to
;
2765 value_from
= repertoire_find_value (repertoire
, &tmp
[1], 1);
2766 if (value_from
== ILLEGAL_CHAR_VALUE
)
2770 %s: character `%s' not defined while needed as default value"),
2775 /* This conversion is implementation defined. */
2776 tmp
[1] = (char) (ch
+ ('A' - 'a'));
2777 value_to
= repertoire_find_value (repertoire
, &tmp
[1], 1);
2778 if (value_to
== ILLEGAL_CHAR_VALUE
)
2782 %s: character `%s' not defined while needed as default value"),
2786 /* The index [0] is determined by the order of the
2787 `ctype_map_newP' calls in `ctype_startup'. */
2788 ELEM (ctype
, map_collection
, [0], value_from
) = value_to
;
2791 seq_from
= charmap_find_value (charmap
, &tmp
[1], 1);
2792 if (seq_from
== NULL
)
2796 %s: character `%s' not defined while needed as default value"),
2799 else if (seq_from
->nbytes
!= 1)
2803 %s: character `%s' needed as default value not representable with one byte"),
2808 /* This conversion is implementation defined. */
2809 tmp
[1] = (char) (ch
+ ('A' - 'a'));
2810 seq_to
= charmap_find_value (charmap
, &tmp
[1], 1);
2815 %s: character `%s' not defined while needed as default value"),
2818 else if (seq_to
->nbytes
!= 1)
2822 %s: character `%s' needed as default value not representable with one byte"),
2826 /* The index [0] is determined by the order of the
2827 `ctype_map_newP' calls in `ctype_startup'. */
2828 ctype
->map256_collection
[0][seq_from
->bytes
[0]]
2834 if (ctype
->tomap_done
[1] == 0)
2835 /* "If this keyword [tolower] is not specified, the mapping shall be
2836 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2838 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
2839 if (ctype
->map_collection
[0][cnt
] != 0)
2840 ELEM (ctype
, map_collection
, [1],
2841 ctype
->map_collection
[0][cnt
])
2842 = ctype
->charnames
[cnt
];
2844 for (cnt
= 0; cnt
< 256; ++cnt
)
2845 if (ctype
->map256_collection
[0][cnt
] != 0)
2846 ctype
->map_collection
[1][ctype
->map_collection
[0][cnt
]]
2847 = ctype
->charnames
[cnt
];
2850 if (ctype
->outdigits_act
== 0)
2852 for (cnt
= 0; cnt
< 10; ++cnt
)
2854 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
2857 if (ctype
->mboutdigits
[cnt
] == NULL
)
2859 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
2861 strlen (longnames
[cnt
]));
2863 if (ctype
->mboutdigits
[cnt
] == NULL
)
2865 /* Provide a replacement. */
2867 no output digits defined and none of the standard names in the charmap"));
2869 ctype
->mboutdigits
[cnt
] = obstack_alloc (&charmap
->mem_pool
,
2870 sizeof (struct charseq
) + 1);
2872 /* This is better than nothing. */
2873 ctype
->mboutdigits
[cnt
]->bytes
[0] = digits
[cnt
];
2874 ctype
->mboutdigits
[cnt
]->nbytes
= 1;
2878 ctype
->wcoutdigits
[cnt
] = repertoire_find_value (repertoire
,
2881 if (ctype
->wcoutdigits
[cnt
] == ILLEGAL_CHAR_VALUE
)
2883 ctype
->wcoutdigits
[cnt
] = repertoire_find_value (repertoire
,
2885 strlen (longnames
[cnt
]));
2887 if (ctype
->wcoutdigits
[cnt
] == ILLEGAL_CHAR_VALUE
)
2889 /* Provide a replacement. */
2891 no output digits defined and none of the standard names in the repertoire"));
2893 /* This is better than nothing. */
2894 ctype
->wcoutdigits
[cnt
] = (uint32_t) digits
[cnt
];
2899 ctype
->outdigits_act
= 10;
2905 allocate_arrays (struct locale_ctype_t
*ctype
, struct charmap_t
*charmap
,
2906 struct repertoire_t
*repertoire
)
2910 /* First we have to decide how we organize the arrays. It is easy
2911 for a one-byte character set. But multi-byte character set
2912 cannot be stored flat because the chars might be sparsely used.
2913 So we determine an optimal hashing function for the used
2916 We use a very trivial hashing function to store the sparse
2917 table. CH % TABSIZE is used as an index. To solve multiple hits
2918 we have N planes. This guarantees a fixed search time for a
2919 character [N / 2]. In the following code we determine the minimum
2920 value for TABSIZE * N, where TABSIZE >= 256. */
2921 size_t min_total
= UINT_MAX
;
2922 size_t act_size
= 256;
2926 Computing table size for character classes might take a while..."),
2929 while (act_size
< min_total
)
2931 size_t cnt
[act_size
];
2932 size_t act_planes
= 1;
2934 memset (cnt
, '\0', sizeof cnt
);
2936 for (idx
= 0; idx
< 256; ++idx
)
2939 for (idx
= 0; idx
< ctype
->charnames_act
; ++idx
)
2940 if (ctype
->charnames
[idx
] >= 256)
2942 size_t nr
= ctype
->charnames
[idx
] % act_size
;
2944 if (++cnt
[nr
] > act_planes
)
2946 act_planes
= cnt
[nr
];
2947 if (act_size
* act_planes
>= min_total
)
2952 if (act_size
* act_planes
< min_total
)
2954 min_total
= act_size
* act_planes
;
2955 ctype
->plane_size
= act_size
;
2956 ctype
->plane_cnt
= act_planes
;
2963 fputs (_(" done\n"), stderr
);
2966 ctype
->names
= (uint32_t *) xcalloc (ctype
->plane_size
2970 for (idx
= 1; idx
< 256; ++idx
)
2971 ctype
->names
[idx
] = idx
;
2973 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
2974 ctype
->names
[0] = 1;
2976 for (idx
= 256; idx
< ctype
->charnames_act
; ++idx
)
2978 size_t nr
= (ctype
->charnames
[idx
] % ctype
->plane_size
);
2981 while (ctype
->names
[nr
+ depth
* ctype
->plane_size
])
2983 assert (depth
< ctype
->plane_cnt
);
2985 ctype
->names
[nr
+ depth
* ctype
->plane_size
] = ctype
->charnames
[idx
];
2987 /* Now for faster access remember the index in the NAMES_B array. */
2988 ctype
->charnames
[idx
] = nr
+ depth
* ctype
->plane_size
;
2990 ctype
->names
[0] = 0;
2993 /* You wonder about this amount of memory? This is only because some
2994 users do not manage to address the array with unsigned values or
2995 data types with range >= 256. '\200' would result in the array
2996 index -128. To help these poor people we duplicate the entries for
2997 128 up to 255 below the entry for \0. */
2998 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128,
2999 sizeof (char_class_t
));
3000 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (ctype
->plane_size
3002 sizeof (char_class32_t
));
3004 /* This is the array accessed using the multibyte string elements. */
3005 for (idx
= 0; idx
< 256; ++idx
)
3006 ctype
->ctype_b
[128 + idx
] = ctype
->class256_collection
[idx
];
3008 /* Mirror first 127 entries. We must take care that entry -1 is not
3009 mirrored because EOF == -1. */
3010 for (idx
= 0; idx
< 127; ++idx
)
3011 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
3013 /* The 32 bit array contains all characters. */
3014 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3015 ctype
->ctype32_b
[ctype
->charnames
[idx
]] = ctype
->class_collection
[idx
];
3017 /* Room for table of mappings. */
3018 ctype
->map
= (uint32_t **) xmalloc (ctype
->map_collection_nr
3019 * sizeof (uint32_t *));
3021 /* Fill in all mappings. */
3022 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
3026 /* Allocate table. */
3027 ctype
->map
[idx
] = (uint32_t *) xmalloc ((ctype
->plane_size
3028 * ctype
->plane_cnt
+ 128)
3029 * sizeof (uint32_t));
3031 /* Copy default value (identity mapping). */
3032 memcpy (&ctype
->map
[idx
][128], ctype
->names
,
3033 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (uint32_t));
3035 /* Copy values from collection. */
3036 for (idx2
= 0; idx2
< 256; ++idx2
)
3037 ctype
->map
[idx
][128 + idx2
] = ctype
->map256_collection
[idx
][idx2
];
3039 /* Mirror first 127 entries. We must take care not to map entry
3040 -1 because EOF == -1. */
3041 for (idx2
= 0; idx2
< 127; ++idx2
)
3042 ctype
->map
[idx
][idx2
] = ctype
->map
[idx
][256 + idx2
];
3044 /* EOF must map to EOF. */
3045 ctype
->map
[idx
][127] = EOF
;
3047 /* The 32 bit map collection. */
3048 for (idx2
= 0; idx2
< ctype
->map_collection_act
[idx
]; ++idx2
)
3049 if (ctype
->map_collection
[idx
][idx2
] != 0)
3050 ctype
->map
[idx
][128 + ctype
->charnames
[idx2
]]
3051 = ctype
->map_collection
[idx
][idx2
];
3054 /* Extra array for class and map names. */
3055 ctype
->class_name_ptr
= (uint32_t *) xmalloc (ctype
->nr_charclass
3056 * sizeof (uint32_t));
3057 ctype
->map_name_ptr
= (uint32_t *) xmalloc (ctype
->map_collection_nr
3058 * sizeof (uint32_t));
3060 /* Array for width information. Because the expected widths are very
3061 small we use only one single byte. This saves space and we need
3062 not provide the information twice with both endiannesses. */
3063 ctype
->width
= (unsigned char *) xmalloc (ctype
->plane_size
3064 * ctype
->plane_cnt
);
3065 /* Initialize with default width value. */
3066 memset (ctype
->width
, charmap
->width_default
,
3067 ctype
->plane_size
* ctype
->plane_cnt
);
3068 if (charmap
->width_rules
!= NULL
)
3072 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
3074 unsigned char bytes
[charmap
->mb_cur_max
];
3075 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
3077 /* We have the range of characters for which the width is
3078 specified described using byte sequences of the multibyte
3079 charset. We have to convert this to UCS4 now. And we
3080 cannot simply convert the beginning and the end of the
3081 sequence, we have to iterate over the byte sequence and
3082 convert it for every single character. */
3083 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
3085 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
3086 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
3089 /* Find the UCS value for `bytes'. */
3090 uint32_t wch
= repertoire_find_value (ctype
->repertoire
, bytes
,
3094 if (wch
!= ILLEGAL_CHAR_VALUE
)
3096 /* Store the value. */
3097 size_t nr
= idx
% ctype
->plane_size
;
3100 while (ctype
->names
[nr
+ depth
* ctype
->plane_size
] != nr
)
3102 assert (depth
< ctype
->plane_cnt
);
3104 ctype
->width
[nr
+ depth
* ctype
->plane_size
]
3105 = charmap
->width_rules
[cnt
].width
;
3108 /* "Increment" the bytes sequence. */
3110 while (inner
>= 0 && bytes
[inner
] == 0xff)
3115 /* We have to extend the byte sequence. */
3116 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
3120 memset (&bytes
[1], 0, nbytes
);
3126 while (++inner
< nbytes
)
3133 /* Set MB_CUR_MAX. */
3134 ctype
->mb_cur_max
= charmap
->mb_cur_max
;
3136 /* We need the name of the currently used 8-bit character set to
3137 make correct conversion between this 8-bit representation and the
3138 ISO 10646 character set used internally for wide characters. */
3139 ctype
->codeset_name
= charmap
->code_set_name
;
3141 /* Now determine the table for the transliteration information.
3143 XXX It is not yet clear to me whether it is worth implementing a
3144 complicated algorithm which uses a hash table to locate the entries.
3145 For now I'll use a simple array which can be searched using binary
3147 if (ctype
->translit_copy_locale
!= NULL
)
3149 /* Fold in the transliteration information from the locale mentioned
3150 in the `include' statement. */
3151 struct locale_ctype_t
*here
= ctype
;
3155 struct localedef_t
*other
= find_locale (LC_CTYPE
,
3156 here
->translit_copy_locale
,
3157 repertoire
->name
, charmap
);
3162 %s: transliteration data from locale `%s' not available"),
3163 "LC_CTYPE", here
->translit_copy_locale
);
3167 here
= other
->categories
[LC_CTYPE
].ctype
;
3169 /* Enqueue the information if necessary. */
3170 if (here
->translit
!= NULL
)
3172 struct translit_t
*endp
= here
->translit
;
3173 while (endp
->next
!= NULL
)
3176 endp
->next
= ctype
->translit
;
3177 ctype
->translit
= here
->translit
;
3180 while (here
->translit_copy_locale
!= NULL
);
3183 if (ctype
->translit
!= NULL
)
3185 /* First count how many entries we have. This is the upper limit
3186 since some entries from the included files might be overwritten. */
3189 struct translit_t
*runp
= ctype
->translit
;
3190 struct translit_t
**sorted
;
3191 size_t from_len
, to_len
;
3193 while (runp
!= NULL
)
3199 /* Next we allocate an array large enough and fill in the values. */
3200 sorted
= (struct translit_t
**) alloca (number
3201 * sizeof (struct translit_t
**));
3202 runp
= ctype
->translit
;
3206 /* Search for the place where to insert this string.
3207 XXX Better use a real sorting algorithm later. */
3211 while (idx
< number
)
3213 int res
= wcscmp ((const wchar_t *) sorted
[idx
]->from
,
3214 (const wchar_t *) runp
->from
);
3229 memmove (&sorted
[idx
+ 1], &sorted
[idx
],
3230 (number
- idx
) * sizeof (struct translit_t
*));
3237 while (runp
!= NULL
);
3239 /* The next step is putting all the possible transliteration
3240 strings in one memory block so that we can write it out.
3241 We need several different blocks:
3242 - index to the from-string array
3244 - index to the to-string array
3246 And this all must be available for both endianness variants.
3248 from_len
= to_len
= 0;
3249 for (cnt
= 0; cnt
< number
; ++cnt
)
3251 struct translit_to_t
*srunp
;
3252 from_len
+= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
3253 srunp
= sorted
[cnt
]->to
;
3254 while (srunp
!= NULL
)
3256 to_len
+= wcslen ((const wchar_t *) srunp
->str
) + 1;
3257 srunp
= srunp
->next
;
3259 /* Plus one for the extra NUL character marking the end of
3260 the list for the current entry. */
3264 /* We can allocate the arrays for the results. */
3265 ctype
->translit_from_idx
= xmalloc (number
* sizeof (uint32_t));
3266 ctype
->translit_from_tbl
= xmalloc (from_len
* sizeof (uint32_t));
3267 ctype
->translit_to_idx
= xmalloc (number
* sizeof (uint32_t));
3268 ctype
->translit_to_tbl
= xmalloc (to_len
* sizeof (uint32_t));
3272 for (cnt
= 0; cnt
< number
; ++cnt
)
3275 struct translit_to_t
*srunp
;
3277 ctype
->translit_from_idx
[cnt
] = from_len
;
3278 ctype
->translit_to_idx
[cnt
] = to_len
;
3280 len
= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
3281 wmemcpy ((wchar_t *) &ctype
->translit_from_tbl
[from_len
],
3282 (const wchar_t *) sorted
[cnt
]->from
, len
);
3285 ctype
->translit_to_idx
[cnt
] = to_len
;
3286 srunp
= sorted
[cnt
]->to
;
3287 while (srunp
!= NULL
)
3289 len
= wcslen ((const wchar_t *) srunp
->str
) + 1;
3290 wmemcpy ((wchar_t *) &ctype
->translit_to_tbl
[to_len
],
3291 (const wchar_t *) srunp
->str
, len
);
3293 srunp
= srunp
->next
;
3295 ctype
->translit_to_tbl
[to_len
++] = L
'\0';
3298 /* Store the information about the length. */
3299 ctype
->translit_idx_size
= number
* sizeof (uint32_t);
3300 ctype
->translit_from_tbl_size
= from_len
* sizeof (uint32_t);
3301 ctype
->translit_to_tbl_size
= to_len
* sizeof (uint32_t);
3305 /* Provide some dummy pointers since we have nothing to write out. */
3306 static uint32_t no_str
= { 0 };
3308 ctype
->translit_from_idx
= &no_str
;
3309 ctype
->translit_from_tbl
= &no_str
;
3310 ctype
->translit_to_tbl
= &no_str
;
3311 ctype
->translit_idx_size
= 0;
3312 ctype
->translit_from_tbl_size
= 0;
3313 ctype
->translit_to_tbl_size
= 0;