1 /* Copyright (C) 1995-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
36 #include "localedef.h"
38 #include "localeinfo.h"
40 #include "linereader.h"
41 #include "locfile-token.h"
47 #ifdef PREDEFINED_CLASSES
48 /* These are the extra bits not in wctype.h since these are not preallocated
50 # define _ISwspecial1 (1 << 29)
51 # define _ISwspecial2 (1 << 30)
52 # define _ISwspecial3 (1 << 31)
/* Bit helpers for the character classes.  BITPOS gives the offset of a
   class token from tok_upper; BIT and BITw hand that position to
   _ISbit and _ISwbit respectively.  */
#define BITPOS(class)	((class) - tok_upper)
#define BIT(class)	(_ISbit (BITPOS (class)))
#define BITw(class)	(_ISwbit (BITPOS (class)))

/* Lvalue for the entry of COLLECTION that belongs to VALUE.  Expands to
   a dereferenced find_idx call which receives the addresses of the
   collection and of its `_max' and `_act' companions.  IDX is an
   optional subscript appended to each of the three; callers may leave
   it empty.  */
#define ELEM(ctype, collection, idx, value)				      \
  *find_idx (ctype,							      \
	     &ctype->collection idx,					      \
	     &ctype->collection##_max idx,				      \
	     &ctype->collection##_act idx,				      \
	     value)
66 /* To be compatible with former implementations we for now restrict
67 the number of bits for character classes to 16. When compatibility
68 is not necessary anymore increase the number to 32. */
69 #define char_class_t uint16_t
70 #define char_class32_t uint32_t
73 /* Type to describe a transliteration action. We have a possibly
74 multi-character from-string and a set of multi-character
75 to-strings. All are 32-bit values since this is what is used in
76 the gconv functions. */
81 struct translit_to_t
*next
;
91 struct translit_to_t
*to
;
93 struct translit_t
*next
;
96 struct translit_ignore_t
105 struct translit_ignore_t
*next
;
109 /* Type to describe a transliteration include statement. */
110 struct translit_include_t
112 const char *copy_locale
;
113 const char *copy_repertoire
;
115 struct translit_include_t
*next
;
119 /* Sparse table of uint32_t. */
120 #define TABLE idx_table
121 #define ELEMENT uint32_t
122 #define DEFAULT ((uint32_t) ~0)
127 /* The real definition of the struct for the LC_CTYPE locale. */
128 struct locale_ctype_t
131 size_t charnames_max
;
132 size_t charnames_act
;
133 /* An index lookup table, to speedup find_idx. */
134 struct idx_table charnames_idx
;
136 struct repertoire_t
*repertoire
;
138 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
139 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
141 const char *classnames
[MAX_NR_CHARCLASS
];
142 uint32_t last_class_char
;
143 uint32_t class256_collection
[256];
144 uint32_t *class_collection
;
145 size_t class_collection_max
;
146 size_t class_collection_act
;
148 uint32_t class_offset
;
150 struct charseq
**mbdigits
;
157 struct charseq
*mboutdigits
[10];
158 uint32_t wcoutdigits
[10];
159 size_t outdigits_act
;
161 /* If the following number ever turns out to be too small simply
162 increase it. But I doubt it will. --drepper@gnu */
163 #define MAX_NR_CHARMAP 16
164 const char *mapnames
[MAX_NR_CHARMAP
];
165 uint32_t *map_collection
[MAX_NR_CHARMAP
];
166 uint32_t map256_collection
[2][256];
167 size_t map_collection_max
[MAX_NR_CHARMAP
];
168 size_t map_collection_act
[MAX_NR_CHARMAP
];
169 size_t map_collection_nr
;
171 int tomap_done
[MAX_NR_CHARMAP
];
174 /* Transliteration information. */
175 struct translit_include_t
*translit_include
;
176 struct translit_t
*translit
;
177 struct translit_ignore_t
*translit_ignore
;
178 uint32_t ntranslit_ignore
;
180 uint32_t *default_missing
;
181 const char *default_missing_file
;
182 size_t default_missing_lineno
;
184 /* The arrays for the binary representation. */
185 char_class_t
*ctype_b
;
186 char_class32_t
*ctype32_b
;
190 struct iovec
*class_3level
;
191 struct iovec
*map_3level
;
192 uint32_t *class_name_ptr
;
193 uint32_t *map_name_ptr
;
196 const char *codeset_name
;
197 uint32_t *translit_from_idx
;
198 uint32_t *translit_from_tbl
;
199 uint32_t *translit_to_idx
;
200 uint32_t *translit_to_tbl
;
201 uint32_t translit_idx_size
;
202 size_t translit_from_tbl_size
;
203 size_t translit_to_tbl_size
;
205 struct obstack mempool
;
209 /* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless
210 whether 'int' is 16 bit, 32 bit, or 64 bit. */
211 #define EMPTY ((uint32_t) ~0)
214 #define obstack_chunk_alloc xmalloc
215 #define obstack_chunk_free free
218 /* Prototypes for local functions. */
219 static void ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
220 const struct charmap_t
*charmap
,
221 struct localedef_t
*copy_locale
,
/* Add a character class called NAME to CTYPE.  ctype_startup uses this
   to set up the standard classes.  */
static void ctype_class_new (struct linereader *lr,
			     struct locale_ctype_t *ctype,
			     const char *name);

/* Add a character map called NAME (e.g. "toupper") to CTYPE.  */
static void ctype_map_new (struct linereader *lr,
			   struct locale_ctype_t *ctype,
			   const char *name,
			   const struct charmap_t *charmap);

/* Return a pointer to the slot belonging to character IDX.  TABLE, MAX
   and ACT describe the collection to look in; callers pass NULL for all
   three when they only want IDX entered in the name array.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype,
			   uint32_t **table,
			   size_t *max,
			   size_t *act,
			   unsigned int idx);

/* Set default values for classes which were not specified.  */
static void set_class_defaults (struct locale_ctype_t *ctype,
				const struct charmap_t *charmap,
				struct repertoire_t *repertoire);

/* Prepare the tables of CTYPE for output.  */
static void allocate_arrays (struct locale_ctype_t *ctype,
			     const struct charmap_t *charmap,
			     struct repertoire_t *repertoire);
238 static const char *longnames
[] =
240 "zero", "one", "two", "three", "four",
241 "five", "six", "seven", "eight", "nine"
243 static const char *uninames
[] =
245 "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
246 "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
248 static const unsigned char digits
[] = "0123456789";
252 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
253 const struct charmap_t
*charmap
,
254 struct localedef_t
*copy_locale
, int ignore_content
)
257 struct locale_ctype_t
*ctype
;
259 if (!ignore_content
&& locale
->categories
[LC_CTYPE
].ctype
== NULL
)
261 if (copy_locale
== NULL
)
263 /* Allocate the needed room. */
264 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
265 (struct locale_ctype_t
*) xcalloc (1,
266 sizeof (struct locale_ctype_t
));
268 /* We have seen no names yet. */
269 ctype
->charnames_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
271 (unsigned int *) xmalloc (ctype
->charnames_max
272 * sizeof (unsigned int));
273 for (cnt
= 0; cnt
< 256; ++cnt
)
274 ctype
->charnames
[cnt
] = cnt
;
275 ctype
->charnames_act
= 256;
276 idx_table_init (&ctype
->charnames_idx
);
278 /* Fill character class information. */
279 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
280 /* The order of the following instructions determines the bit
282 ctype_class_new (lr
, ctype
, "upper");
283 ctype_class_new (lr
, ctype
, "lower");
284 ctype_class_new (lr
, ctype
, "alpha");
285 ctype_class_new (lr
, ctype
, "digit");
286 ctype_class_new (lr
, ctype
, "xdigit");
287 ctype_class_new (lr
, ctype
, "space");
288 ctype_class_new (lr
, ctype
, "print");
289 ctype_class_new (lr
, ctype
, "graph");
290 ctype_class_new (lr
, ctype
, "blank");
291 ctype_class_new (lr
, ctype
, "cntrl");
292 ctype_class_new (lr
, ctype
, "punct");
293 ctype_class_new (lr
, ctype
, "alnum");
294 #ifdef PREDEFINED_CLASSES
295 /* The following are extensions from ISO 14652. */
296 ctype_class_new (lr
, ctype
, "left_to_right");
297 ctype_class_new (lr
, ctype
, "right_to_left");
298 ctype_class_new (lr
, ctype
, "num_terminator");
299 ctype_class_new (lr
, ctype
, "num_separator");
300 ctype_class_new (lr
, ctype
, "segment_separator");
301 ctype_class_new (lr
, ctype
, "block_separator");
302 ctype_class_new (lr
, ctype
, "direction_control");
303 ctype_class_new (lr
, ctype
, "sym_swap_layout");
304 ctype_class_new (lr
, ctype
, "char_shape_selector");
305 ctype_class_new (lr
, ctype
, "num_shape_selector");
306 ctype_class_new (lr
, ctype
, "non_spacing");
307 ctype_class_new (lr
, ctype
, "non_spacing_level3");
308 ctype_class_new (lr
, ctype
, "normal_connect");
309 ctype_class_new (lr
, ctype
, "r_connect");
310 ctype_class_new (lr
, ctype
, "no_connect");
311 ctype_class_new (lr
, ctype
, "no_connect-space");
312 ctype_class_new (lr
, ctype
, "vowel_connect");
315 ctype
->class_collection_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
316 ctype
->class_collection
317 = (uint32_t *) xcalloc (sizeof (unsigned long int),
318 ctype
->class_collection_max
);
319 ctype
->class_collection_act
= 256;
321 /* Fill character map information. */
322 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
323 ctype_map_new (lr
, ctype
, "toupper", charmap
);
324 ctype_map_new (lr
, ctype
, "tolower", charmap
);
325 #ifdef PREDEFINED_CLASSES
326 ctype_map_new (lr
, ctype
, "tosymmetric", charmap
);
329 /* Fill first 256 entries in `toXXX' arrays. */
330 for (cnt
= 0; cnt
< 256; ++cnt
)
332 ctype
->map_collection
[0][cnt
] = cnt
;
333 ctype
->map_collection
[1][cnt
] = cnt
;
334 #ifdef PREDEFINED_CLASSES
335 ctype
->map_collection
[2][cnt
] = cnt
;
337 ctype
->map256_collection
[0][cnt
] = cnt
;
338 ctype
->map256_collection
[1][cnt
] = cnt
;
341 obstack_init (&ctype
->mempool
);
344 ctype
= locale
->categories
[LC_CTYPE
].ctype
=
345 copy_locale
->categories
[LC_CTYPE
].ctype
;
351 ctype_finish (struct localedef_t
*locale
, const struct charmap_t
*charmap
)
353 /* See POSIX.2, table 2-6 for the meaning of the following table. */
358 const char allow
[NCLASS
];
360 valid_table
[NCLASS
] =
362 /* The order is important. See token.h for more information.
363 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
364 { "upper", "--MX-XDDXXX-" },
365 { "lower", "--MX-XDDXXX-" },
366 { "alpha", "---X-XDDXXX-" },
367 { "digit", "XXX--XDDXXX-" },
368 { "xdigit", "-----XDDXXX-" },
369 { "space", "XXXXX------X" },
370 { "print", "---------X--" },
371 { "graph", "---------X--" },
372 { "blank", "XXXXXM-----X" },
373 { "cntrl", "XXXXX-XX--XX" },
374 { "punct", "XXXXX-DD-X-X" },
375 { "alnum", "-----XDDXXX-" }
379 uint32_t space_value
;
380 struct charseq
*space_seq
;
381 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
388 /* Now resolve copying and also handle completely missing definitions. */
391 const char *repertoire_name
;
393 /* First see whether we were supposed to copy. If yes, find the
394 actual definition. */
395 if (locale
->copy_name
[LC_CTYPE
] != NULL
)
397 /* Find the copying locale. This has to happen transitively since
398 the locale we are copying from might also be copying another one. */
399 struct localedef_t
*from
= locale
;
402 from
= find_locale (LC_CTYPE
, from
->copy_name
[LC_CTYPE
],
403 from
->repertoire_name
, charmap
);
404 while (from
->categories
[LC_CTYPE
].ctype
== NULL
405 && from
->copy_name
[LC_CTYPE
] != NULL
);
407 ctype
= locale
->categories
[LC_CTYPE
].ctype
408 = from
->categories
[LC_CTYPE
].ctype
;
411 /* If there is still no definition issue a warning and create an
416 WITH_CUR_LOCALE (error (0, 0, _("\
417 No definition for %s category found"), "LC_CTYPE"));
418 ctype_startup (NULL
, locale
, charmap
, NULL
, 0);
419 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
422 /* Get the repertoire we have to use. */
423 repertoire_name
= locale
->repertoire_name
?: repertoire_global
;
424 if (repertoire_name
!= NULL
)
425 ctype
->repertoire
= repertoire_read (repertoire_name
);
428 /* We need the name of the currently used 8-bit character set to
429 make correct conversion between this 8-bit representation and the
430 ISO 10646 character set used internally for wide characters. */
431 ctype
->codeset_name
= charmap
->code_set_name
;
432 if (ctype
->codeset_name
== NULL
)
435 WITH_CUR_LOCALE (error (0, 0, _("\
436 No character set name specified in charmap")));
437 ctype
->codeset_name
= "//UNKNOWN//";
440 /* Set default value for classes not specified. */
441 set_class_defaults (ctype
, charmap
, ctype
->repertoire
);
443 /* Check according to table. */
444 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
446 uint32_t tmp
= ctype
->class_collection
[cnt
];
450 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
451 if ((tmp
& _ISwbit (cls1
)) != 0)
452 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
453 if (valid_table
[cls1
].allow
[cls2
] != '-')
455 int eq
= (tmp
& _ISwbit (cls2
)) != 0;
456 switch (valid_table
[cls1
].allow
[cls2
])
461 uint32_t value
= ctype
->charnames
[cnt
];
464 WITH_CUR_LOCALE (error (0, 0, _("\
465 character L'\\u%0*x' in class `%s' must be in class `%s'"),
466 value
> 0xffff ? 8 : 4,
468 valid_table
[cls1
].name
,
469 valid_table
[cls2
].name
));
476 uint32_t value
= ctype
->charnames
[cnt
];
479 WITH_CUR_LOCALE (error (0, 0, _("\
480 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
481 value
> 0xffff ? 8 : 4,
483 valid_table
[cls1
].name
,
484 valid_table
[cls2
].name
));
489 ctype
->class_collection
[cnt
] |= _ISwbit (cls2
);
493 WITH_CUR_LOCALE (error (5, 0, _("\
494 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
500 for (cnt
= 0; cnt
< 256; ++cnt
)
502 uint32_t tmp
= ctype
->class256_collection
[cnt
];
506 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
507 if ((tmp
& _ISbit (cls1
)) != 0)
508 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
509 if (valid_table
[cls1
].allow
[cls2
] != '-')
511 int eq
= (tmp
& _ISbit (cls2
)) != 0;
512 switch (valid_table
[cls1
].allow
[cls2
])
519 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
522 WITH_CUR_LOCALE (error (0, 0, _("\
523 character '%s' in class `%s' must be in class `%s'"),
525 valid_table
[cls1
].name
,
526 valid_table
[cls2
].name
));
535 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
538 WITH_CUR_LOCALE (error (0, 0, _("\
539 character '%s' in class `%s' must not be in class `%s'"),
541 valid_table
[cls1
].name
,
542 valid_table
[cls2
].name
));
547 ctype
->class256_collection
[cnt
] |= _ISbit (cls2
);
551 WITH_CUR_LOCALE (error (5, 0, _("\
552 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
558 /* ... and now test <SP> as a special case. */
560 if (((cnt
= BITPOS (tok_space
),
561 (ELEM (ctype
, class_collection
, , space_value
)
562 & BITw (tok_space
)) == 0)
563 || (cnt
= BITPOS (tok_blank
),
564 (ELEM (ctype
, class_collection
, , space_value
)
565 & BITw (tok_blank
)) == 0)))
568 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
569 valid_table
[cnt
].name
));
571 else if (((cnt
= BITPOS (tok_punct
),
572 (ELEM (ctype
, class_collection
, , space_value
)
573 & BITw (tok_punct
)) != 0)
574 || (cnt
= BITPOS (tok_graph
),
575 (ELEM (ctype
, class_collection
, , space_value
)
580 WITH_CUR_LOCALE (error (0, 0, _("\
581 <SP> character must not be in class `%s'"),
582 valid_table
[cnt
].name
));
585 ELEM (ctype
, class_collection
, , space_value
) |= BITw (tok_print
);
587 space_seq
= charmap_find_value (charmap
, "SP", 2);
588 if (space_seq
== NULL
)
589 space_seq
= charmap_find_value (charmap
, "space", 5);
590 if (space_seq
== NULL
)
591 space_seq
= charmap_find_value (charmap
, "U00000020", 9);
592 if (space_seq
== NULL
|| space_seq
->nbytes
!= 1)
595 WITH_CUR_LOCALE (error (0, 0, _("\
596 character <SP> not defined in character map")));
598 else if (((cnt
= BITPOS (tok_space
),
599 (ctype
->class256_collection
[space_seq
->bytes
[0]]
600 & BIT (tok_space
)) == 0)
601 || (cnt
= BITPOS (tok_blank
),
602 (ctype
->class256_collection
[space_seq
->bytes
[0]]
603 & BIT (tok_blank
)) == 0)))
606 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
607 valid_table
[cnt
].name
));
609 else if (((cnt
= BITPOS (tok_punct
),
610 (ctype
->class256_collection
[space_seq
->bytes
[0]]
611 & BIT (tok_punct
)) != 0)
612 || (cnt
= BITPOS (tok_graph
),
613 (ctype
->class256_collection
[space_seq
->bytes
[0]]
614 & BIT (tok_graph
)) != 0)))
617 WITH_CUR_LOCALE (error (0, 0, _("\
618 <SP> character must not be in class `%s'"),
619 valid_table
[cnt
].name
));
622 ctype
->class256_collection
[space_seq
->bytes
[0]] |= BIT (tok_print
);
624 /* Now that the tests are done make sure the name array contains all
625 characters which are handled in the WIDTH section of the
626 character set definition file. */
627 if (charmap
->width_rules
!= NULL
)
628 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
630 unsigned char bytes
[charmap
->mb_cur_max
];
631 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
633 /* We have the range of characters for which the width is
634 specified described using byte sequences of the multibyte
635 charset. We have to convert this to UCS4 now. And we
636 cannot simply convert the beginning and the end of the
637 sequence, we have to iterate over the byte sequence and
638 convert it for every single character. */
639 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
641 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
642 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
645 /* Find the UCS value for `bytes'. */
648 struct charseq
*seq
= charmap_find_symbol (charmap
, bytes
, nbytes
);
651 wch
= ILLEGAL_CHAR_VALUE
;
652 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
655 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
658 if (wch
!= ILLEGAL_CHAR_VALUE
)
659 /* We are only interested in the side-effects of the
660 `find_idx' call. It will add appropriate entries in
661 the name array if this is necessary. */
662 (void) find_idx (ctype
, NULL
, NULL
, NULL
, wch
);
664 /* "Increment" the bytes sequence. */
666 while (inner
>= 0 && bytes
[inner
] == 0xff)
671 /* We have to extend the byte sequence. */
672 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
676 memset (&bytes
[1], 0, nbytes
);
682 while (++inner
< nbytes
)
688 /* Now set all the other characters of the character set to the
691 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
693 struct charseq
*data
= (struct charseq
*) vdata
;
695 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
696 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
699 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
700 (void) find_idx (ctype
, NULL
, NULL
, NULL
, data
->ucs4
);
703 /* There must be a multiple of 10 digits. */
704 if (ctype
->mbdigits_act
% 10 != 0)
706 assert (ctype
->mbdigits_act
== ctype
->wcdigits_act
);
707 ctype
->wcdigits_act
-= ctype
->mbdigits_act
% 10;
708 ctype
->mbdigits_act
-= ctype
->mbdigits_act
% 10;
709 WITH_CUR_LOCALE (error (0, 0, _("\
710 `digit' category has not entries in groups of ten")));
713 /* Check the input digits. There must be a multiple of ten available.
714 In each group it could be that one or the other character is missing.
715 In this case the whole group must be removed. */
717 while (cnt
< ctype
->mbdigits_act
)
720 for (inner
= 0; inner
< 10; ++inner
)
721 if (ctype
->mbdigits
[cnt
+ inner
] == NULL
)
728 /* Remove the group. */
729 memmove (&ctype
->mbdigits
[cnt
], &ctype
->mbdigits
[cnt
+ 10],
730 ((ctype
->wcdigits_act
- cnt
- 10)
731 * sizeof (ctype
->mbdigits
[0])));
732 ctype
->mbdigits_act
-= 10;
736 /* If no input digits are given use the default. */
737 if (ctype
->mbdigits_act
== 0)
739 if (ctype
->mbdigits_max
== 0)
741 ctype
->mbdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
742 10 * sizeof (struct charseq
*));
743 ctype
->mbdigits_max
= 10;
746 for (cnt
= 0; cnt
< 10; ++cnt
)
748 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
750 if (ctype
->mbdigits
[cnt
] == NULL
)
752 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
754 strlen (longnames
[cnt
]));
755 if (ctype
->mbdigits
[cnt
] == NULL
)
757 /* Hum, this ain't good. */
758 WITH_CUR_LOCALE (error (0, 0, _("\
759 no input digits defined and none of the standard names in the charmap")));
761 ctype
->mbdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
762 sizeof (struct charseq
) + 1);
764 /* This is better than nothing. */
765 ctype
->mbdigits
[cnt
]->bytes
[0] = digits
[cnt
];
766 ctype
->mbdigits
[cnt
]->nbytes
= 1;
771 ctype
->mbdigits_act
= 10;
774 /* Check the wide character input digits. There must be a multiple
775 of ten available. In each group it could be that one or the other
776 character is missing. In this case the whole group must be
779 while (cnt
< ctype
->wcdigits_act
)
782 for (inner
= 0; inner
< 10; ++inner
)
783 if (ctype
->wcdigits
[cnt
+ inner
] == ILLEGAL_CHAR_VALUE
)
790 /* Remove the group. */
791 memmove (&ctype
->wcdigits
[cnt
], &ctype
->wcdigits
[cnt
+ 10],
792 ((ctype
->wcdigits_act
- cnt
- 10)
793 * sizeof (ctype
->wcdigits
[0])));
794 ctype
->wcdigits_act
-= 10;
798 /* If no input digits are given use the default. */
799 if (ctype
->wcdigits_act
== 0)
801 if (ctype
->wcdigits_max
== 0)
803 ctype
->wcdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
804 10 * sizeof (uint32_t));
805 ctype
->wcdigits_max
= 10;
808 for (cnt
= 0; cnt
< 10; ++cnt
)
809 ctype
->wcdigits
[cnt
] = L
'0' + cnt
;
811 ctype
->mbdigits_act
= 10;
814 /* Check the outdigits. */
816 for (cnt
= 0; cnt
< 10; ++cnt
)
817 if (ctype
->mboutdigits
[cnt
] == NULL
)
819 static struct charseq replace
[2];
823 WITH_CUR_LOCALE (error (0, 0, _("\
824 not all characters used in `outdigit' are available in the charmap")));
828 replace
[0].nbytes
= 1;
829 replace
[0].bytes
[0] = '?';
830 replace
[0].bytes
[1] = '\0';
831 ctype
->mboutdigits
[cnt
] = &replace
[0];
835 for (cnt
= 0; cnt
< 10; ++cnt
)
836 if (ctype
->wcoutdigits
[cnt
] == 0)
840 WITH_CUR_LOCALE (error (0, 0, _("\
841 not all characters used in `outdigit' are available in the repertoire")));
845 ctype
->wcoutdigits
[cnt
] = L
'?';
848 /* Sort the entries in the translit_ignore list. */
849 if (ctype
->translit_ignore
!= NULL
)
851 struct translit_ignore_t
*firstp
= ctype
->translit_ignore
;
852 struct translit_ignore_t
*runp
;
854 ctype
->ntranslit_ignore
= 1;
856 for (runp
= firstp
->next
; runp
!= NULL
; runp
= runp
->next
)
858 struct translit_ignore_t
*lastp
= NULL
;
859 struct translit_ignore_t
*cmpp
;
861 ++ctype
->ntranslit_ignore
;
863 for (cmpp
= firstp
; cmpp
!= NULL
; lastp
= cmpp
, cmpp
= cmpp
->next
)
864 if (runp
->from
< cmpp
->from
)
872 ctype
->translit_ignore
= firstp
;
878 ctype_output (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
879 const char *output_path
)
881 static const char nulbytes
[4] = { 0, 0, 0, 0 };
882 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
883 const size_t nelems
= (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
)
884 + ctype
->nr_charclass
+ ctype
->map_collection_nr
);
885 struct iovec iov
[2 + nelems
+ 2 * ctype
->nr_charclass
886 + ctype
->map_collection_nr
+ 4];
887 struct locale_file data
;
888 uint32_t idx
[nelems
+ 1];
889 uint32_t default_missing_len
;
890 size_t elem
, cnt
, offset
, total
;
893 /* Now prepare the output: Find the sizes of the table we can use. */
894 allocate_arrays (ctype
, charmap
, ctype
->repertoire
);
896 data
.magic
= LIMAGIC (LC_CTYPE
);
898 iov
[0].iov_base
= (void *) &data
;
899 iov
[0].iov_len
= sizeof (data
);
901 iov
[1].iov_base
= (void *) idx
;
902 iov
[1].iov_len
= nelems
* sizeof (uint32_t);
904 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
907 for (elem
= 0; elem
< nelems
; ++elem
)
909 if (elem
< _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
))
912 #define CTYPE_EMPTY(name) \
914 iov[2 + elem + offset].iov_base = NULL; \
915 iov[2 + elem + offset].iov_len = 0; \
916 idx[elem + 1] = idx[elem]; \
919 CTYPE_EMPTY(_NL_CTYPE_GAP1
);
920 CTYPE_EMPTY(_NL_CTYPE_GAP2
);
921 CTYPE_EMPTY(_NL_CTYPE_GAP3
);
922 CTYPE_EMPTY(_NL_CTYPE_GAP4
);
923 CTYPE_EMPTY(_NL_CTYPE_GAP5
);
924 CTYPE_EMPTY(_NL_CTYPE_GAP6
);
926 #define CTYPE_DATA(name, base, len) \
927 case _NL_ITEM_INDEX (name): \
928 iov[2 + elem + offset].iov_base = (base); \
929 iov[2 + elem + offset].iov_len = (len); \
930 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
933 CTYPE_DATA (_NL_CTYPE_CLASS
,
935 (256 + 128) * sizeof (char_class_t
));
937 CTYPE_DATA (_NL_CTYPE_TOUPPER
,
939 (256 + 128) * sizeof (uint32_t));
940 CTYPE_DATA (_NL_CTYPE_TOLOWER
,
942 (256 + 128) * sizeof (uint32_t));
944 CTYPE_DATA (_NL_CTYPE_TOUPPER32
,
946 256 * sizeof (uint32_t));
947 CTYPE_DATA (_NL_CTYPE_TOLOWER32
,
949 256 * sizeof (uint32_t));
951 CTYPE_DATA (_NL_CTYPE_CLASS32
,
953 256 * sizeof (char_class32_t
));
955 CTYPE_DATA (_NL_CTYPE_CLASS_OFFSET
,
956 &ctype
->class_offset
, sizeof (uint32_t));
958 CTYPE_DATA (_NL_CTYPE_MAP_OFFSET
,
959 &ctype
->map_offset
, sizeof (uint32_t));
961 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TAB_SIZE
,
962 &ctype
->translit_idx_size
, sizeof (uint32_t));
964 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX
,
965 ctype
->translit_from_idx
,
966 ctype
->translit_idx_size
* sizeof (uint32_t));
968 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL
,
969 ctype
->translit_from_tbl
,
970 ctype
->translit_from_tbl_size
);
972 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX
,
973 ctype
->translit_to_idx
,
974 ctype
->translit_idx_size
* sizeof (uint32_t));
976 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL
,
977 ctype
->translit_to_tbl
, ctype
->translit_to_tbl_size
);
979 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
980 /* The class name array. */
982 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
984 iov
[2 + elem
+ offset
].iov_base
985 = (void *) ctype
->classnames
[cnt
];
986 iov
[2 + elem
+ offset
].iov_len
987 = strlen (ctype
->classnames
[cnt
]) + 1;
988 total
+= iov
[2 + elem
+ offset
].iov_len
;
990 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
991 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
992 total
+= 1 + (4 - ((total
+ 1) % 4));
994 idx
[elem
+ 1] = idx
[elem
] + total
;
997 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
998 /* The map name array. */
1000 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
1002 iov
[2 + elem
+ offset
].iov_base
1003 = (void *) ctype
->mapnames
[cnt
];
1004 iov
[2 + elem
+ offset
].iov_len
1005 = strlen (ctype
->mapnames
[cnt
]) + 1;
1006 total
+= iov
[2 + elem
+ offset
].iov_len
;
1008 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1009 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
1010 total
+= 1 + (4 - ((total
+ 1) % 4));
1012 idx
[elem
+ 1] = idx
[elem
] + total
;
1015 CTYPE_DATA (_NL_CTYPE_WIDTH
,
1016 ctype
->width
.iov_base
,
1017 ctype
->width
.iov_len
);
1019 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
1020 &ctype
->mb_cur_max
, sizeof (uint32_t));
1022 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
1023 total
= strlen (ctype
->codeset_name
) + 1;
1025 iov
[2 + elem
+ offset
].iov_base
= (char *) ctype
->codeset_name
;
1028 iov
[2 + elem
+ offset
].iov_base
= alloca ((total
+ 3) & ~3);
1029 memset (mempcpy (iov
[2 + elem
+ offset
].iov_base
,
1030 ctype
->codeset_name
, total
),
1031 '\0', 4 - (total
& 3));
1032 total
= (total
+ 3) & ~3;
1034 iov
[2 + elem
+ offset
].iov_len
= total
;
1035 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1038 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN
):
1039 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
1040 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1041 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
1042 ctype
->mbdigits_act
/ 10;
1043 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
1046 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN
):
1047 /* Align entries. */
1048 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1049 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1050 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1053 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
1054 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1055 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
1056 ctype
->wcdigits_act
/ 10;
1057 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
1060 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB
):
1061 /* Compute the length of all possible characters. For INDIGITS
1062 there might be more than one. We simply concatenate all of
1063 them with a NUL byte following. The NUL byte wouldn't be
1064 necessary but it makes it easier for the user. */
1067 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
);
1068 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
1069 total
+= ctype
->mbdigits
[cnt
]->nbytes
+ 1;
1070 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
1071 iov
[2 + elem
+ offset
].iov_len
= total
;
1073 cp
= iov
[2 + elem
+ offset
].iov_base
;
1074 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
);
1075 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
1077 cp
= mempcpy (cp
, ctype
->mbdigits
[cnt
]->bytes
,
1078 ctype
->mbdigits
[cnt
]->nbytes
);
1081 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1084 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB
):
1085 /* Each output digit has exactly one multibyte sequence (unlike
1086 INDIGITS, where there might be more than one). We copy it
1087 with a NUL byte following. The NUL byte wouldn't be
1088 necessary but it makes it easier for the user. */
1089 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
);
1090 total
= ctype
->mboutdigits
[cnt
]->nbytes
+ 1;
1091 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
1092 iov
[2 + elem
+ offset
].iov_len
= total
;
1094 *(char *) mempcpy (iov
[2 + elem
+ offset
].iov_base
,
1095 ctype
->mboutdigits
[cnt
]->bytes
,
1096 ctype
->mboutdigits
[cnt
]->nbytes
) = '\0';
1097 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1100 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC
):
1101 total
= ctype
->wcdigits_act
/ 10;
1103 iov
[2 + elem
+ offset
].iov_base
=
1104 (uint32_t *) alloca (total
* sizeof (uint32_t));
1105 iov
[2 + elem
+ offset
].iov_len
= total
* sizeof (uint32_t);
1107 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
);
1108 cnt
< ctype
->wcdigits_act
; cnt
+= 10)
1109 ((uint32_t *) iov
[2 + elem
+ offset
].iov_base
)[cnt
/ 10]
1110 = ctype
->wcdigits
[cnt
];
1111 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1114 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
):
1115 /* Align entries. */
1116 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1117 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1118 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1122 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT1_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC
):
1123 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
);
1124 iov
[2 + elem
+ offset
].iov_base
= &ctype
->wcoutdigits
[cnt
];
1125 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1126 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1129 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN
):
1130 /* Align entries. */
1131 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1132 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1133 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1136 default_missing_len
= (ctype
->default_missing
1137 ? wcslen ((wchar_t *)ctype
->default_missing
)
1139 iov
[2 + elem
+ offset
].iov_base
= &default_missing_len
;
1140 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1141 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1144 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING
):
1145 iov
[2 + elem
+ offset
].iov_base
=
1146 ctype
->default_missing
?: (uint32_t *) L
"";
1147 iov
[2 + elem
+ offset
].iov_len
=
1148 wcslen (iov
[2 + elem
+ offset
].iov_base
);
1149 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1152 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN
):
1153 /* Align entries. */
1154 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1155 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1156 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1159 iov
[2 + elem
+ offset
].iov_base
= &ctype
->ntranslit_ignore
;
1160 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1161 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1164 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE
):
1166 uint32_t *ranges
= (uint32_t *) alloca (ctype
->ntranslit_ignore
1167 * 3 * sizeof (uint32_t));
1168 struct translit_ignore_t
*runp
;
1170 iov
[2 + elem
+ offset
].iov_base
= ranges
;
1171 iov
[2 + elem
+ offset
].iov_len
= (ctype
->ntranslit_ignore
1172 * 3 * sizeof (uint32_t));
1174 for (runp
= ctype
->translit_ignore
; runp
!= NULL
;
1177 *ranges
++ = runp
->from
;
1178 *ranges
++ = runp
->to
;
1179 *ranges
++ = runp
->step
;
1182 /* Remove the following line in case a new entry is added
1183 after _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN. */
1185 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1189 assert (! "unknown CTYPE element");
1193 /* Handle extra maps. */
1194 size_t nr
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
1195 if (nr
< ctype
->nr_charclass
)
1197 iov
[2 + elem
+ offset
].iov_base
= ctype
->class_b
[nr
];
1198 iov
[2 + elem
+ offset
].iov_len
= 256 / 32 * sizeof (uint32_t);
1199 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1202 iov
[2 + elem
+ offset
] = ctype
->class_3level
[nr
];
1206 nr
-= ctype
->nr_charclass
;
1207 assert (nr
< ctype
->map_collection_nr
);
1208 iov
[2 + elem
+ offset
] = ctype
->map_3level
[nr
];
1210 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1214 assert (2 + elem
+ offset
== (nelems
+ 2 * ctype
->nr_charclass
1215 + ctype
->map_collection_nr
+ 4 + 2));
1217 write_locale_data (output_path
, LC_CTYPE
, "LC_CTYPE", 2 + elem
+ offset
,
1222 /* Local functions. */
/* Register NAME as an additional character class of CTYPE. A name that
   is already present is reported through lr_error; reaching
   MAX_NR_CHARCLASS is reported through error with status 2.
   NOTE(review): the return type, the third parameter line, the
   declaration of cnt, the braces and the statements on lines 1231 and
   1236 are not visible in this listing; presumably they stop the search
   at the first match and leave after the duplicate report - confirm
   against the full file. */
1224 ctype_class_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
/* Search the already known class names for NAME. */
1229 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1230 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
/* cnt below nr_charclass is the duplicate case. */
1233 if (cnt
< ctype
->nr_charclass
)
1235 lr_error (lr
, _("character class `%s' already defined"), name
);
/* No free slot is left once nr_charclass equals MAX_NR_CHARCLASS. */
1239 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
1240 /* Exit code 2 is prescribed in P1003.2b. */
1241 WITH_CUR_LOCALE (error (2, 0, _("\
1242 implementation limit: no more than %Zd character classes allowed"),
/* Store the NAME pointer itself (no copy is made) and count the class. */
1245 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
/* Register NAME as an additional character map of CTYPE and allocate
   its table. A name that is already present is reported through
   lr_error; reaching MAX_NR_CHARMAP is reported through error with
   status 2.
   NOTE(review): the return type, the declaration of cnt, the braces and
   several statements (for example lines 1259, 1268 and 1279-1281) are
   not visible in this listing - confirm details against the full
   file. */
1250 ctype_map_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1251 const char *name
, const struct charmap_t
*charmap
)
/* Largest map_collection_max value seen among the existing maps. */
1253 size_t max_chars
= 0;
/* Walk the existing maps: look for NAME and track the largest size. */
1256 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1258 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
1261 if (max_chars
< ctype
->map_collection_max
[cnt
])
1262 max_chars
= ctype
->map_collection_max
[cnt
];
/* cnt below map_collection_nr is the duplicate case. */
1265 if (cnt
< ctype
->map_collection_nr
)
1267 lr_error (lr
, _("character map `%s' already defined"), name
);
1271 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
1272 /* Exit code 2 is prescribed in P1003.2b. */
1273 WITH_CUR_LOCALE (error (2, 0, _("\
1274 implementation limit: no more than %d character maps allowed"),
/* The NAME pointer itself is stored; no copy is made. */
1277 ctype
->mapnames
[cnt
] = name
;
/* Lines 1280 and 1282 assign two different sizes to the same slot:
   256 or 512 depending on charmap->mb_cur_max, or max_chars. The
   condition selecting between them is not visible here; presumably the
   default is used only while no other map exists - confirm. */
1280 ctype
->map_collection_max
[cnt
] = charmap
->mb_cur_max
== 1 ? 256 : 512;
1282 ctype
->map_collection_max
[cnt
] = max_chars
;
/* Allocate map_collection_max[cnt] entries of uint32_t via xcalloc. */
1284 ctype
->map_collection
[cnt
] = (uint32_t *)
1285 xcalloc (sizeof (uint32_t), ctype
->map_collection_max
[cnt
]);
/* The used count of a fresh map starts at 256. */
1286 ctype
->map_collection_act
[cnt
] = 256;
1288 ++ctype
->map_collection_nr
;
1292 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1293 is possible if we only want to extend the name array. */
/* Return the address of the entry that belongs to character IDX in
   *TABLE, or NULL. Visible behaviour: one path returns &(*table)[idx]
   directly (NULL when TABLE is NULL); otherwise IDX is translated into
   a position cnt through ctype->charnames, IDX is appended to
   charnames when it is not yet known, and *TABLE is enlarged with the
   new tail cleared when cnt does not fit.
   NOTE(review): the return type, the declaration of cnt and the
   conditions guarding each path are on lines missing from this listing
   (presumably the direct path is taken for idx below 256) - confirm
   against the full file. */
1295 find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
, size_t *max
,
1296 size_t *act
, uint32_t idx
)
/* Direct path: IDX itself is the table position. */
1301 return table
== NULL
? NULL
: &(*table
)[idx
];
1303 /* Use the charnames_idx lookup table instead of the slow search loop. */
/* Three ways of obtaining cnt follow (lookup table, charnames_act, and
   a linear scan starting at 256); the lines selecting between them are
   not visible here. */
1305 cnt
= idx_table_get (&ctype
->charnames_idx
, idx
);
1308 cnt
= ctype
->charnames_act
;
1310 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
1311 if (ctype
->charnames
[cnt
] == idx
)
1315 /* We have to distinguish two cases: the name is found or not. */
1316 if (cnt
== ctype
->charnames_act
)
1318 /* Extend the name array. */
/* Capacity is doubled when the array is full. */
1319 if (ctype
->charnames_act
== ctype
->charnames_max
)
1321 ctype
->charnames_max
*= 2;
1322 ctype
->charnames
= (uint32_t *)
1323 xrealloc (ctype
->charnames
,
1324 sizeof (uint32_t) * ctype
->charnames_max
);
/* Record IDX at position cnt and mirror it in the lookup table. */
1326 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
1327 idx_table_add (&ctype
->charnames_idx
, idx
, cnt
);
1331 /* We have done everything we are asked to do. */
1335 /* The caller does not want to extend the table. */
1336 return (cnt
>= *act
? NULL
: &(*table
)[cnt
]);
/* Growth path: old_max keeps the previous capacity so that only the
   newly added tail is cleared by the memset below. */
1342 size_t old_max
= *max
;
1345 while (*max
<= cnt
);
1348 (uint32_t *) xrealloc (*table
, *max
* sizeof (uint32_t));
1349 memset (&(*table
)[old_max
], '\0',
1350 (*max
- old_max
) * sizeof (uint32_t));
1356 return &(*table
)[cnt
];
/* Resolve the token NOW into a charmap byte sequence stored in *SEQP
   and a UCS4 value stored in *WCHP. Three token kinds are handled in
   the visible lines: tok_bsymbol, tok_ucs4 and tok_charcode.
   NOTE(review): the return type, the return statements, the
   declaration of utmp and most conditions inside the tok_ucs4 branch
   are on lines missing from this listing - confirm against the full
   file. */
1361 get_character (struct token
*now
, const struct charmap_t
*charmap
,
1362 struct repertoire_t
*repertoire
,
1363 struct charseq
**seqp
, uint32_t *wchp
)
1365 if (now
->tok
== tok_bsymbol
)
1367 /* This will hopefully be the normal case. */
/* Both lookups use the symbol text startmb with length lenmb. */
1368 *wchp
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1369 now
->val
.str
.lenmb
);
1370 *seqp
= charmap_find_value (charmap
, now
->val
.str
.startmb
,
1371 now
->val
.str
.lenmb
);
1373 else if (now
->tok
== tok_ucs4
)
/* Build the name U followed by eight hex digits and look up those
   nine characters in the charmap. */
1377 snprintf (utmp
, sizeof (utmp
), "U%08X", now
->val
.ucs4
);
1378 *seqp
= charmap_find_value (charmap
, utmp
, 9);
/* Further sources tried in the visible lines: the sequence table of
   the repertoire, then the symbolic name of the repertoire resolved
   through the charmap. */
1381 *seqp
= repertoire_find_seq (repertoire
, now
->val
.ucs4
);
1385 /* Compute the value in the charmap from the UCS value. */
1386 const char *symbol
= repertoire_find_symbol (repertoire
,
1392 *seqp
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1396 if (repertoire
!= NULL
)
1398 /* Insert a negative entry. */
/* The key is a copy of the UCS4 value allocated from the obstack of
   the repertoire; the value is the shared static record whose ucs4
   field is ILLEGAL_CHAR_VALUE. */
1399 static const struct charseq negative
1400 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1401 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1403 *newp
= now
->val
.ucs4
;
1405 insert_entry (&repertoire
->seq_table
, newp
,
1406 sizeof (uint32_t), (void *) &negative
);
1410 (*seqp
)->ucs4
= now
->val
.ucs4
;
1412 else if ((*seqp
)->ucs4
!= now
->val
.ucs4
)
/* For a tok_ucs4 token the wide value is the token value itself. */
1415 *wchp
= now
->val
.ucs4
;
1417 else if (now
->tok
== tok_charcode
)
1419 /* We must map from the byte code to UCS4. */
1420 *seqp
= charmap_find_symbol (charmap
, now
->val
.str
.startmb
,
1421 now
->val
.str
.lenmb
);
1424 *wchp
= ILLEGAL_CHAR_VALUE
;
/* The ucs4 field of the sequence is filled in lazily from the
   repertoire the first time it is needed. */
1427 if ((*seqp
)->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1428 (*seqp
)->ucs4
= repertoire_find_value (repertoire
, (*seqp
)->name
,
1429 strlen ((*seqp
)->name
));
1430 *wchp
= (*seqp
)->ucs4
;
1440 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1441 the .(2). counterparts. */
/* Expand a range given by two symbolic names, LAST_STR and the name in
   NOW. Both names must have the same length; they share a prefix and
   end in a number written in BASE. For every generated name the class
   bits CLASS256_BIT and CLASS_BIT are set, and with HANDLE_DIGITS 1 or
   2 the character is also recorded as a digit or an outdigit.
   NOTE(review): the return type, the NOW parameter line, several
   declarations (cp, endp, wch), the braces and the error exits are on
   lines missing from this listing - confirm against the full file. */
1443 charclass_symbolic_ellipsis (struct linereader
*ldfile
,
1444 struct locale_ctype_t
*ctype
,
1445 const struct charmap_t
*charmap
,
1446 struct repertoire_t
*repertoire
,
1448 const char *last_str
,
1449 unsigned long int class256_bit
,
1450 unsigned long int class_bit
, int base
,
1451 int ignore_content
, int handle_digits
, int step
)
1453 const char *nowstr
= now
->val
.str
.startmb
;
/* Scratch buffer for generated names: same length as the end name
   plus the terminating NUL. */
1454 char tmp
[now
->val
.str
.lenmb
+ 1];
1457 unsigned long int from
;
1458 unsigned long int to
;
1460 /* We have to compute the ellipsis values using the symbolic names. */
1461 assert (last_str
!= NULL
);
1463 if (strlen (last_str
) != now
->val
.str
.lenmb
)
1467 _("`%s' and `%.*s' are no valid names for symbolic range"),
1468 last_str
, (int) now
->val
.str
.lenmb
, nowstr
);
1472 if (memcmp (last_str
, nowstr
, now
->val
.str
.lenmb
) == 0)
1473 /* Nothing to do, the names are the same. */
/* Skip the prefix both names have in common; cp ends at the first
   differing character of LAST_STR. */
1476 for (cp
= last_str
; *cp
== *(nowstr
+ (cp
- last_str
)); ++cp
)
/* NOTE(review): strtoul reports overflow by returning ULONG_MAX, and
   from and to are unsigned long, so the comparisons with UINT_MAX
   below look wrong where unsigned long is wider than unsigned int -
   confirm. Whether errno is cleared before the calls is not visible
   here. */
1480 from
= strtoul (cp
, &endp
, base
);
1481 if ((from
== UINT_MAX
&& errno
== ERANGE
) || *endp
!= '\0')
1484 to
= strtoul (nowstr
+ (cp
- last_str
), &endp
, base
);
/* The end number must consume the rest of the name and must be
   strictly larger than the start number. */
1485 if ((to
== UINT_MAX
&& errno
== ERANGE
)
1486 || (endp
- nowstr
) != now
->val
.str
.lenmb
|| from
>= to
)
1489 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1490 if (!ignore_content
)
/* Point the token at tmp so that get_character sees each generated
   name. */
1492 now
->val
.str
.startmb
= tmp
;
/* from is advanced by step before the test, so the start name itself
   is not processed again in this loop. */
1493 while ((from
+= step
) <= to
)
1495 struct charseq
*seq
;
/* Rebuild the name: common prefix followed by the zero padded number
   (decimal for base 10, upper case hex otherwise). */
1498 sprintf (tmp
, (base
== 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1499 (int) (cp
- last_str
), last_str
,
1500 (int) (now
->val
.str
.lenmb
- (cp
- last_str
)),
1503 get_character (now
, charmap
, repertoire
, &seq
, &wch
);
1505 if (seq
!= NULL
&& seq
->nbytes
== 1)
1506 /* Yep, we can store information about this byte sequence. */
1507 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1509 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1510 /* We have the UCS4 position. */
1511 *find_idx (ctype
, &ctype
->class_collection
,
1512 &ctype
->class_collection_max
,
1513 &ctype
->class_collection_act
, wch
) |= class_bit
;
1515 if (handle_digits
== 1)
1517 /* We must store the digit values. */
/* Both digit arrays are doubled together when mbdigits is full. */
1518 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1520 ctype
->mbdigits_max
*= 2;
1521 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1522 (ctype
->mbdigits_max
1523 * sizeof (char *)));
1524 ctype
->wcdigits_max
*= 2;
1525 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1526 (ctype
->wcdigits_max
1527 * sizeof (uint32_t)));
1530 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1531 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1533 else if (handle_digits
== 2)
1535 /* We must store the digit values. */
/* At most ten outdigit entries are accepted. */
1536 if (ctype
->outdigits_act
>= 10)
1538 lr_error (ldfile
, _("\
1539 %s: field `%s' does not contain exactly ten entries"),
1540 "LC_CTYPE", "outdigit");
1544 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1545 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1546 ++ctype
->outdigits_act
;
1553 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
/* Expand a range of UCS4 values that starts after LAST_WCH and ends at
   the value in NOW, advancing by STEP. For every value the matching
   byte sequence is looked up, the class bits are set, and with
   HANDLE_DIGITS 1 or 2 the character is also recorded as a digit or an
   outdigit.
   NOTE(review): the return type, the declaration of utmp, the braces
   and most of the conditions between the lookups are on lines missing
   from this listing - confirm against the full file. */
1555 charclass_ucs4_ellipsis (struct linereader
*ldfile
,
1556 struct locale_ctype_t
*ctype
,
1557 const struct charmap_t
*charmap
,
1558 struct repertoire_t
*repertoire
,
1559 struct token
*now
, uint32_t last_wch
,
1560 unsigned long int class256_bit
,
1561 unsigned long int class_bit
, int ignore_content
,
1562 int handle_digits
, int step
)
/* A start value above the end value is an error. The printed field
   width is 4 hex digits when both values are below 65536, else 8. */
1564 if (last_wch
> now
->val
.ucs4
)
1566 lr_error (ldfile
, _("\
1567 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1568 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, now
->val
.ucs4
,
1569 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, last_wch
);
1573 if (!ignore_content
)
/* last_wch is advanced by step before the test, so the start value
   itself is not processed again in this loop. */
1574 while ((last_wch
+= step
) <= now
->val
.ucs4
)
1576 /* We have to find out whether there is a byte sequence corresponding
1577 to this UCS4 value. */
1578 struct charseq
*seq
;
/* Two spellings of the name are looked up in the charmap: U plus
   eight hex digits (nine characters) and U plus four hex digits (five
   characters). */
1581 snprintf (utmp
, sizeof (utmp
), "U%08X", last_wch
);
1582 seq
= charmap_find_value (charmap
, utmp
, 9);
1585 snprintf (utmp
, sizeof (utmp
), "U%04X", last_wch
);
1586 seq
= charmap_find_value (charmap
, utmp
, 5);
1590 /* Try looking in the repertoire map. */
1591 seq
= repertoire_find_seq (repertoire
, last_wch
);
1593 /* If this is the first time we look for this sequence create a new
1597 static const struct charseq negative
1598 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1600 /* Find the symbolic name for this UCS4 value. */
1601 if (repertoire
!= NULL
)
1603 const char *symbol
= repertoire_find_symbol (repertoire
,
1605 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1610 /* We have a name, now search the multibyte value. */
1611 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1614 /* We have to create a fake entry. */
1615 seq
= (struct charseq
*) &negative
;
1617 seq
->ucs4
= last_wch
;
1619 insert_entry (&repertoire
->seq_table
, newp
, sizeof (uint32_t),
1623 /* We have to create a fake entry. */
1624 seq
= (struct charseq
*) &negative
;
1627 /* We have a name, now search the multibyte value. */
/* Only a sequence whose ucs4 field equals last_wch counts as a real
   match; the shared negative record carries ILLEGAL_CHAR_VALUE. */
1628 if (seq
->ucs4
== last_wch
&& seq
->nbytes
== 1)
1629 /* Yep, we can store information about this byte sequence. */
1630 ctype
->class256_collection
[(size_t) seq
->bytes
[0]]
1633 /* And of course we have the UCS4 position. */
1635 *find_idx (ctype
, &ctype
->class_collection
,
1636 &ctype
->class_collection_max
,
1637 &ctype
->class_collection_act
, last_wch
) |= class_bit
;
1639 if (handle_digits
== 1)
1641 /* We must store the digit values. */
/* Both digit arrays are doubled together when mbdigits is full. */
1642 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1644 ctype
->mbdigits_max
*= 2;
1645 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1646 (ctype
->mbdigits_max
1647 * sizeof (char *)));
1648 ctype
->wcdigits_max
*= 2;
1649 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1650 (ctype
->wcdigits_max
1651 * sizeof (uint32_t)));
1654 ctype
->mbdigits
[ctype
->mbdigits_act
++] = (seq
->ucs4
== last_wch
1656 ctype
->wcdigits
[ctype
->wcdigits_act
++] = last_wch
;
1658 else if (handle_digits
== 2)
1660 /* We must store the digit values. */
/* At most ten outdigit entries are accepted. */
1661 if (ctype
->outdigits_act
>= 10)
1663 lr_error (ldfile
, _("\
1664 %s: field `%s' does not contain exactly ten entries"),
1665 "LC_CTYPE", "outdigit");
1669 ctype
->mboutdigits
[ctype
->outdigits_act
] = (seq
->ucs4
== last_wch
1671 ctype
->wcoutdigits
[ctype
->outdigits_act
] = last_wch
;
1672 ++ctype
->outdigits_act
;
1678 /* Ellipsis as in `/xea/x12.../xea/x34'. */
/* Expand a range of byte codes that starts after LAST_CHARCODE (of
   LAST_CHARCODE_LEN bytes) and ends at the code in NOW. LAST_CHARCODE
   is incremented in place until it equals the end code; for every step
   the class bits are set, and with handle_digits 1 or 2 the character
   is also recorded as a digit or an outdigit.
   NOTE(review): the return type, the last parameter line, the
   declarations of i and wch, the braces and the loop keyword that
   belongs to the final while are on lines missing from this listing -
   confirm against the full file. */
1680 charclass_charcode_ellipsis (struct linereader
*ldfile
,
1681 struct locale_ctype_t
*ctype
,
1682 const struct charmap_t
*charmap
,
1683 struct repertoire_t
*repertoire
,
1684 struct token
*now
, char *last_charcode
,
1685 uint32_t last_charcode_len
,
1686 unsigned long int class256_bit
,
1687 unsigned long int class_bit
, int ignore_content
,
1690 /* First check whether the to-value is larger. */
1691 if (now
->val
.charcode
.nbytes
!= last_charcode_len
)
1693 lr_error (ldfile
, _("\
1694 start and end character sequence of range must have the same length"));
/* memcmp orders the two codes byte by byte from the first byte on. */
1698 if (memcmp (last_charcode
, now
->val
.charcode
.bytes
, last_charcode_len
) > 0)
1700 lr_error (ldfile
, _("\
1701 to-value character sequence is smaller than from-value sequence"));
1705 if (!ignore_content
)
1709 /* Increment the byte sequence value. */
1710 struct charseq
*seq
;
/* The code is bumped like a counter starting at its last byte; a byte
   that becomes 0 has wrapped, so presumably the loop goes on to the
   previous byte only in that case (the statement on line 1716 is not
   visible). NOTE(review): the test i >= 0 needs a signed i while
   last_charcode_len is uint32_t - confirm the type of i. */
1714 for (i
= last_charcode_len
- 1; i
>= 0; --i
)
1715 if (++last_charcode
[i
] != 0)
1718 if (last_charcode_len
== 1)
1719 /* Of course we have the charcode value. */
/* NOTE(review): last_charcode points to plain char; where char is
   signed a byte above 127 becomes a huge index after the cast to
   size_t - confirm. */
1720 ctype
->class256_collection
[(size_t) last_charcode
[0]]
1723 /* Find the symbolic name. */
1724 seq
= charmap_find_symbol (charmap
, last_charcode
,
/* NOTE(review): seq is dereferenced here while line 1731 still tests
   it against NULL; presumably a guard sits on the missing lines
   1726-1727 - confirm. */
1728 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1729 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1730 strlen (seq
->name
));
1731 wch
= seq
== NULL
? ILLEGAL_CHAR_VALUE
: seq
->ucs4
;
1733 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1734 *find_idx (ctype
, &ctype
->class_collection
,
1735 &ctype
->class_collection_max
,
1736 &ctype
->class_collection_act
, wch
) |= class_bit
;
1739 wch
= ILLEGAL_CHAR_VALUE
;
1741 if (handle_digits
== 1)
1743 /* We must store the digit values. */
/* Both digit arrays are doubled together when mbdigits is full. */
1744 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1746 ctype
->mbdigits_max
*= 2;
1747 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1748 (ctype
->mbdigits_max
1749 * sizeof (char *)));
1750 ctype
->wcdigits_max
*= 2;
1751 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1752 (ctype
->wcdigits_max
1753 * sizeof (uint32_t)));
/* A fresh record is allocated with the code bytes copied directly
   behind the structure; nbytes is the only field set in the visible
   lines. */
1756 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1757 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1758 seq
->nbytes
= last_charcode_len
;
1760 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1761 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1763 else if (handle_digits
== 2)
1765 struct charseq
*seq
;
1766 /* We must store the digit values. */
/* At most ten outdigit entries are accepted. */
1767 if (ctype
->outdigits_act
>= 10)
1769 lr_error (ldfile
, _("\
1770 %s: field `%s' does not contain exactly ten entries"),
1771 "LC_CTYPE", "outdigit");
/* Same record layout as in the digit case above. */
1775 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1776 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1777 seq
->nbytes
= last_charcode_len
;
1779 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1780 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1781 ++ctype
->outdigits_act
;
/* Repeat until the incremented code equals the end code in NOW. */
1784 while (memcmp (last_charcode
, now
->val
.charcode
.bytes
,
1785 last_charcode_len
) != 0);
/* Search the transliteration data of CTYPE for the character wch.
   First the translit list is scanned for an entry whose from string is
   exactly that one character, and its replacement strings are checked
   against CHARMAP. Then the translit_ignore ranges are checked.
   NOTE(review): the return type, the wch parameter line, the
   declarations of i, utmp and wi, the braces and several return and
   break statements are on lines missing from this listing - confirm
   against the full file. */
1791 find_translit2 (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
1794 struct translit_t
*trunp
= ctype
->translit
;
1795 struct translit_ignore_t
*tirunp
= ctype
->translit_ignore
;
1797 while (trunp
!= NULL
)
1799 /* XXX We simplify things here. The transliterations we look
1800 for are only allowed to have one character. */
1801 if (trunp
->from
[0] == wch
&& trunp
->from
[1] == 0)
1803 /* Found it. Now look for a transliteration which can be
1804 represented with the character set. */
1805 struct translit_to_t
*torunp
= trunp
->to
;
1807 while (torunp
!= NULL
)
/* Every character of the candidate is looked up in the charmap under
   the name U plus eight hex digits. */
1811 for (i
= 0; torunp
->str
[i
] != 0; ++i
)
1815 snprintf (utmp
, sizeof (utmp
), "U%08X", torunp
->str
[i
]);
1816 if (charmap_find_value (charmap
, utmp
, 9) == NULL
)
1817 /* This character cannot be represented. */
/* Reaching the terminating 0 means every character was found. */
1821 if (torunp
->str
[i
] == 0)
1824 torunp
= torunp
->next
;
1830 trunp
= trunp
->next
;
1833 /* Check for ignored chars. */
/* NOTE(review): no statement advancing tirunp is visible in this
   listing; verify that the loop terminates when an entry does not
   match. */
1834 while (tirunp
!= NULL
)
1836 if (tirunp
->from
<= wch
&& tirunp
->to
>= wch
)
/* Walk the range in units of step up to wch. */
1840 for (wi
= tirunp
->from
; wi
<= wch
; wi
+= tirunp
->step
)
/* NOTE(review): this compound literal is created inside a block, so
   it has automatic storage duration and the returned pointer is
   dangling once the function has returned - confirm and consider a
   static object. */
1842 return (uint32_t []) { 0 };
1846 /* Nothing found. */
/* Look up a transliteration for wch in the LC_CTYPE data of LOCALE.
   The translit list of the locale is consulted through find_translit2;
   the translit_include list is then walked, calling find_translit
   recursively on each included locale until a result is found.
   NOTE(review): the return type, the wch parameter line, the braces,
   the condition in front of the include walk, the remaining find_locale
   arguments and the return statement are on lines missing from this
   listing - confirm against the full file. */
1852 find_translit (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
1855 struct locale_ctype_t
*ctype
;
1856 uint32_t *result
= NULL
;
1858 assert (locale
!= NULL
);
1859 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
/* find_translit2 is only called when the locale has a translit list. */
1861 if (ctype
->translit
!= NULL
)
1862 result
= find_translit2 (ctype
, charmap
, wch
);
1866 struct translit_include_t
*irunp
= ctype
->translit_include
;
/* Stop at the first included locale that yields a result. */
1868 while (irunp
!= NULL
&& result
== NULL
)
1870 result
= find_translit (find_locale (CTYPE_LOCALE
,
1872 irunp
->copy_repertoire
,
1875 irunp
= irunp
->next
;
1883 /* Read one transliteration entry. */
/* Convert the token NOW into a wide character string. The visible
   branches handle tok_default_missing, tok_bsymbol, tok_ucs4,
   tok_charcode and tok_string; single characters get a freshly
   allocated array of two uint32_t. The value (uint32_t *) -1l is
   returned after a syntax error.
   NOTE(review): the return type, the declaration of wstr, the braces,
   the stores into the second array element and the other return
   statements are on lines missing from this listing - confirm against
   the full file. */
1885 read_widestring (struct linereader
*ldfile
, struct token
*now
,
1886 const struct charmap_t
*charmap
,
1887 struct repertoire_t
*repertoire
)
/* NOTE(review): the expression on line 1893 is a compound literal of
   pointer type initialised with 0, that is a null pointer, not a
   pointer to an empty string - confirm this is what the existing
   comment means. */
1891 if (now
->tok
== tok_default_missing
)
1892 /* The special name "" will denote this case. */
1893 wstr
= ((uint32_t *) { 0 });
1894 else if (now
->tok
== tok_bsymbol
)
1896 /* Get the value from the repertoire. */
1897 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1898 wstr
[0] = repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1899 now
->val
.str
.lenmb
);
1900 if (wstr
[0] == ILLEGAL_CHAR_VALUE
)
1902 /* We cannot proceed, we don't know the UCS4 value. */
1909 else if (now
->tok
== tok_ucs4
)
/* The token already carries the UCS4 value. */
1911 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1912 wstr
[0] = now
->val
.ucs4
;
1915 else if (now
->tok
== tok_charcode
)
1917 /* Argh, we have to convert to the symbol name first and then to the
1919 struct charseq
*seq
= charmap_find_symbol (charmap
,
1920 now
->val
.str
.startmb
,
1921 now
->val
.str
.lenmb
);
1923 /* Cannot find the UCS4 value. */
/* The ucs4 field of the sequence is filled in lazily from the
   repertoire the first time it is needed. */
1926 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1927 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1928 strlen (seq
->name
));
1929 if (seq
->ucs4
== ILLEGAL_CHAR_VALUE
)
1930 /* We cannot proceed, we don't know the UCS4 value. */
1933 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1934 wstr
[0] = seq
->ucs4
;
1937 else if (now
->tok
== tok_string
)
/* The wide string of the token is used directly, without a copy. */
1939 wstr
= now
->val
.str
.startwc
;
1940 if (wstr
== NULL
|| wstr
[0] == 0)
/* Error path: skip the rest of the line unless it has already ended,
   report the syntax error and return the error marker. */
1945 if (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1946 lr_ignore_rest (ldfile
, 0);
1947 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1948 return (uint32_t *) -1l;
/* Read one transliteration definition: the from string comes from NOW,
   followed by one or more replacement strings. The new translit_t
   record and its replacement list are allocated from the obstack of
   CTYPE, and the finished record is pushed onto the front of
   ctype->translit when the end of the line is reached.
   NOTE(review): the return type, the declarations of first and
   to_wstr, the loop construct, the braces and the return statements
   are on lines missing from this listing - confirm against the full
   file. */
1956 read_translit_entry (struct linereader
*ldfile
, struct locale_ctype_t
*ctype
,
1957 struct token
*now
, const struct charmap_t
*charmap
,
1958 struct repertoire_t
*repertoire
)
1960 uint32_t *from_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1961 struct translit_t
*result
;
1962 struct translit_to_t
**top
;
1963 struct obstack
*ob
= &ctype
->mempool
;
/* NOTE(review): only NULL is tested here in the visible lines, while
   read_widestring can also return the error marker (uint32_t *) -1l -
   confirm that case is handled on the missing lines. */
1967 if (from_wstr
== NULL
)
1968 /* There is no valid from string. */
/* The record remembers the file name and line number of its
   definition. */
1971 result
= (struct translit_t
*) obstack_alloc (ob
,
1972 sizeof (struct translit_t
));
1973 result
->from
= from_wstr
;
1974 result
->fname
= ldfile
->fname
;
1975 result
->lineno
= ldfile
->lineno
;
1976 result
->next
= NULL
;
1986 /* Next we have one or more transliterations. They are
1987 separated by semicolons. */
1988 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
1990 if (!first
&& (now
->tok
== tok_semicolon
|| now
->tok
== tok_eol
))
1992 /* One string read. */
1993 const uint32_t zero
= 0;
/* Terminate the string collected in the obstack with a 0 element of
   4 bytes and close the growing object. */
1997 obstack_grow (ob
, &zero
, 4);
1998 to_wstr
= obstack_finish (ob
);
/* Append a new list node through top, which addresses the link field
   to be filled in next. */
2000 *top
= obstack_alloc (ob
, sizeof (struct translit_to_t
));
2001 (*top
)->str
= to_wstr
;
2002 (*top
)->next
= NULL
;
2005 if (now
->tok
== tok_eol
)
/* End of line: push the finished record onto the front of the list. */
2007 result
->next
= ctype
->translit
;
2008 ctype
->translit
= result
;
2013 top
= &(*top
)->next
;
2018 to_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
2019 if (to_wstr
== (uint32_t *) -1l)
2021 /* An error occurred. */
/* Freeing result releases it together with everything allocated from
   the obstack after it. */
2022 obstack_free (ob
, result
);
2026 if (to_wstr
== NULL
)
2029 /* This value is usable. */
/* The uint32_t string is measured with wcslen and copied with a hard
   coded 4 bytes per element; this assumes wchar_t is 4 bytes wide. */
2030 obstack_grow (ob
, to_wstr
, wcslen ((wchar_t *) to_wstr
) * 4);
/* Parse the operands of a translit_ignore definition. Each operand is
   a symbolic name or a UCS4 value, optionally followed by an ellipsis
   and an end value; a translit_ignore_t record is allocated from the
   obstack of CTYPE for it and pushed onto the front of
   ctype->translit_ignore.
   NOTE(review): the return type, the loop construct, the declarations
   of from and to, the stores into the fields of the new record, the
   braces and the return statements are on lines missing from this
   listing - confirm against the full file. */
2039 read_translit_ignore_entry (struct linereader
*ldfile
,
2040 struct locale_ctype_t
*ctype
,
2041 const struct charmap_t
*charmap
,
2042 struct repertoire_t
*repertoire
)
2044 /* We expect a semicolon-separated list of characters we ignore. We are
2045 only interested in the wide character definitions. These must be
2046 single characters, possibly defining a range when an ellipsis is used. */
2049 struct token
*now
= lr_token (ldfile
, charmap
, NULL
, repertoire
,
2051 struct translit_ignore_t
*newp
;
2054 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2057 _("premature end of `translit_ignore' definition"));
/* Only symbolic names and UCS4 values are accepted as operands. */
2061 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2063 lr_error (ldfile
, _("syntax error"));
2064 lr_ignore_rest (ldfile
, 0);
2068 if (now
->tok
== tok_ucs4
)
2069 from
= now
->val
.ucs4
;
2071 /* Try to get the value. */
2072 from
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2073 now
->val
.str
.lenmb
);
/* NOTE(review): unlike the other diagnostics in this function the
   message below is not wrapped in the translation macro. */
2075 if (from
== ILLEGAL_CHAR_VALUE
)
2077 lr_error (ldfile
, "invalid character name");
2082 newp
= (struct translit_ignore_t
*)
2083 obstack_alloc (&ctype
->mempool
, sizeof (struct translit_ignore_t
));
/* Push the new record onto the front of the list. */
2088 newp
->next
= ctype
->translit_ignore
;
2089 ctype
->translit_ignore
= newp
;
2092 /* Now we expect either a semicolon, an ellipsis, or the end of the
2094 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2096 if (now
->tok
== tok_ellipsis2
|| now
->tok
== tok_ellipsis2_2
)
2098 /* XXX Should we bother implementing `....'? `...' certainly
2099 will not be implemented. */
/* The tok_ellipsis2_2 form selects every second value. */
2101 int step
= now
->tok
== tok_ellipsis2_2
? 2 : 1;
2103 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2105 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2108 _("premature end of `translit_ignore' definition"));
2112 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2114 lr_error (ldfile
, _("syntax error"));
2115 lr_ignore_rest (ldfile
, 0);
2119 if (now
->tok
== tok_ucs4
)
2122 /* Try to get the value. */
2123 to
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2124 now
->val
.str
.lenmb
);
2126 if (to
== ILLEGAL_CHAR_VALUE
)
2127 lr_error (ldfile
, "invalid character name");
2130 /* Make sure the `to'-value is larger. */
/* The printed field width is 4 hex digits when both values are below
   65536, else 8. */
2137 lr_error (ldfile
, _("\
2138 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2139 (to
| from
) < 65536 ? 4 : 8, to
,
2140 (to
| from
) < 65536 ? 4 : 8, from
);
2143 /* And the next token. */
2144 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
/* After an operand only the end of the input line or a semicolon is
   acceptable. */
2147 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2151 if (now
->tok
== tok_semicolon
)
2155 /* If we come here something is wrong. */
2156 lr_error (ldfile
, _("syntax error"));
2157 lr_ignore_rest (ldfile
, 0);
2163 /* The parser for the LC_CTYPE section of the locale definition. */
2165 ctype_read (struct linereader
*ldfile
, struct localedef_t
*result
,
2166 const struct charmap_t
*charmap
, const char *repertoire_name
,
2169 struct repertoire_t
*repertoire
= NULL
;
2170 struct locale_ctype_t
*ctype
;
2172 enum token_t nowtok
;
2174 struct charseq
*last_seq
;
2175 uint32_t last_wch
= 0;
2176 enum token_t last_token
;
2177 enum token_t ellipsis_token
;
2179 char last_charcode
[16];
2180 size_t last_charcode_len
= 0;
2181 const char *last_str
= NULL
;
2183 struct localedef_t
*copy_locale
= NULL
;
2185 /* Get the repertoire we have to use. */
2186 if (repertoire_name
!= NULL
)
2187 repertoire
= repertoire_read (repertoire_name
);
2189 /* The rest of the line containing `LC_CTYPE' must be free. */
2190 lr_ignore_rest (ldfile
, 1);
2195 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2198 while (nowtok
== tok_eol
);
2200 /* If we see `copy' now we are almost done. */
2201 if (nowtok
== tok_copy
)
2203 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2204 if (now
->tok
!= tok_string
)
2206 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2210 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2211 while (now
->tok
!= tok_eof
&& now
->tok
!= tok_end
);
2213 if (now
->tok
!= tok_eof
2214 || (now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
),
2215 now
->tok
== tok_eof
))
2216 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2217 else if (now
->tok
!= tok_lc_ctype
)
2219 lr_error (ldfile
, _("\
2220 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2221 lr_ignore_rest (ldfile
, 0);
2224 lr_ignore_rest (ldfile
, 1);
2229 if (! ignore_content
)
2231 /* Get the locale definition. */
2232 copy_locale
= load_locale (LC_CTYPE
, now
->val
.str
.startmb
,
2233 repertoire_name
, charmap
, NULL
);
2234 if ((copy_locale
->avail
& CTYPE_LOCALE
) == 0)
2236 /* Not yet loaded. So do it now. */
2237 if (locfile_read (copy_locale
, charmap
) != 0)
2242 lr_ignore_rest (ldfile
, 1);
2244 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2248 /* Prepare the data structures. */
2249 ctype_startup (ldfile
, result
, charmap
, copy_locale
, ignore_content
);
2250 ctype
= result
->categories
[LC_CTYPE
].ctype
;
2252 /* Remember the repertoire we use. */
2253 if (!ignore_content
)
2254 ctype
->repertoire
= repertoire
;
2258 unsigned long int class_bit
= 0;
2259 unsigned long int class256_bit
= 0;
2260 int handle_digits
= 0;
2262 /* Of course we don't proceed beyond the end of file. */
2263 if (nowtok
== tok_eof
)
2266 /* Ignore empty lines. */
2267 if (nowtok
== tok_eol
)
2269 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2277 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2278 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2280 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2281 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2282 if (now
->tok
!= tok_semicolon
)
2284 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2286 if (now
->tok
!= tok_eol
)
2288 %s: syntax error in definition of new character class"), "LC_CTYPE");
2292 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2293 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2295 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2296 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2297 if (now
->tok
!= tok_semicolon
)
2299 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2301 if (now
->tok
!= tok_eol
)
2303 %s: syntax error in definition of new character map"), "LC_CTYPE");
2307 /* Ignore the rest of the line if we don't need the input of
2311 lr_ignore_rest (ldfile
, 0);
2315 /* We simply forget the `class' keyword and use the following
2316 operand to determine the bit. */
2317 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2318 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2320 /* This can be one of the predefined class names. */
2321 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2322 if (strcmp (ctype
->classnames
[cnt
], now
->val
.str
.startmb
) == 0)
2324 if (cnt
>= ctype
->nr_charclass
)
2326 #ifdef PREDEFINED_CLASSES
2327 if (now
->val
.str
.lenmb
== 8
2328 && memcmp ("special1", now
->val
.str
.startmb
, 8) == 0)
2329 class_bit
= _ISwspecial1
;
2330 else if (now
->val
.str
.lenmb
== 8
2331 && memcmp ("special2", now
->val
.str
.startmb
, 8) == 0)
2332 class_bit
= _ISwspecial2
;
2333 else if (now
->val
.str
.lenmb
== 8
2334 && memcmp ("special3", now
->val
.str
.startmb
, 8) == 0)
2335 class_bit
= _ISwspecial3
;
2339 /* OK, it's a new class. */
2340 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2342 class_bit
= _ISwbit (ctype
->nr_charclass
- 1);
2347 class_bit
= _ISwbit (cnt
);
2349 free (now
->val
.str
.startmb
);
2352 else if (now
->tok
== tok_digit
)
2353 goto handle_tok_digit
;
2354 else if (now
->tok
< tok_upper
|| now
->tok
> tok_blank
)
2358 class_bit
= BITw (now
->tok
);
2359 class256_bit
= BIT (now
->tok
);
2362 /* The next character must be a semicolon. */
2363 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2364 if (now
->tok
!= tok_semicolon
)
2366 goto read_charclass
;
2379 /* Ignore the rest of the line if we don't need the input of
2383 lr_ignore_rest (ldfile
, 0);
2387 class_bit
= BITw (now
->tok
);
2388 class256_bit
= BIT (now
->tok
);
2391 ctype
->class_done
|= class_bit
;
2392 last_token
= tok_none
;
2393 ellipsis_token
= tok_none
;
2395 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2396 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2399 struct charseq
*seq
;
2401 if (ellipsis_token
== tok_none
)
2403 if (get_character (now
, charmap
, repertoire
, &seq
, &wch
))
2406 if (!ignore_content
&& seq
!= NULL
&& seq
->nbytes
== 1)
2407 /* Yep, we can store information about this byte
2409 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
2411 if (!ignore_content
&& wch
!= ILLEGAL_CHAR_VALUE
2413 /* We have the UCS4 position. */
2414 *find_idx (ctype
, &ctype
->class_collection
,
2415 &ctype
->class_collection_max
,
2416 &ctype
->class_collection_act
, wch
) |= class_bit
;
2418 last_token
= now
->tok
;
2419 /* Terminate the string. */
2420 if (last_token
== tok_bsymbol
)
2422 now
->val
.str
.startmb
[now
->val
.str
.lenmb
] = '\0';
2423 last_str
= now
->val
.str
.startmb
;
2429 memcpy (last_charcode
, now
->val
.charcode
.bytes
, 16);
2430 last_charcode_len
= now
->val
.charcode
.nbytes
;
2432 if (!ignore_content
&& handle_digits
== 1)
2434 /* We must store the digit values. */
2435 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
2437 ctype
->mbdigits_max
+= 10;
2438 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
2439 (ctype
->mbdigits_max
2440 * sizeof (char *)));
2441 ctype
->wcdigits_max
+= 10;
2442 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
2443 (ctype
->wcdigits_max
2444 * sizeof (uint32_t)));
2447 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
2448 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
2450 else if (!ignore_content
&& handle_digits
== 2)
2452 /* We must store the digit values. */
2453 if (ctype
->outdigits_act
>= 10)
2455 lr_error (ldfile
, _("\
2456 %s: field `%s' does not contain exactly ten entries"),
2457 "LC_CTYPE", "outdigit");
2458 lr_ignore_rest (ldfile
, 0);
2462 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
2463 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
2464 ++ctype
->outdigits_act
;
2469 /* Now it gets complicated. We have to resolve the
2470 ellipsis problem. First we must distinguish between
2471 the different kinds of ellipsis and this must match the
2472 tokens we have seen. */
2473 assert (last_token
!= tok_none
);
2475 if (last_token
!= now
->tok
)
2477 lr_error (ldfile
, _("\
2478 ellipsis range must be marked by two operands of same type"));
2479 lr_ignore_rest (ldfile
, 0);
2483 if (last_token
== tok_bsymbol
)
2485 if (ellipsis_token
== tok_ellipsis3
)
2486 lr_error (ldfile
, _("with symbolic name range values \
2487 the absolute ellipsis `...' must not be used"));
2489 charclass_symbolic_ellipsis (ldfile
, ctype
, charmap
,
2490 repertoire
, now
, last_str
,
2491 class256_bit
, class_bit
,
2496 handle_digits
, step
);
2498 else if (last_token
== tok_ucs4
)
2500 if (ellipsis_token
!= tok_ellipsis2
)
2501 lr_error (ldfile
, _("\
2502 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2504 charclass_ucs4_ellipsis (ldfile
, ctype
, charmap
,
2505 repertoire
, now
, last_wch
,
2506 class256_bit
, class_bit
,
2507 ignore_content
, handle_digits
,
2512 assert (last_token
== tok_charcode
);
2514 if (ellipsis_token
!= tok_ellipsis3
)
2515 lr_error (ldfile
, _("\
2516 with character code range values one must use the absolute ellipsis `...'"));
2518 charclass_charcode_ellipsis (ldfile
, ctype
, charmap
,
2522 class256_bit
, class_bit
,
2527 /* Now we have used the last value. */
2528 last_token
= tok_none
;
2531 /* Next we expect a semicolon or the end of the line. */
2532 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2533 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2536 if (last_token
!= tok_none
2537 && now
->tok
>= tok_ellipsis2
&& now
->tok
<= tok_ellipsis4_2
)
2539 if (now
->tok
== tok_ellipsis2_2
)
2541 now
->tok
= tok_ellipsis2
;
2544 else if (now
->tok
== tok_ellipsis4_2
)
2546 now
->tok
= tok_ellipsis4
;
2550 ellipsis_token
= now
->tok
;
2552 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2556 if (now
->tok
!= tok_semicolon
)
2559 /* And get the next character. */
2560 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2562 ellipsis_token
= tok_none
;
2568 /* Ignore the rest of the line if we don't need the input of
2572 lr_ignore_rest (ldfile
, 0);
2577 class_bit
= _ISwdigit
;
2578 class256_bit
= _ISdigit
;
2580 goto read_charclass
;
2583 /* Ignore the rest of the line if we don't need the input of
2587 lr_ignore_rest (ldfile
, 0);
2591 if (ctype
->outdigits_act
!= 0)
2592 lr_error (ldfile
, _("\
2593 %s: field `%s' declared more than once"),
2594 "LC_CTYPE", "outdigit");
2598 goto read_charclass
;
2601 /* Ignore the rest of the line if we don't need the input of
2605 lr_ignore_rest (ldfile
, 0);
2613 /* Ignore the rest of the line if we don't need the input of
2617 lr_ignore_rest (ldfile
, 0);
2625 /* Ignore the rest of the line if we don't need the input of
2629 lr_ignore_rest (ldfile
, 0);
2633 /* We simply forget the `map' keyword and use the following
2634 operand to determine the mapping. */
2635 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2636 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2640 for (cnt
= 2; cnt
< ctype
->map_collection_nr
; ++cnt
)
2641 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2644 if (cnt
< ctype
->map_collection_nr
)
2645 free (now
->val
.str
.startmb
);
2647 /* OK, it's a new map. */
2648 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2652 else if (now
->tok
< tok_toupper
|| now
->tok
> tok_tolower
)
2655 mapidx
= now
->tok
- tok_toupper
;
2657 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2658 /* This better should be a semicolon. */
2659 if (now
->tok
!= tok_semicolon
)
2663 /* Test whether this mapping was already defined. */
2664 if (ctype
->tomap_done
[mapidx
])
2666 lr_error (ldfile
, _("duplicated definition for mapping `%s'"),
2667 ctype
->mapnames
[mapidx
]);
2668 lr_ignore_rest (ldfile
, 0);
2671 ctype
->tomap_done
[mapidx
] = 1;
2673 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2674 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2676 struct charseq
*from_seq
;
2678 struct charseq
*to_seq
;
2681 /* Every pair starts with an opening brace. */
2682 if (now
->tok
!= tok_open_brace
)
2685 /* Next comes the from-value. */
2686 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2687 if (get_character (now
, charmap
, repertoire
, &from_seq
,
2691 /* The next is a comma. */
2692 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2693 if (now
->tok
!= tok_comma
)
2696 /* And the other value. */
2697 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2698 if (get_character (now
, charmap
, repertoire
, &to_seq
,
2702 /* And the last thing is the closing brace. */
2703 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2704 if (now
->tok
!= tok_close_brace
)
2707 if (!ignore_content
)
2709 if (mapidx
< 2 && from_seq
!= NULL
&& to_seq
!= NULL
2710 && from_seq
->nbytes
== 1 && to_seq
->nbytes
== 1)
2711 /* We can use this value. */
2712 ctype
->map256_collection
[mapidx
][from_seq
->bytes
[0]]
2715 if (from_wch
!= ILLEGAL_CHAR_VALUE
2716 && to_wch
!= ILLEGAL_CHAR_VALUE
)
2717 /* Both correct values. */
2718 *find_idx (ctype
, &ctype
->map_collection
[mapidx
],
2719 &ctype
->map_collection_max
[mapidx
],
2720 &ctype
->map_collection_act
[mapidx
],
2724 /* Now comes a semicolon or the end of the line/file. */
2725 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2726 if (now
->tok
== tok_semicolon
)
2727 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2731 case tok_translit_start
:
2732 /* Ignore the entire translit section with its peculiar syntax
2733 if we don't need the input. */
2738 lr_ignore_rest (ldfile
, 0);
2739 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2741 while (now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
);
2743 if (now
->tok
== tok_eof
)
2744 lr_error (ldfile
, _(\
2745 "%s: `translit_start' section does not end with `translit_end'"),
2751 /* The rest of the line better should be empty. */
2752 lr_ignore_rest (ldfile
, 1);
2754 /* We count here the number of allocated entries in the `translit'
2758 ldfile
->translate_strings
= 1;
2759 ldfile
->return_widestr
= 1;
2761 /* We proceed until we see the `translit_end' token. */
2762 while (now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
),
2763 now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
)
2765 if (now
->tok
== tok_eol
)
2766 /* Ignore empty lines. */
2769 if (now
->tok
== tok_include
)
2771 /* We have to include locale. */
2772 const char *locale_name
;
2773 const char *repertoire_name
;
2774 struct translit_include_t
*include_stmt
, **include_ptr
;
2776 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2777 /* This should be a string or an identifier. In any
2778 case something to name a locale. */
2779 if (now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2782 lr_error (ldfile
, _("%s: syntax error"), "LC_CTYPE");
2783 lr_ignore_rest (ldfile
, 0);
2786 locale_name
= now
->val
.str
.startmb
;
2788 /* Next should be a semicolon. */
2789 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2790 if (now
->tok
!= tok_semicolon
)
2791 goto translit_syntax
;
2793 /* Now the repertoire name. */
2794 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2795 if ((now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2796 || now
->val
.str
.startmb
== NULL
)
2797 goto translit_syntax
;
2798 repertoire_name
= now
->val
.str
.startmb
;
2799 if (repertoire_name
[0] == '\0')
2800 /* Ignore the empty string. */
2801 repertoire_name
= NULL
;
2803 /* Save the include statement for later processing. */
2804 include_stmt
= (struct translit_include_t
*)
2805 xmalloc (sizeof (struct translit_include_t
));
2806 include_stmt
->copy_locale
= locale_name
;
2807 include_stmt
->copy_repertoire
= repertoire_name
;
2808 include_stmt
->next
= NULL
;
2810 include_ptr
= &ctype
->translit_include
;
2811 while (*include_ptr
!= NULL
)
2812 include_ptr
= &(*include_ptr
)->next
;
2813 *include_ptr
= include_stmt
;
2815 /* The rest of the line must be empty. */
2816 lr_ignore_rest (ldfile
, 1);
2818 /* Make sure the locale is read. */
2819 add_to_readlist (LC_CTYPE
, locale_name
, repertoire_name
,
2823 else if (now
->tok
== tok_default_missing
)
2829 /* We expect a single character or string as the
2831 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2832 wstr
= read_widestring (ldfile
, now
, charmap
,
2837 if (ctype
->default_missing
!= NULL
)
2839 lr_error (ldfile
, _("\
2840 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2841 WITH_CUR_LOCALE (error_at_line (0, 0,
2842 ctype
->default_missing_file
,
2843 ctype
->default_missing_lineno
,
2845 previous definition was here")));
2849 ctype
->default_missing
= wstr
;
2850 ctype
->default_missing_file
= ldfile
->fname
;
2851 ctype
->default_missing_lineno
= ldfile
->lineno
;
2853 /* We can have more entries, ignore them. */
2854 lr_ignore_rest (ldfile
, 0);
2857 else if (wstr
== (uint32_t *) -1l)
2858 /* This was a syntax error. */
2861 /* Maybe there is another replacement we can use. */
2862 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2863 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2865 /* Nothing found. We tell the user. */
2866 lr_error (ldfile
, _("\
2867 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2870 if (now
->tok
!= tok_semicolon
)
2871 goto translit_syntax
;
2876 else if (now
->tok
== tok_translit_ignore
)
2878 read_translit_ignore_entry (ldfile
, ctype
, charmap
,
2883 read_translit_entry (ldfile
, ctype
, now
, charmap
, repertoire
);
2885 ldfile
->return_widestr
= 0;
2887 if (now
->tok
== tok_eof
)
2888 lr_error (ldfile
, _(\
2889 "%s: `translit_start' section does not end with `translit_end'"),
2895 /* Ignore the rest of the line if we don't need the input of
2899 lr_ignore_rest (ldfile
, 0);
2903 /* This could mean one of several things. First test whether
2904 it's a character class name. */
2905 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2906 if (strcmp (now
->val
.str
.startmb
, ctype
->classnames
[cnt
]) == 0)
2908 if (cnt
< ctype
->nr_charclass
)
2910 class_bit
= _ISwbit (cnt
);
2911 class256_bit
= cnt
<= 11 ? _ISbit (cnt
) : 0;
2912 free (now
->val
.str
.startmb
);
2913 goto read_charclass
;
2915 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
2916 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2918 if (cnt
< ctype
->map_collection_nr
)
2921 free (now
->val
.str
.startmb
);
2924 #ifdef PREDEFINED_CLASSES
2925 if (strcmp (now
->val
.str
.startmb
, "special1") == 0)
2927 class_bit
= _ISwspecial1
;
2928 free (now
->val
.str
.startmb
);
2929 goto read_charclass
;
2931 if (strcmp (now
->val
.str
.startmb
, "special2") == 0)
2933 class_bit
= _ISwspecial2
;
2934 free (now
->val
.str
.startmb
);
2935 goto read_charclass
;
2937 if (strcmp (now
->val
.str
.startmb
, "special3") == 0)
2939 class_bit
= _ISwspecial3
;
2940 free (now
->val
.str
.startmb
);
2941 goto read_charclass
;
2943 if (strcmp (now
->val
.str
.startmb
, "tosymmetric") == 0)
2952 /* Next we assume `LC_CTYPE'. */
2953 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2954 if (now
->tok
== tok_eof
)
2956 if (now
->tok
== tok_eol
)
2957 lr_error (ldfile
, _("%s: incomplete `END' line"),
2959 else if (now
->tok
!= tok_lc_ctype
)
2960 lr_error (ldfile
, _("\
2961 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2962 lr_ignore_rest (ldfile
, now
->tok
== tok_lc_ctype
);
2967 if (now
->tok
!= tok_eof
)
2968 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2971 /* Prepare for the next round. */
2972 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2976 /* When we come here we reached the end of the file. */
2977 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2982 set_class_defaults (struct locale_ctype_t
*ctype
,
2983 const struct charmap_t
*charmap
,
2984 struct repertoire_t
*repertoire
)
2988 /* This function defines the default values for the classes and conversions
2989 according to POSIX.2 2.5.2.1.
2990 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2991 Don't move them unless you know what you are doing! */
2993 auto void set_default (int bitpos
, int from
, int to
);
2995 void set_default (int bitpos
, int from
, int to
)
2999 int bit
= _ISbit (bitpos
);
3000 int bitw
= _ISwbit (bitpos
);
3001 /* Define string. */
3004 for (ch
= from
; ch
<= to
; ++ch
)
3006 struct charseq
*seq
;
3009 seq
= charmap_find_value (charmap
, tmp
, 1);
3013 sprintf (buf
, "U%08X", ch
);
3014 seq
= charmap_find_value (charmap
, buf
, 9);
3019 WITH_CUR_LOCALE (error (0, 0, _("\
3020 %s: character `%s' not defined in charmap while needed as default value"),
3023 else if (seq
->nbytes
!= 1)
3024 WITH_CUR_LOCALE (error (0, 0, _("\
3025 %s: character `%s' in charmap not representable with one byte"),
3028 ctype
->class256_collection
[seq
->bytes
[0]] |= bit
;
3030 /* No need to search here, the ASCII value is also the Unicode
3032 ELEM (ctype
, class_collection
, , ch
) |= bitw
;
3036 /* Set default values if keyword was not present. */
3037 if ((ctype
->class_done
& BITw (tok_upper
)) == 0)
3038 /* "If this keyword [upper] is not specified, the uppercase letters
3039 `A' through `Z', ..., shall automatically belong to this class,
3040 with implementation defined character values." [P1003.2, 2.5.2.1] */
3041 set_default (BITPOS (tok_upper
), 'A', 'Z');
3043 if ((ctype
->class_done
& BITw (tok_lower
)) == 0)
3044 /* "If this keyword [lower] is not specified, the lowercase letters
3045 `a' through `z', ..., shall automatically belong to this class,
3046 with implementation defined character values." [P1003.2, 2.5.2.1] */
3047 set_default (BITPOS (tok_lower
), 'a', 'z');
3049 if ((ctype
->class_done
& BITw (tok_alpha
)) == 0)
3051 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3052 class `lower' *must* be in class `alpha'. */
3053 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
3054 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
);
3056 for (cnt
= 0; cnt
< 256; ++cnt
)
3057 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3058 ctype
->class256_collection
[cnt
] |= BIT (tok_alpha
);
3060 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3061 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3062 ctype
->class_collection
[cnt
] |= BITw (tok_alpha
);
3065 if ((ctype
->class_done
& BITw (tok_digit
)) == 0)
3066 /* "If this keyword [digit] is not specified, the digits `0' through
3067 `9', ..., shall automatically belong to this class, with
3068 implementation-defined character values." [P1003.2, 2.5.2.1] */
3069 set_default (BITPOS (tok_digit
), '0', '9');
3071 /* [alnum] "Only characters specified for the `alpha' and `digit' keywords
3072 shall be specified. Characters specified for the keywords `alpha'
3073 and `digit' are automatically included in this class." */
3075 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
3076 unsigned long int maskw
= BITw (tok_alpha
) | BITw (tok_digit
);
3078 for (cnt
= 0; cnt
< 256; ++cnt
)
3079 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3080 ctype
->class256_collection
[cnt
] |= BIT (tok_alnum
);
3082 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3083 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3084 ctype
->class_collection
[cnt
] |= BITw (tok_alnum
);
3087 if ((ctype
->class_done
& BITw (tok_space
)) == 0)
3088 /* "If this keyword [space] is not specified, the characters <space>,
3089 <form-feed>, <newline>, <carriage-return>, <tab>, and
3090 <vertical-tab>, ..., shall automatically belong to this class,
3091 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3093 struct charseq
*seq
;
3095 seq
= charmap_find_value (charmap
, "space", 5);
3097 seq
= charmap_find_value (charmap
, "SP", 2);
3099 seq
= charmap_find_value (charmap
, "U00000020", 9);
3103 WITH_CUR_LOCALE (error (0, 0, _("\
3104 %s: character `%s' not defined while needed as default value"),
3105 "LC_CTYPE", "<space>"));
3107 else if (seq
->nbytes
!= 1)
3108 WITH_CUR_LOCALE (error (0, 0, _("\
3109 %s: character `%s' in charmap not representable with one byte"),
3110 "LC_CTYPE", "<space>"));
3112 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3114 /* No need to search. */
3115 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_space
);
3117 seq
= charmap_find_value (charmap
, "form-feed", 9);
3119 seq
= charmap_find_value (charmap
, "U0000000C", 9);
3123 WITH_CUR_LOCALE (error (0, 0, _("\
3124 %s: character `%s' not defined while needed as default value"),
3125 "LC_CTYPE", "<form-feed>"));
3127 else if (seq
->nbytes
!= 1)
3128 WITH_CUR_LOCALE (error (0, 0, _("\
3129 %s: character `%s' in charmap not representable with one byte"),
3130 "LC_CTYPE", "<form-feed>"));
3132 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3134 /* No need to search. */
3135 ELEM (ctype
, class_collection
, , L
'\f') |= BITw (tok_space
);
3138 seq
= charmap_find_value (charmap
, "newline", 7);
3140 seq
= charmap_find_value (charmap
, "U0000000A", 9);
3144 WITH_CUR_LOCALE (error (0, 0, _("\
3145 character `%s' not defined while needed as default value"),
3148 else if (seq
->nbytes
!= 1)
3149 WITH_CUR_LOCALE (error (0, 0, _("\
3150 %s: character `%s' in charmap not representable with one byte"),
3151 "LC_CTYPE", "<newline>"));
3153 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3155 /* No need to search. */
3156 ELEM (ctype
, class_collection
, , L
'\n') |= BITw (tok_space
);
3159 seq
= charmap_find_value (charmap
, "carriage-return", 15);
3161 seq
= charmap_find_value (charmap
, "U0000000D", 9);
3165 WITH_CUR_LOCALE (error (0, 0, _("\
3166 %s: character `%s' not defined while needed as default value"),
3167 "LC_CTYPE", "<carriage-return>"));
3169 else if (seq
->nbytes
!= 1)
3170 WITH_CUR_LOCALE (error (0, 0, _("\
3171 %s: character `%s' in charmap not representable with one byte"),
3172 "LC_CTYPE", "<carriage-return>"));
3174 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3176 /* No need to search. */
3177 ELEM (ctype
, class_collection
, , L
'\r') |= BITw (tok_space
);
3180 seq
= charmap_find_value (charmap
, "tab", 3);
3182 seq
= charmap_find_value (charmap
, "U00000009", 9);
3186 WITH_CUR_LOCALE (error (0, 0, _("\
3187 %s: character `%s' not defined while needed as default value"),
3188 "LC_CTYPE", "<tab>"));
3190 else if (seq
->nbytes
!= 1)
3191 WITH_CUR_LOCALE (error (0, 0, _("\
3192 %s: character `%s' in charmap not representable with one byte"),
3193 "LC_CTYPE", "<tab>"));
3195 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3197 /* No need to search. */
3198 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_space
);
3201 seq
= charmap_find_value (charmap
, "vertical-tab", 12);
3203 seq
= charmap_find_value (charmap
, "U0000000B", 9);
3207 WITH_CUR_LOCALE (error (0, 0, _("\
3208 %s: character `%s' not defined while needed as default value"),
3209 "LC_CTYPE", "<vertical-tab>"));
3211 else if (seq
->nbytes
!= 1)
3212 WITH_CUR_LOCALE (error (0, 0, _("\
3213 %s: character `%s' in charmap not representable with one byte"),
3214 "LC_CTYPE", "<vertical-tab>"));
3216 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3218 /* No need to search. */
3219 ELEM (ctype
, class_collection
, , L
'\v') |= BITw (tok_space
);
3222 if ((ctype
->class_done
& BITw (tok_xdigit
)) == 0)
3223 /* "If this keyword is not specified, the digits `0' to `9', the
3224 uppercase letters `A' through `F', and the lowercase letters `a'
3225 through `f', ..., shall automatically belong to this class, with
3226 implementation defined character values." [P1003.2, 2.5.2.1] */
3228 set_default (BITPOS (tok_xdigit
), '0', '9');
3229 set_default (BITPOS (tok_xdigit
), 'A', 'F');
3230 set_default (BITPOS (tok_xdigit
), 'a', 'f');
3233 if ((ctype
->class_done
& BITw (tok_blank
)) == 0)
3234 /* "If this keyword [blank] is unspecified, the characters <space> and
3235 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3237 struct charseq
*seq
;
3239 seq
= charmap_find_value (charmap
, "space", 5);
3241 seq
= charmap_find_value (charmap
, "SP", 2);
3243 seq
= charmap_find_value (charmap
, "U00000020", 9);
3247 WITH_CUR_LOCALE (error (0, 0, _("\
3248 %s: character `%s' not defined while needed as default value"),
3249 "LC_CTYPE", "<space>"));
3251 else if (seq
->nbytes
!= 1)
3252 WITH_CUR_LOCALE (error (0, 0, _("\
3253 %s: character `%s' in charmap not representable with one byte"),
3254 "LC_CTYPE", "<space>"));
3256 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3258 /* No need to search. */
3259 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_blank
);
3262 seq
= charmap_find_value (charmap
, "tab", 3);
3264 seq
= charmap_find_value (charmap
, "U00000009", 9);
3268 WITH_CUR_LOCALE (error (0, 0, _("\
3269 %s: character `%s' not defined while needed as default value"),
3270 "LC_CTYPE", "<tab>"));
3272 else if (seq
->nbytes
!= 1)
3273 WITH_CUR_LOCALE (error (0, 0, _("\
3274 %s: character `%s' in charmap not representable with one byte"),
3275 "LC_CTYPE", "<tab>"));
3277 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3279 /* No need to search. */
3280 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_blank
);
3283 if ((ctype
->class_done
& BITw (tok_graph
)) == 0)
3284 /* "If this keyword [graph] is not specified, characters specified for
3285 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3286 shall belong to this character class." [P1003.2, 2.5.2.1] */
3288 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3289 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3290 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3291 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3295 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3296 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3297 ctype
->class_collection
[cnt
] |= BITw (tok_graph
);
3299 for (cnt
= 0; cnt
< 256; ++cnt
)
3300 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3301 ctype
->class256_collection
[cnt
] |= BIT (tok_graph
);
3304 if ((ctype
->class_done
& BITw (tok_print
)) == 0)
3305 /* "If this keyword [print] is not provided, characters specified for
3306 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3307 and the <space> character shall belong to this character class."
3308 [P1003.2, 2.5.2.1] */
3310 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3311 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3312 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3313 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3316 struct charseq
*seq
;
3318 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3319 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3320 ctype
->class_collection
[cnt
] |= BITw (tok_print
);
3322 for (cnt
= 0; cnt
< 256; ++cnt
)
3323 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3324 ctype
->class256_collection
[cnt
] |= BIT (tok_print
);
3327 seq
= charmap_find_value (charmap
, "space", 5);
3329 seq
= charmap_find_value (charmap
, "SP", 2);
3331 seq
= charmap_find_value (charmap
, "U00000020", 9);
3335 WITH_CUR_LOCALE (error (0, 0, _("\
3336 %s: character `%s' not defined while needed as default value"),
3337 "LC_CTYPE", "<space>"));
3339 else if (seq
->nbytes
!= 1)
3340 WITH_CUR_LOCALE (error (0, 0, _("\
3341 %s: character `%s' in charmap not representable with one byte"),
3342 "LC_CTYPE", "<space>"));
3344 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_print
);
3346 /* No need to search. */
3347 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_print
);
3350 if (ctype
->tomap_done
[0] == 0)
3351 /* "If this keyword [toupper] is not specified, the lowercase letters
3352 `a' through `z', and their corresponding uppercase letters `A' to
3353 `Z', ..., shall automatically be included, with implementation-
3354 defined character values." [P1003.2, 2.5.2.1] */
3359 strcpy (tmp
, "<?>");
3361 for (ch
= 'a'; ch
<= 'z'; ++ch
)
3363 struct charseq
*seq_from
, *seq_to
;
3367 seq_from
= charmap_find_value (charmap
, &tmp
[1], 1);
3368 if (seq_from
== NULL
)
3371 sprintf (buf
, "U%08X", ch
);
3372 seq_from
= charmap_find_value (charmap
, buf
, 9);
3374 if (seq_from
== NULL
)
3377 WITH_CUR_LOCALE (error (0, 0, _("\
3378 %s: character `%s' not defined while needed as default value"),
3381 else if (seq_from
->nbytes
!= 1)
3384 WITH_CUR_LOCALE (error (0, 0, _("\
3385 %s: character `%s' needed as default value not representable with one byte"),
3390 /* This conversion is implementation defined. */
3391 tmp
[1] = (char) (ch
+ ('A' - 'a'));
3392 seq_to
= charmap_find_value (charmap
, &tmp
[1], 1);
3396 sprintf (buf
, "U%08X", ch
+ ('A' - 'a'));
3397 seq_to
= charmap_find_value (charmap
, buf
, 9);
3402 WITH_CUR_LOCALE (error (0, 0, _("\
3403 %s: character `%s' not defined while needed as default value"),
3406 else if (seq_to
->nbytes
!= 1)
3409 WITH_CUR_LOCALE (error (0, 0, _("\
3410 %s: character `%s' needed as default value not representable with one byte"),
3414 /* The index [0] is determined by the order of the
3415 `ctype_map_new' calls in `ctype_startup'. */
3416 ctype
->map256_collection
[0][seq_from
->bytes
[0]]
3420 /* No need to search. */
3421 ELEM (ctype
, map_collection
, [0], ch
) = ch
+ ('A' - 'a');
3425 if (ctype
->tomap_done
[1] == 0)
3426 /* "If this keyword [tolower] is not specified, the mapping shall be
3427 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3429 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
3430 if (ctype
->map_collection
[0][cnt
] != 0)
3431 ELEM (ctype
, map_collection
, [1],
3432 ctype
->map_collection
[0][cnt
])
3433 = ctype
->charnames
[cnt
];
3435 for (cnt
= 0; cnt
< 256; ++cnt
)
3436 if (ctype
->map256_collection
[0][cnt
] != 0)
3437 ctype
->map256_collection
[1][ctype
->map256_collection
[0][cnt
]] = cnt
;
3440 if (ctype
->outdigits_act
!= 10)
3442 if (ctype
->outdigits_act
!= 0)
3443 WITH_CUR_LOCALE (error (0, 0, _("\
3444 %s: field `%s' does not contain exactly ten entries"),
3445 "LC_CTYPE", "outdigit"));
3447 for (cnt
= ctype
->outdigits_act
; cnt
< 10; ++cnt
)
3449 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3452 if (ctype
->mboutdigits
[cnt
] == NULL
)
3453 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3455 strlen (longnames
[cnt
]));
3457 if (ctype
->mboutdigits
[cnt
] == NULL
)
3458 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3461 if (ctype
->mboutdigits
[cnt
] == NULL
)
3463 /* Provide a replacement. */
3464 WITH_CUR_LOCALE (error (0, 0, _("\
3465 no output digits defined and none of the standard names in the charmap")));
3467 ctype
->mboutdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
3468 sizeof (struct charseq
)
3471 /* This is better than nothing. */
3472 ctype
->mboutdigits
[cnt
]->bytes
[0] = digits
[cnt
];
3473 ctype
->mboutdigits
[cnt
]->nbytes
= 1;
3476 ctype
->wcoutdigits
[cnt
] = L
'0' + cnt
;
3479 ctype
->outdigits_act
= 10;
3484 /* Construction of sparse 3-level tables.
3485 See wchar-lookup.h for their structure and the meaning of p and q. */
3492 /* Working representation. */
3493 size_t level1_alloc
;
3496 size_t level2_alloc
;
3499 size_t level3_alloc
;
3502 /* Compressed representation. */
3507 /* Initialize. Assumes t->p and t->q have already been set. */
3509 wctype_table_init (struct wctype_table
*t
)
3512 t
->level1_alloc
= t
->level1_size
= 0;
3514 t
->level2_alloc
= t
->level2_size
= 0;
3516 t
->level3_alloc
= t
->level3_size
= 0;
3519 /* Retrieve an entry. */
3521 wctype_table_get (struct wctype_table
*t
, uint32_t wc
)
3523 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3524 if (index1
< t
->level1_size
)
3526 uint32_t lookup1
= t
->level1
[index1
];
3527 if (lookup1
!= EMPTY
)
3529 uint32_t index2
= ((wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1))
3530 + (lookup1
<< t
->q
);
3531 uint32_t lookup2
= t
->level2
[index2
];
3532 if (lookup2
!= EMPTY
)
3534 uint32_t index3
= ((wc
>> 5) & ((1 << t
->p
) - 1))
3535 + (lookup2
<< t
->p
);
3536 uint32_t lookup3
= t
->level3
[index3
];
3537 uint32_t index4
= wc
& 0x1f;
3539 return (lookup3
>> index4
) & 1;
3546 /* Add one entry. */
3548 wctype_table_add (struct wctype_table
*t
, uint32_t wc
)
3550 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3551 uint32_t index2
= (wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1);
3552 uint32_t index3
= (wc
>> 5) & ((1 << t
->p
) - 1);
3553 uint32_t index4
= wc
& 0x1f;
3556 if (index1
>= t
->level1_size
)
3558 if (index1
>= t
->level1_alloc
)
3560 size_t alloc
= 2 * t
->level1_alloc
;
3561 if (alloc
<= index1
)
3563 t
->level1
= (uint32_t *) xrealloc ((char *) t
->level1
,
3564 alloc
* sizeof (uint32_t));
3565 t
->level1_alloc
= alloc
;
3567 while (index1
>= t
->level1_size
)
3568 t
->level1
[t
->level1_size
++] = EMPTY
;
3571 if (t
->level1
[index1
] == EMPTY
)
3573 if (t
->level2_size
== t
->level2_alloc
)
3575 size_t alloc
= 2 * t
->level2_alloc
+ 1;
3576 t
->level2
= (uint32_t *) xrealloc ((char *) t
->level2
,
3577 (alloc
<< t
->q
) * sizeof (uint32_t));
3578 t
->level2_alloc
= alloc
;
3580 i1
= t
->level2_size
<< t
->q
;
3581 i2
= (t
->level2_size
+ 1) << t
->q
;
3582 for (i
= i1
; i
< i2
; i
++)
3583 t
->level2
[i
] = EMPTY
;
3584 t
->level1
[index1
] = t
->level2_size
++;
3587 index2
+= t
->level1
[index1
] << t
->q
;
3589 if (t
->level2
[index2
] == EMPTY
)
3591 if (t
->level3_size
== t
->level3_alloc
)
3593 size_t alloc
= 2 * t
->level3_alloc
+ 1;
3594 t
->level3
= (uint32_t *) xrealloc ((char *) t
->level3
,
3595 (alloc
<< t
->p
) * sizeof (uint32_t));
3596 t
->level3_alloc
= alloc
;
3598 i1
= t
->level3_size
<< t
->p
;
3599 i2
= (t
->level3_size
+ 1) << t
->p
;
3600 for (i
= i1
; i
< i2
; i
++)
3602 t
->level2
[index2
] = t
->level3_size
++;
3605 index3
+= t
->level2
[index2
] << t
->p
;
3607 t
->level3
[index3
] |= (uint32_t)1 << index4
;
3610 /* Finalize and shrink. */
3612 wctype_table_finalize (struct wctype_table
*t
)
3615 uint32_t reorder3
[t
->level3_size
];
3616 uint32_t reorder2
[t
->level2_size
];
3617 uint32_t level1_offset
, level2_offset
, level3_offset
;
3619 /* Uniquify level3 blocks. */
3621 for (j
= 0; j
< t
->level3_size
; j
++)
3623 for (i
= 0; i
< k
; i
++)
3624 if (memcmp (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3625 (1 << t
->p
) * sizeof (uint32_t)) == 0)
3627 /* Relocate block j to block i. */
3632 memcpy (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3633 (1 << t
->p
) * sizeof (uint32_t));
3639 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3640 if (t
->level2
[i
] != EMPTY
)
3641 t
->level2
[i
] = reorder3
[t
->level2
[i
]];
3643 /* Uniquify level2 blocks. */
3645 for (j
= 0; j
< t
->level2_size
; j
++)
3647 for (i
= 0; i
< k
; i
++)
3648 if (memcmp (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3649 (1 << t
->q
) * sizeof (uint32_t)) == 0)
3651 /* Relocate block j to block i. */
3656 memcpy (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3657 (1 << t
->q
) * sizeof (uint32_t));
3663 for (i
= 0; i
< t
->level1_size
; i
++)
3664 if (t
->level1
[i
] != EMPTY
)
3665 t
->level1
[i
] = reorder2
[t
->level1
[i
]];
3667 /* Create and fill the resulting compressed representation. */
3669 5 * sizeof (uint32_t)
3670 + t
->level1_size
* sizeof (uint32_t)
3671 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t)
3672 + (t
->level3_size
<< t
->p
) * sizeof (uint32_t);
3673 t
->result
= (char *) xmalloc (t
->result_size
);
3676 5 * sizeof (uint32_t);
3678 5 * sizeof (uint32_t)
3679 + t
->level1_size
* sizeof (uint32_t);
3681 5 * sizeof (uint32_t)
3682 + t
->level1_size
* sizeof (uint32_t)
3683 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t);
3685 ((uint32_t *) t
->result
)[0] = t
->q
+ t
->p
+ 5;
3686 ((uint32_t *) t
->result
)[1] = t
->level1_size
;
3687 ((uint32_t *) t
->result
)[2] = t
->p
+ 5;
3688 ((uint32_t *) t
->result
)[3] = (1 << t
->q
) - 1;
3689 ((uint32_t *) t
->result
)[4] = (1 << t
->p
) - 1;
3691 for (i
= 0; i
< t
->level1_size
; i
++)
3692 ((uint32_t *) (t
->result
+ level1_offset
))[i
] =
3693 (t
->level1
[i
] == EMPTY
3695 : (t
->level1
[i
] << t
->q
) * sizeof (uint32_t) + level2_offset
);
3697 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3698 ((uint32_t *) (t
->result
+ level2_offset
))[i
] =
3699 (t
->level2
[i
] == EMPTY
3701 : (t
->level2
[i
] << t
->p
) * sizeof (uint32_t) + level3_offset
);
3703 for (i
= 0; i
< (t
->level3_size
<< t
->p
); i
++)
3704 ((uint32_t *) (t
->result
+ level3_offset
))[i
] = t
->level3
[i
];
3706 if (t
->level1_alloc
> 0)
3708 if (t
->level2_alloc
> 0)
3710 if (t
->level3_alloc
> 0)
/* Parameter macros for a table type named wcwidth_table: elements are
   uint8_t and DEFAULT is 0xff.
   NOTE(review): the line that consumes TABLE/ELEMENT/DEFAULT is not
   visible in this extract -- presumably a shared table template is
   included right after these definitions; confirm against the full file.  */
3714 #define TABLE wcwidth_table
3715 #define ELEMENT uint8_t
3716 #define DEFAULT 0xff
/* The same parameter macros again, now for a table type named
   wctrans_table with int32_t elements.
   NOTE(review): no DEFAULT definition for this second set is visible
   here; verify what value applies.  */
3719 #define TABLE wctrans_table
3720 #define ELEMENT int32_t
/* While this macro is in effect, the identifier wctrans_table_add is
   replaced by wctrans_table_add_internal.  The #undef below frees the
   plain name again so that the wrapper function defined further down can
   use it and call wctrans_table_add_internal itself.  */
3722 #define wctrans_table_add wctrans_table_add_internal
3724 #undef wctrans_table_add
3725 /* Record in table T the mapping of wide character WC to MAPPED_WC.
   The wctrans_table must actually store the difference between the
   desired result and the argument, so the value handed on to
   wctrans_table_add_internal is MAPPED_WC - WC (computed on uint32_t
   operands), not MAPPED_WC itself.
   NOTE(review): the return-type line and the braces of this definition
   are not visible in this extract.  */
3728 wctrans_table_add (struct wctrans_table
*t
, uint32_t wc
, uint32_t mapped_wc
)
3730 wctrans_table_add_internal (t
, wc
, mapped_wc
- wc
);
3734 /* Flattens the included transliterations into a translit list.
   Inserts them in the list at `cursor', and returns the new cursor.

   CTYPE   - LC_CTYPE data whose `translit_include' chain is consumed;
             each entry is unchained before it is looked up.
   CHARMAP - handed on unchanged to find_locale and to the recursive call.
   CURSOR  - address of the list link at which included rules are
             spliced in.

   NOTE(review): several lines of this function (braces, the test on
   `other', the body of the list walk and the final return) are not
   visible in this extract; the notes below mark where they appear to be
   missing.  */
3736 static struct translit_t
**
3737 translit_flatten (struct locale_ctype_t
*ctype
,
3738 const struct charmap_t
*charmap
,
3739 struct translit_t
**cursor
)
/* Process the pending include entries one at a time until none is left.  */
3741 while (ctype
->translit_include
!= NULL
)
/* Remember the names carried by the current include entry before the
   entry is unchained below.  */
3743 const char *copy_locale
= ctype
->translit_include
->copy_locale
;
3744 const char *copy_repertoire
= ctype
->translit_include
->copy_repertoire
;
3745 struct localedef_t
*other
;
3747 /* Unchain the include statement.  During the depth-first traversal
   we don't want to visit any locale more than once.  */
3749 ctype
->translit_include
= ctype
->translit_include
->next
;
/* Ask find_locale for the LC_CTYPE data of the included locale.  */
3751 other
= find_locale (LC_CTYPE
, copy_locale
, copy_repertoire
, charmap
);
/* NOTE(review): this diagnostic is presumably guarded by a test that
   `other' is NULL -- the condition is not visible here.  error is called
   with a status of 0.  */
3755 WITH_CUR_LOCALE (error (0, 0, _("\
3756 %s: transliteration data from locale `%s' not available"),
3757 "LC_CTYPE", copy_locale
));
/* NOTE(review): the remainder presumably forms the branch taken when the
   locale was found -- confirm against the full file.  */
3761 struct locale_ctype_t
*other_ctype
=
3762 other
->categories
[LC_CTYPE
].ctype
;
/* Recurse first so that the included locale's own includes are folded
   in; afterwards its include chain must be empty.  */
3764 cursor
= translit_flatten (other_ctype
, charmap
, cursor
);
3765 assert (other_ctype
->translit_include
== NULL
);
3767 if (other_ctype
->translit
!= NULL
)
3769 /* Insert the other_ctype->translit list at *cursor.  */
3770 struct translit_t
*endp
= other_ctype
->translit
;
3771 while (endp
->next
!= NULL
)
/* NOTE(review): the loop body is not visible here; presumably it
   advances endp so that it ends up on the last element of the list.  */
/* Chain whatever was at *cursor behind the tail of the included list,
   then make *cursor point to the head of that list.  */
3774 endp
->next
= *cursor
;
3775 *cursor
= other_ctype
->translit
;
3777 /* Avoid any risk of circular lists.  */
3778 other_ctype
->translit
= NULL
;
/* Later insertions go behind the rules that were just spliced in.  */
3780 cursor
= &endp
->next
;
/* Take over default_missing from the included locale only when this
   locale has none of its own yet.  */
3783 if (ctype
->default_missing
== NULL
)
3784 ctype
->default_missing
= other_ctype
->default_missing
;
3792 allocate_arrays (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
3793 struct repertoire_t
*repertoire
)
3801 /* You wonder about this amount of memory? This is only because some
3802 users do not manage to address the array with unsigned values or
3803 data types with range >= 256. '\200' would result in the array
3804 index -128. To help these poor people we duplicate the entries for
3805 128 up to 255 below the entry for \0. */
3806 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128, sizeof (char_class_t
));
3807 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (256, sizeof (char_class32_t
));
3808 ctype
->class_b
= (uint32_t **)
3809 xmalloc (ctype
->nr_charclass
* sizeof (uint32_t *));
3810 ctype
->class_3level
= (struct iovec
*)
3811 xmalloc (ctype
->nr_charclass
* sizeof (struct iovec
));
3813 /* This is the array accessed using the multibyte string elements. */
3814 for (idx
= 0; idx
< 256; ++idx
)
3815 ctype
->ctype_b
[128 + idx
] = ctype
->class256_collection
[idx
];
3817 /* Mirror first 127 entries. We must take care that entry -1 is not
3818 mirrored because EOF == -1. */
3819 for (idx
= 0; idx
< 127; ++idx
)
3820 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
3822 /* The 32 bit array contains all characters < 0x100. */
3823 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3824 if (ctype
->charnames
[idx
] < 0x100)
3825 ctype
->ctype32_b
[ctype
->charnames
[idx
]] = ctype
->class_collection
[idx
];
3827 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3829 ctype
->class_b
[nr
] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3831 for (idx
= 0; idx
< 256; ++idx
)
3832 if (ctype
->class256_collection
[idx
] & _ISbit (nr
))
3833 ctype
->class_b
[nr
][idx
>> 5] |= (uint32_t)1 << (idx
& 0x1f);
3836 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3838 struct wctype_table t
;
3840 t
.p
= 4; /* or: 5 */
3841 t
.q
= 7; /* or: 6 */
3842 wctype_table_init (&t
);
3844 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3845 if (ctype
->class_collection
[idx
] & _ISwbit (nr
))
3846 wctype_table_add (&t
, ctype
->charnames
[idx
]);
3848 wctype_table_finalize (&t
);
3851 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3852 %s: table for class \"%s\": %lu bytes\n"),
3853 "LC_CTYPE", ctype
->classnames
[nr
],
3854 (unsigned long int) t
.result_size
));
3856 ctype
->class_3level
[nr
].iov_base
= t
.result
;
3857 ctype
->class_3level
[nr
].iov_len
= t
.result_size
;
3860 /* Room for table of mappings. */
3861 ctype
->map_b
= (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3862 ctype
->map32_b
= (uint32_t **) xmalloc (ctype
->map_collection_nr
3863 * sizeof (uint32_t *));
3864 ctype
->map_3level
= (struct iovec
*)
3865 xmalloc (ctype
->map_collection_nr
* sizeof (struct iovec
));
3867 /* Fill in all mappings. */
3868 for (idx
= 0; idx
< 2; ++idx
)
3872 /* Allocate table. */
3873 ctype
->map_b
[idx
] = (uint32_t *)
3874 xmalloc ((256 + 128) * sizeof (uint32_t));
3876 /* Copy values from collection. */
3877 for (idx2
= 0; idx2
< 256; ++idx2
)
3878 ctype
->map_b
[idx
][128 + idx2
] = ctype
->map256_collection
[idx
][idx2
];
3880 /* Mirror first 127 entries. We must take care not to map entry
3881 -1 because EOF == -1. */
3882 for (idx2
= 0; idx2
< 127; ++idx2
)
3883 ctype
->map_b
[idx
][idx2
] = ctype
->map_b
[idx
][256 + idx2
];
3885 /* EOF must map to EOF. */
3886 ctype
->map_b
[idx
][127] = EOF
;
3889 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
3893 /* Allocate table. */
3894 ctype
->map32_b
[idx
] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3896 /* Copy values from collection. Default is identity mapping. */
3897 for (idx2
= 0; idx2
< 256; ++idx2
)
3898 ctype
->map32_b
[idx
][idx2
] =
3899 (ctype
->map_collection
[idx
][idx2
] != 0
3900 ? ctype
->map_collection
[idx
][idx2
]
3904 for (nr
= 0; nr
< ctype
->map_collection_nr
; nr
++)
3906 struct wctrans_table t
;
3910 wctrans_table_init (&t
);
3912 for (idx
= 0; idx
< ctype
->map_collection_act
[nr
]; ++idx
)
3913 if (ctype
->map_collection
[nr
][idx
] != 0)
3914 wctrans_table_add (&t
, ctype
->charnames
[idx
],
3915 ctype
->map_collection
[nr
][idx
]);
3917 wctrans_table_finalize (&t
);
3920 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3921 %s: table for map \"%s\": %lu bytes\n"),
3922 "LC_CTYPE", ctype
->mapnames
[nr
],
3923 (unsigned long int) t
.result_size
));
3925 ctype
->map_3level
[nr
].iov_base
= t
.result
;
3926 ctype
->map_3level
[nr
].iov_len
= t
.result_size
;
3929 /* Extra array for class and map names. */
3930 ctype
->class_name_ptr
= (uint32_t *) xmalloc (ctype
->nr_charclass
3931 * sizeof (uint32_t));
3932 ctype
->map_name_ptr
= (uint32_t *) xmalloc (ctype
->map_collection_nr
3933 * sizeof (uint32_t));
3935 ctype
->class_offset
= _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
3936 ctype
->map_offset
= ctype
->class_offset
+ ctype
->nr_charclass
;
3938 /* Array for width information.  Because the expected widths are very
   small (never larger than 2) we use only one single byte per entry,
   which keeps the table small.
   We put only printable characters in the table.  wcwidth is specified
   to return -1 for non-printable characters.  Doing the check here
   saves a run-time check.
   But we put L'\0' in the table.  This again saves a run-time check.  */
3946 struct wcwidth_table t
;
3950 wcwidth_table_init (&t
);
3952 /* First set all the printable characters of the character set to
3953 the default width. */
3955 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
3957 struct charseq
*data
= (struct charseq
*) vdata
;
3959 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
3960 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
3963 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
3965 uint32_t *class_bits
=
3966 find_idx (ctype
, &ctype
->class_collection
, NULL
,
3967 &ctype
->class_collection_act
, data
->ucs4
);
3969 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
3970 wcwidth_table_add (&t
, data
->ucs4
, charmap
->width_default
);
3974 /* Now add the explicitly specified widths. */
3975 if (charmap
->width_rules
!= NULL
)
3979 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
3981 unsigned char bytes
[charmap
->mb_cur_max
];
3982 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
3984 /* We have the range of character for which the width is
3985 specified described using byte sequences of the multibyte
3986 charset. We have to convert this to UCS4 now. And we
3987 cannot simply convert the beginning and the end of the
3988 sequence, we have to iterate over the byte sequence and
3989 convert it for every single character. */
3990 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
3992 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
3993 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
3996 /* Find the UCS value for `bytes'. */
3999 struct charseq
*seq
=
4000 charmap_find_symbol (charmap
, bytes
, nbytes
);
4003 wch
= ILLEGAL_CHAR_VALUE
;
4004 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
4007 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
4008 strlen (seq
->name
));
4010 if (wch
!= ILLEGAL_CHAR_VALUE
)
4012 /* Store the value. */
4013 uint32_t *class_bits
=
4014 find_idx (ctype
, &ctype
->class_collection
, NULL
,
4015 &ctype
->class_collection_act
, wch
);
4017 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
4018 wcwidth_table_add (&t
, wch
,
4019 charmap
->width_rules
[cnt
].width
);
4022 /* "Increment" the bytes sequence. */
4024 while (inner
>= 0 && bytes
[inner
] == 0xff)
4029 /* We have to extend the byte sequence. */
4030 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
4034 memset (&bytes
[1], 0, nbytes
);
4040 while (++inner
< nbytes
)
4047 /* Set the width of L'\0' to 0. */
4048 wcwidth_table_add (&t
, 0, 0);
4050 wcwidth_table_finalize (&t
);
4053 WITH_CUR_LOCALE (fprintf (stderr
, _("%s: table for width: %lu bytes\n"),
4054 "LC_CTYPE", (unsigned long int) t
.result_size
));
4056 ctype
->width
.iov_base
= t
.result
;
4057 ctype
->width
.iov_len
= t
.result_size
;
4060 /* Set MB_CUR_MAX. */
4061 ctype
->mb_cur_max
= charmap
->mb_cur_max
;
4063 /* Now determine the table for the transliteration information.

   XXX It is not yet clear to me whether it is worth implementing a
   complicated algorithm which uses a hash table to locate the entries.
   For now I'll use a simple array which can be searched using binary
   search.  */
4069 if (ctype
->translit_include
!= NULL
)
4070 /* Traverse the locales mentioned in the `include' statements in a
4071 depth-first way and fold in their transliteration information. */
4072 translit_flatten (ctype
, charmap
, &ctype
->translit
);
4074 if (ctype
->translit
!= NULL
)
4076 /* First count how many entries we have. This is the upper limit
4077 since some entries from the included files might be overwritten. */
4080 struct translit_t
*runp
= ctype
->translit
;
4081 struct translit_t
**sorted
;
4082 size_t from_len
, to_len
;
4084 while (runp
!= NULL
)
4090 /* Next we allocate an array large enough and fill in the values. */
4091 sorted
= (struct translit_t
**) alloca (number
4092 * sizeof (struct translit_t
**));
4093 runp
= ctype
->translit
;
4097 /* Search for the place where to insert this string.
4098 XXX Better use a real sorting algorithm later. */
4102 while (idx
< number
)
4104 int res
= wcscmp ((const wchar_t *) sorted
[idx
]->from
,
4105 (const wchar_t *) runp
->from
);
4120 memmove (&sorted
[idx
+ 1], &sorted
[idx
],
4121 (number
- idx
) * sizeof (struct translit_t
*));
4128 while (runp
!= NULL
);
4130 /* The next step is putting all the possible transliteration
   strings in one memory block so that we can write it out.
   We need several different blocks:
   - index to the from-string array
   - from-string array
   - index to the to-string array
   - to-string array.  */
4138 from_len
= to_len
= 0;
4139 for (cnt
= 0; cnt
< number
; ++cnt
)
4141 struct translit_to_t
*srunp
;
4142 from_len
+= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4143 srunp
= sorted
[cnt
]->to
;
4144 while (srunp
!= NULL
)
4146 to_len
+= wcslen ((const wchar_t *) srunp
->str
) + 1;
4147 srunp
= srunp
->next
;
4149 /* Plus one for the extra NUL character marking the end of
4150 the list for the current entry. */
4154 /* We can allocate the arrays for the results. */
4155 ctype
->translit_from_idx
= xmalloc (number
* sizeof (uint32_t));
4156 ctype
->translit_from_tbl
= xmalloc (from_len
* sizeof (uint32_t));
4157 ctype
->translit_to_idx
= xmalloc (number
* sizeof (uint32_t));
4158 ctype
->translit_to_tbl
= xmalloc (to_len
* sizeof (uint32_t));
4162 for (cnt
= 0; cnt
< number
; ++cnt
)
4165 struct translit_to_t
*srunp
;
4167 ctype
->translit_from_idx
[cnt
] = from_len
;
4168 ctype
->translit_to_idx
[cnt
] = to_len
;
4170 len
= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4171 wmemcpy ((wchar_t *) &ctype
->translit_from_tbl
[from_len
],
4172 (const wchar_t *) sorted
[cnt
]->from
, len
);
4175 ctype
->translit_to_idx
[cnt
] = to_len
;
4176 srunp
= sorted
[cnt
]->to
;
4177 while (srunp
!= NULL
)
4179 len
= wcslen ((const wchar_t *) srunp
->str
) + 1;
4180 wmemcpy ((wchar_t *) &ctype
->translit_to_tbl
[to_len
],
4181 (const wchar_t *) srunp
->str
, len
);
4183 srunp
= srunp
->next
;
4185 ctype
->translit_to_tbl
[to_len
++] = L
'\0';
4188 /* Store the information about the length. */
4189 ctype
->translit_idx_size
= number
;
4190 ctype
->translit_from_tbl_size
= from_len
* sizeof (uint32_t);
4191 ctype
->translit_to_tbl_size
= to_len
* sizeof (uint32_t);
4195 /* Provide some dummy pointers since we have nothing to write out. */
4196 static uint32_t no_str
= { 0 };
4198 ctype
->translit_from_idx
= &no_str
;
4199 ctype
->translit_from_tbl
= &no_str
;
4200 ctype
->translit_to_tbl
= &no_str
;
4201 ctype
->translit_idx_size
= 0;
4202 ctype
->translit_from_tbl_size
= 0;
4203 ctype
->translit_to_tbl_size
= 0;