1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
35 #include "localedef.h"
37 #include "localeinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
46 #ifdef PREDEFINED_CLASSES
47 /* These are the extra bits not in wctype.h since these are not preallocated classes. */
49 # define _ISwspecial1 (1 << 29)
50 # define _ISwspecial2 (1 << 30)
51 # define _ISwspecial3 (1 << 31)
55 /* The bit used for representing a special class. */
56 #define BITPOS(class) ((class) - tok_upper)
/* Class mask built with _ISbit; used below with the class256_collection
   entries.  */
57 #define BIT(class) (_ISbit (BITPOS (class)))
/* Class mask built with _ISwbit; used below with the class_collection
   entries.  */
58 #define BITw(class) (_ISwbit (BITPOS (class)))
/* Dereferences the pointer find_idx returns for VALUE in ctype->COLLECTION,
   so the expansion can be read and assigned to.  IDX is appended to the
   member names; the visible uses pass it empty.  */
60 #define ELEM(ctype, collection, idx, value) \
61 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
62 &ctype->collection##_act idx, value)
65 /* To be compatible with former implementations we for now restrict
66 the number of bits for character classes to 16. When compatibility
67 is not necessary anymore increase the number to 32. */
68 #define char_class_t uint16_t
/* 32-bit class type; used for the ctype32_b array.  */
69 #define char_class32_t uint32_t
72 /* Type to describe a transliteration action. We have a possibly
73 multiple character from-string and a set of multiple character
74 to-strings. All are 32bit values since this is what is used in
75 the gconv functions. */
80 struct translit_to_t
*next
;
90 struct translit_to_t
*to
;
92 struct translit_t
*next
;
95 struct translit_ignore_t
104 struct translit_ignore_t
*next
;
108 /* Type to describe a transliteration include statement. */
109 struct translit_include_t
111 const char *copy_locale
;
112 const char *copy_repertoire
;
114 struct translit_include_t
*next
;
117 /* Provide some dummy pointer for empty string. */
118 static uint32_t no_str
[] = { 0 };
121 /* Sparse table of uint32_t. */
/* NOTE(review): TABLE, ELEMENT, DEFAULT and NO_ADD_LOCALE look like
   parameters of a table template that is included after each group; the
   #include lines are not visible here -- confirm.  */
122 #define TABLE idx_table
123 #define ELEMENT uint32_t
124 #define DEFAULT ((uint32_t) ~0)
125 #define NO_ADD_LOCALE
/* Table of uint8_t elements; struct locale_ctype_t keeps one as `width'.  */
128 #define TABLE wcwidth_table
129 #define ELEMENT uint8_t
/* Table of int32_t elements; entries are stored through
   wctrans_table_add.  */
133 #define TABLE wctrans_table
134 #define ELEMENT int32_t
/* NOTE(review): presumably the table template is expanded between this
   #define and the #undef below, so that its add routine ends up being named
   wctrans_table_add_internal -- the #include is not visible here; confirm.  */
136 #define wctrans_table_add wctrans_table_add_internal
138 #undef wctrans_table_add
139 /* The wctrans_table must actually store the difference between the
140 desired result and the argument. */
/* Record in T that WC maps to MAPPED_WC by handing the difference
   MAPPED_WC - WC to wctrans_table_add_internal.  */
142 wctrans_table_add (struct wctrans_table
*t
, uint32_t wc
, uint32_t mapped_wc
)
144 wctrans_table_add_internal (t
, wc
, mapped_wc
- wc
);
147 /* Construction of sparse 3-level tables.
148 See wchar-lookup.h for their structure and the meaning of p and q. */
155 /* Working representation. */
168 static void add_locale_wctype_table (struct locale_file
*file
,
169 struct wctype_table
*t
);
171 /* The real definition of the struct for the LC_CTYPE locale. It holds the working data collected for the category as well as the arrays prepared for the binary representation. */
172 struct locale_ctype_t
/* Allocated size and number of used entries of the charnames array, which
   find_idx extends on demand.  */
175 size_t charnames_max
;
176 size_t charnames_act
;
177 /* An index lookup table, to speedup find_idx. */
178 struct idx_table charnames_idx
;
180 struct repertoire_t
*repertoire
;
182 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
183 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
185 const char *classnames
[MAX_NR_CHARCLASS
];
186 uint32_t last_class_char
;
/* Class bits of the 256 single-byte values.  */
187 uint32_t class256_collection
[256];
/* Class bits per known character; ctype_finish reads entry CNT together
   with charnames[CNT].  Accessed and grown through find_idx.  */
188 uint32_t *class_collection
;
189 size_t class_collection_max
;
190 size_t class_collection_act
;
192 uint32_t class_offset
;
194 struct charseq
**mbdigits
;
/* Output digits in multibyte and in wide form; ctype_finish substitutes
   '?' where one is missing.  */
201 struct charseq
*mboutdigits
[10];
202 uint32_t wcoutdigits
[10];
203 size_t outdigits_act
;
205 /* If the following number ever turns out to be too small simply
206 increase it. But I doubt it will. --drepper@gnu */
207 #define MAX_NR_CHARMAP 16
208 const char *mapnames
[MAX_NR_CHARMAP
];
209 uint32_t *map_collection
[MAX_NR_CHARMAP
];
/* Mappings of the 256 single-byte values; ctype_finish reads row 0 as the
   uppercase and row 1 as the lowercase mapping.  */
210 uint32_t map256_collection
[2][256];
211 size_t map_collection_max
[MAX_NR_CHARMAP
];
212 size_t map_collection_act
[MAX_NR_CHARMAP
];
/* Number of maps registered through ctype_map_new.  */
213 size_t map_collection_nr
;
215 int tomap_done
[MAX_NR_CHARMAP
];
218 /* Transliteration information. */
219 struct translit_include_t
*translit_include
;
220 struct translit_t
*translit
;
/* List of ignore ranges and its length; ctype_finish sorts the list and
   counts its entries.  */
221 struct translit_ignore_t
*translit_ignore
;
222 uint32_t ntranslit_ignore
;
224 uint32_t *default_missing
;
225 const char *default_missing_file
;
226 size_t default_missing_lineno
;
/* Flags written out as _NL_CTYPE_MAP_TO_NONASCII and
   _NL_CTYPE_NONASCII_CASE.  */
228 uint32_t to_nonascii
;
229 uint32_t nonascii_case
;
231 /* The arrays for the binary representation. */
232 char_class_t
*ctype_b
;
233 char_class32_t
*ctype32_b
;
237 struct wctype_table
*class_3level
;
238 struct wctrans_table
*map_3level
;
239 uint32_t *class_name_ptr
;
240 uint32_t *map_name_ptr
;
241 struct wcwidth_table width
;
243 const char *codeset_name
;
244 uint32_t *translit_from_idx
;
245 uint32_t *translit_from_tbl
;
246 uint32_t *translit_to_idx
;
247 uint32_t *translit_to_tbl
;
248 uint32_t translit_idx_size
;
/* ctype_output divides these two sizes by sizeof (uint32_t) to obtain
   element counts.  */
249 size_t translit_from_tbl_size
;
250 size_t translit_to_tbl_size
;
/* Obstack initialised by ctype_startup.  */
252 struct obstack mempool
;
256 /* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless
257 whether 'int' is 16 bit, 32 bit, or 64 bit. */
258 #define EMPTY ((uint32_t) ~0)
/* Obstack chunks are obtained with xmalloc and released with free.  */
261 #define obstack_chunk_alloc xmalloc
262 #define obstack_chunk_free free
265 /* Prototypes for local functions. */
266 static void ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
267 const struct charmap_t
*charmap
,
268 struct localedef_t
*copy_locale
,
270 static void ctype_class_new (struct linereader
*lr
,
271 struct locale_ctype_t
*ctype
, const char *name
);
272 static void ctype_map_new (struct linereader
*lr
,
273 struct locale_ctype_t
*ctype
,
274 const char *name
, const struct charmap_t
*charmap
);
275 static uint32_t *find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
,
276 size_t *max
, size_t *act
, uint32_t idx
);
277 static void set_class_defaults (struct locale_ctype_t
*ctype
,
278 const struct charmap_t
*charmap
,
279 struct repertoire_t
*repertoire
);
280 static void allocate_arrays (struct locale_ctype_t
*ctype
,
281 const struct charmap_t
*charmap
,
282 struct repertoire_t
*repertoire
);
/* Spelled-out names of the ten decimal digits, in order.  ctype_finish
   passes the length of an entry to charmap_find_symbol when a digit is not
   found under its one-character name.  */
285 static const char *longnames
[] =
287 "zero", "one", "two", "three", "four",
288 "five", "six", "seven", "eight", "nine"
/* The ten decimal digits written as UXXXXXXXX names (U00000030 through
   U00000039).  */
290 static const char *uninames
[] =
292 "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
293 "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
/* The ten ASCII digit characters; ctype_finish looks them up one byte at a
   time and also uses them as the last-resort byte values.  */
295 static const unsigned char digits
[] = "0123456789";
/* Prepare the LC_CTYPE data of LOCALE.  Nothing is set up when
   IGNORE_CONTENT is nonzero or the category already has data.  With
   COPY_LOCALE == NULL a zeroed struct locale_ctype_t is allocated and
   seeded: the first 256 character names, the standard class names (their
   registration order fixes the bit positions), the class collection, the
   "toupper"/"tolower" maps (initially the identity for the first 256
   values) and the obstack.  Otherwise the category data of COPY_LOCALE is
   shared.  LR is handed on to ctype_class_new and ctype_map_new, which use
   it for error reporting.  */
299 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
300 const struct charmap_t
*charmap
,
301 struct localedef_t
*copy_locale
, int ignore_content
)
304 struct locale_ctype_t
*ctype
;
306 if (!ignore_content
&& locale
->categories
[LC_CTYPE
].ctype
== NULL
)
308 if (copy_locale
== NULL
)
310 /* Allocate the needed room. */
311 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
312 (struct locale_ctype_t
*) xcalloc (1,
313 sizeof (struct locale_ctype_t
));
315 /* We have seen no names yet. */
316 ctype
->charnames_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
317 ctype
->charnames
= (uint32_t *) xmalloc (ctype
->charnames_max
318 * sizeof (uint32_t));
319 for (cnt
= 0; cnt
< 256; ++cnt
)
320 ctype
->charnames
[cnt
] = cnt
;
321 ctype
->charnames_act
= 256;
322 idx_table_init (&ctype
->charnames_idx
);
324 /* Fill character class information. */
325 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
326 /* The order of the following instructions determines the bit positions! */
328 ctype_class_new (lr
, ctype
, "upper");
329 ctype_class_new (lr
, ctype
, "lower");
330 ctype_class_new (lr
, ctype
, "alpha");
331 ctype_class_new (lr
, ctype
, "digit");
332 ctype_class_new (lr
, ctype
, "xdigit");
333 ctype_class_new (lr
, ctype
, "space");
334 ctype_class_new (lr
, ctype
, "print");
335 ctype_class_new (lr
, ctype
, "graph");
336 ctype_class_new (lr
, ctype
, "blank");
337 ctype_class_new (lr
, ctype
, "cntrl");
338 ctype_class_new (lr
, ctype
, "punct");
339 ctype_class_new (lr
, ctype
, "alnum");
340 #ifdef PREDEFINED_CLASSES
341 /* The following are extensions from ISO 14652. */
342 ctype_class_new (lr
, ctype
, "left_to_right");
343 ctype_class_new (lr
, ctype
, "right_to_left");
344 ctype_class_new (lr
, ctype
, "num_terminator");
345 ctype_class_new (lr
, ctype
, "num_separator");
346 ctype_class_new (lr
, ctype
, "segment_separator");
347 ctype_class_new (lr
, ctype
, "block_separator");
348 ctype_class_new (lr
, ctype
, "direction_control");
349 ctype_class_new (lr
, ctype
, "sym_swap_layout");
350 ctype_class_new (lr
, ctype
, "char_shape_selector");
351 ctype_class_new (lr
, ctype
, "num_shape_selector");
352 ctype_class_new (lr
, ctype
, "non_spacing");
353 ctype_class_new (lr
, ctype
, "non_spacing_level3");
354 ctype_class_new (lr
, ctype
, "normal_connect");
355 ctype_class_new (lr
, ctype
, "r_connect");
356 ctype_class_new (lr
, ctype
, "no_connect");
357 ctype_class_new (lr
, ctype
, "no_connect-space");
358 ctype_class_new (lr
, ctype
, "vowel_connect");
361 ctype
->class_collection_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
/* class_collection holds uint32_t entries (find_idx grows it in units of
   sizeof (uint32_t)), so the initial allocation is sized the same way.  */
362 ctype
->class_collection
363 = (uint32_t *) xcalloc (sizeof (uint32_t),
364 ctype
->class_collection_max
);
365 ctype
->class_collection_act
= 256;
367 /* Fill character map information. */
368 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
369 ctype_map_new (lr
, ctype
, "toupper", charmap
);
370 ctype_map_new (lr
, ctype
, "tolower", charmap
);
371 #ifdef PREDEFINED_CLASSES
372 ctype_map_new (lr
, ctype
, "tosymmetric", charmap
);
375 /* Fill first 256 entries in `toXXX' arrays. */
376 for (cnt
= 0; cnt
< 256; ++cnt
)
378 ctype
->map_collection
[0][cnt
] = cnt
;
379 ctype
->map_collection
[1][cnt
] = cnt
;
380 #ifdef PREDEFINED_CLASSES
381 ctype
->map_collection
[2][cnt
] = cnt
;
383 ctype
->map256_collection
[0][cnt
] = cnt
;
384 ctype
->map256_collection
[1][cnt
] = cnt
;
387 if (enc_not_ascii_compatible
)
388 ctype
->to_nonascii
= 1;
390 obstack_init (&ctype
->mempool
);
/* A locale to copy from was given: share its LC_CTYPE data.  */
393 ctype
= locale
->categories
[LC_CTYPE
].ctype
=
394 copy_locale
->categories
[LC_CTYPE
].ctype
;
400 ctype_finish (struct localedef_t
*locale
, const struct charmap_t
*charmap
)
402 /* See POSIX.2, table 2-6 for the meaning of the following table. */
407 const char allow
[NCLASS
];
409 valid_table
[NCLASS
] =
411 /* The order is important. See token.h for more information.
412 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
413 { "upper", "--MX-XDDXXX-" },
414 { "lower", "--MX-XDDXXX-" },
415 { "alpha", "---X-XDDXXX-" },
416 { "digit", "XXX--XDDXXX-" },
417 { "xdigit", "-----XDDXXX-" },
418 { "space", "XXXXX------X" },
419 { "print", "---------X--" },
420 { "graph", "---------X--" },
421 { "blank", "XXXXXM-----X" },
422 { "cntrl", "XXXXX-XX--XX" },
423 { "punct", "XXXXX-DD-X-X" },
424 { "alnum", "-----XDDXXX-" }
428 uint32_t space_value
;
429 struct charseq
*space_seq
;
430 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
437 /* Now resolve copying and also handle completely missing definitions. */
440 const char *repertoire_name
;
442 /* First see whether we were supposed to copy. If yes, find the
443 actual definition. */
444 if (locale
->copy_name
[LC_CTYPE
] != NULL
)
446 /* Find the copying locale. This has to happen transitively since
447 the locale we are copying from might also copying another one. */
448 struct localedef_t
*from
= locale
;
451 from
= find_locale (LC_CTYPE
, from
->copy_name
[LC_CTYPE
],
452 from
->repertoire_name
, charmap
);
453 while (from
->categories
[LC_CTYPE
].ctype
== NULL
454 && from
->copy_name
[LC_CTYPE
] != NULL
);
456 ctype
= locale
->categories
[LC_CTYPE
].ctype
457 = from
->categories
[LC_CTYPE
].ctype
;
460 /* If there is still no definition issue an warning and create an
465 WITH_CUR_LOCALE (error (0, 0, _("\
466 No definition for %s category found"), "LC_CTYPE"));
467 ctype_startup (NULL
, locale
, charmap
, NULL
, 0);
468 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
471 /* Get the repertoire we have to use. */
472 repertoire_name
= locale
->repertoire_name
?: repertoire_global
;
473 if (repertoire_name
!= NULL
)
474 ctype
->repertoire
= repertoire_read (repertoire_name
);
477 /* We need the name of the currently used 8-bit character set to
478 make correct conversion between this 8-bit representation and the
479 ISO 10646 character set used internally for wide characters. */
480 ctype
->codeset_name
= charmap
->code_set_name
;
481 if (ctype
->codeset_name
== NULL
)
484 WITH_CUR_LOCALE (error (0, 0, _("\
485 No character set name specified in charmap")));
486 ctype
->codeset_name
= "//UNKNOWN//";
489 /* Set default value for classes not specified. */
490 set_class_defaults (ctype
, charmap
, ctype
->repertoire
);
492 /* Check according to table. */
493 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
495 uint32_t tmp
= ctype
->class_collection
[cnt
];
499 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
500 if ((tmp
& _ISwbit (cls1
)) != 0)
501 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
502 if (valid_table
[cls1
].allow
[cls2
] != '-')
504 int eq
= (tmp
& _ISwbit (cls2
)) != 0;
505 switch (valid_table
[cls1
].allow
[cls2
])
510 uint32_t value
= ctype
->charnames
[cnt
];
513 WITH_CUR_LOCALE (error (0, 0, _("\
514 character L'\\u%0*x' in class `%s' must be in class `%s'"),
515 value
> 0xffff ? 8 : 4,
517 valid_table
[cls1
].name
,
518 valid_table
[cls2
].name
));
525 uint32_t value
= ctype
->charnames
[cnt
];
528 WITH_CUR_LOCALE (error (0, 0, _("\
529 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
530 value
> 0xffff ? 8 : 4,
532 valid_table
[cls1
].name
,
533 valid_table
[cls2
].name
));
538 ctype
->class_collection
[cnt
] |= _ISwbit (cls2
);
542 WITH_CUR_LOCALE (error (5, 0, _("\
543 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
549 for (cnt
= 0; cnt
< 256; ++cnt
)
551 uint32_t tmp
= ctype
->class256_collection
[cnt
];
555 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
556 if ((tmp
& _ISbit (cls1
)) != 0)
557 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
558 if (valid_table
[cls1
].allow
[cls2
] != '-')
560 int eq
= (tmp
& _ISbit (cls2
)) != 0;
561 switch (valid_table
[cls1
].allow
[cls2
])
568 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
571 WITH_CUR_LOCALE (error (0, 0, _("\
572 character '%s' in class `%s' must be in class `%s'"),
574 valid_table
[cls1
].name
,
575 valid_table
[cls2
].name
));
584 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
587 WITH_CUR_LOCALE (error (0, 0, _("\
588 character '%s' in class `%s' must not be in class `%s'"),
590 valid_table
[cls1
].name
,
591 valid_table
[cls2
].name
));
596 ctype
->class256_collection
[cnt
] |= _ISbit (cls2
);
600 WITH_CUR_LOCALE (error (5, 0, _("\
601 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
607 /* ... and now test <SP> as a special case. */
609 if (((cnt
= BITPOS (tok_space
),
610 (ELEM (ctype
, class_collection
, , space_value
)
611 & BITw (tok_space
)) == 0)
612 || (cnt
= BITPOS (tok_blank
),
613 (ELEM (ctype
, class_collection
, , space_value
)
614 & BITw (tok_blank
)) == 0)))
617 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
618 valid_table
[cnt
].name
));
620 else if (((cnt
= BITPOS (tok_punct
),
621 (ELEM (ctype
, class_collection
, , space_value
)
622 & BITw (tok_punct
)) != 0)
623 || (cnt
= BITPOS (tok_graph
),
624 (ELEM (ctype
, class_collection
, , space_value
)
629 WITH_CUR_LOCALE (error (0, 0, _("\
630 <SP> character must not be in class `%s'"),
631 valid_table
[cnt
].name
));
634 ELEM (ctype
, class_collection
, , space_value
) |= BITw (tok_print
);
636 space_seq
= charmap_find_value (charmap
, "SP", 2);
637 if (space_seq
== NULL
)
638 space_seq
= charmap_find_value (charmap
, "space", 5);
639 if (space_seq
== NULL
)
640 space_seq
= charmap_find_value (charmap
, "U00000020", 9);
641 if (space_seq
== NULL
|| space_seq
->nbytes
!= 1)
644 WITH_CUR_LOCALE (error (0, 0, _("\
645 character <SP> not defined in character map")));
647 else if (((cnt
= BITPOS (tok_space
),
648 (ctype
->class256_collection
[space_seq
->bytes
[0]]
649 & BIT (tok_space
)) == 0)
650 || (cnt
= BITPOS (tok_blank
),
651 (ctype
->class256_collection
[space_seq
->bytes
[0]]
652 & BIT (tok_blank
)) == 0)))
655 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
656 valid_table
[cnt
].name
));
658 else if (((cnt
= BITPOS (tok_punct
),
659 (ctype
->class256_collection
[space_seq
->bytes
[0]]
660 & BIT (tok_punct
)) != 0)
661 || (cnt
= BITPOS (tok_graph
),
662 (ctype
->class256_collection
[space_seq
->bytes
[0]]
663 & BIT (tok_graph
)) != 0)))
666 WITH_CUR_LOCALE (error (0, 0, _("\
667 <SP> character must not be in class `%s'"),
668 valid_table
[cnt
].name
));
671 ctype
->class256_collection
[space_seq
->bytes
[0]] |= BIT (tok_print
);
673 /* Check whether all single-byte characters make to their upper/lowercase
674 equivalent according to the ASCII rules. */
675 for (cnt
= 'A'; cnt
<= 'Z'; ++cnt
)
677 uint32_t uppval
= ctype
->map256_collection
[0][cnt
];
678 uint32_t lowval
= ctype
->map256_collection
[1][cnt
];
679 uint32_t lowuppval
= ctype
->map256_collection
[0][lowval
];
680 uint32_t lowlowval
= ctype
->map256_collection
[1][lowval
];
683 || lowval
!= cnt
+ 0x20
685 || lowlowval
!= cnt
+ 0x20)
686 ctype
->nonascii_case
= 1;
688 for (cnt
= 0; cnt
< 256; ++cnt
)
689 if (cnt
< 'A' || (cnt
> 'Z' && cnt
< 'a') || cnt
> 'z')
690 if (ctype
->map256_collection
[0][cnt
] != cnt
691 || ctype
->map256_collection
[1][cnt
] != cnt
)
692 ctype
->nonascii_case
= 1;
694 /* Now that the tests are done make sure the name array contains all
695 characters which are handled in the WIDTH section of the
696 character set definition file. */
697 if (charmap
->width_rules
!= NULL
)
698 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
700 unsigned char bytes
[charmap
->mb_cur_max
];
701 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
703 /* We have the range of character for which the width is
704 specified described using byte sequences of the multibyte
705 charset. We have to convert this to UCS4 now. And we
706 cannot simply convert the beginning and the end of the
707 sequence, we have to iterate over the byte sequence and
708 convert it for every single character. */
709 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
711 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
712 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
715 /* Find the UCS value for `bytes'. */
719 = charmap_find_symbol (charmap
, (char *) bytes
, nbytes
);
722 wch
= ILLEGAL_CHAR_VALUE
;
723 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
726 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
729 if (wch
!= ILLEGAL_CHAR_VALUE
)
730 /* We are only interested in the side-effects of the
731 `find_idx' call. It will add appropriate entries in
732 the name array if this is necessary. */
733 (void) find_idx (ctype
, NULL
, NULL
, NULL
, wch
);
735 /* "Increment" the bytes sequence. */
737 while (inner
>= 0 && bytes
[inner
] == 0xff)
742 /* We have to extend the byte sequence. */
743 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
747 memset (&bytes
[1], 0, nbytes
);
753 while (++inner
< nbytes
)
759 /* Now set all the other characters of the character set to the
762 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
764 struct charseq
*data
= (struct charseq
*) vdata
;
766 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
767 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
770 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
771 (void) find_idx (ctype
, NULL
, NULL
, NULL
, data
->ucs4
);
774 /* There must be a multiple of 10 digits. */
775 if (ctype
->mbdigits_act
% 10 != 0)
777 assert (ctype
->mbdigits_act
== ctype
->wcdigits_act
);
778 ctype
->wcdigits_act
-= ctype
->mbdigits_act
% 10;
779 ctype
->mbdigits_act
-= ctype
->mbdigits_act
% 10;
780 WITH_CUR_LOCALE (error (0, 0, _("\
781 `digit' category has not entries in groups of ten")));
784 /* Check the input digits. There must be a multiple of ten available.
785 In each group it could be that one or the other character is missing.
786 In this case the whole group must be removed. */
788 while (cnt
< ctype
->mbdigits_act
)
791 for (inner
= 0; inner
< 10; ++inner
)
792 if (ctype
->mbdigits
[cnt
+ inner
] == NULL
)
799 /* Remove the group. */
800 memmove (&ctype
->mbdigits
[cnt
], &ctype
->mbdigits
[cnt
+ 10],
801 ((ctype
->wcdigits_act
- cnt
- 10)
802 * sizeof (ctype
->mbdigits
[0])));
803 ctype
->mbdigits_act
-= 10;
807 /* If no input digits are given use the default. */
808 if (ctype
->mbdigits_act
== 0)
810 if (ctype
->mbdigits_max
== 0)
812 ctype
->mbdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
813 10 * sizeof (struct charseq
*));
814 ctype
->mbdigits_max
= 10;
817 for (cnt
= 0; cnt
< 10; ++cnt
)
819 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
820 (char *) digits
+ cnt
, 1);
821 if (ctype
->mbdigits
[cnt
] == NULL
)
823 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
825 strlen (longnames
[cnt
]));
826 if (ctype
->mbdigits
[cnt
] == NULL
)
828 /* Hum, this ain't good. */
829 WITH_CUR_LOCALE (error (0, 0, _("\
830 no input digits defined and none of the standard names in the charmap")));
832 ctype
->mbdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
833 sizeof (struct charseq
) + 1);
835 /* This is better than nothing. */
836 ctype
->mbdigits
[cnt
]->bytes
[0] = digits
[cnt
];
837 ctype
->mbdigits
[cnt
]->nbytes
= 1;
842 ctype
->mbdigits_act
= 10;
845 /* Check the wide character input digits. There must be a multiple
846 of ten available. In each group it could be that one or the other
847 character is missing. In this case the whole group must be
850 while (cnt
< ctype
->wcdigits_act
)
853 for (inner
= 0; inner
< 10; ++inner
)
854 if (ctype
->wcdigits
[cnt
+ inner
] == ILLEGAL_CHAR_VALUE
)
861 /* Remove the group. */
862 memmove (&ctype
->wcdigits
[cnt
], &ctype
->wcdigits
[cnt
+ 10],
863 ((ctype
->wcdigits_act
- cnt
- 10)
864 * sizeof (ctype
->wcdigits
[0])));
865 ctype
->wcdigits_act
-= 10;
869 /* If no input digits are given use the default. */
870 if (ctype
->wcdigits_act
== 0)
872 if (ctype
->wcdigits_max
== 0)
874 ctype
->wcdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
875 10 * sizeof (uint32_t));
876 ctype
->wcdigits_max
= 10;
879 for (cnt
= 0; cnt
< 10; ++cnt
)
880 ctype
->wcdigits
[cnt
] = L
'0' + cnt
;
882 ctype
->mbdigits_act
= 10;
885 /* Check the outdigits. */
887 for (cnt
= 0; cnt
< 10; ++cnt
)
888 if (ctype
->mboutdigits
[cnt
] == NULL
)
890 static struct charseq replace
[2];
894 WITH_CUR_LOCALE (error (0, 0, _("\
895 not all characters used in `outdigit' are available in the charmap")));
899 replace
[0].nbytes
= 1;
900 replace
[0].bytes
[0] = '?';
901 replace
[0].bytes
[1] = '\0';
902 ctype
->mboutdigits
[cnt
] = &replace
[0];
906 for (cnt
= 0; cnt
< 10; ++cnt
)
907 if (ctype
->wcoutdigits
[cnt
] == 0)
911 WITH_CUR_LOCALE (error (0, 0, _("\
912 not all characters used in `outdigit' are available in the repertoire")));
916 ctype
->wcoutdigits
[cnt
] = L
'?';
919 /* Sort the entries in the translit_ignore list. */
920 if (ctype
->translit_ignore
!= NULL
)
922 struct translit_ignore_t
*firstp
= ctype
->translit_ignore
;
923 struct translit_ignore_t
*runp
;
925 ctype
->ntranslit_ignore
= 1;
927 for (runp
= firstp
->next
; runp
!= NULL
; runp
= runp
->next
)
929 struct translit_ignore_t
*lastp
= NULL
;
930 struct translit_ignore_t
*cmpp
;
932 ++ctype
->ntranslit_ignore
;
934 for (cmpp
= firstp
; cmpp
!= NULL
; lastp
= cmpp
, cmpp
= cmpp
->next
)
935 if (runp
->from
< cmpp
->from
)
943 ctype
->translit_ignore
= firstp
;
/* Write the LC_CTYPE category of LOCALE to OUTPUT_PATH.  allocate_arrays
   first prepares the binary arrays; then NELEMS items are added to FILE in
   index order and the result is passed to write_locale_data.  Items below
   the index of _NL_CTYPE_EXTRA_MAP_1 are the fixed LC_CTYPE fields; the
   items from that index on are one table per character class followed by
   one table per character map.  */
949 ctype_output (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
950 const char *output_path
)
952 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
953 const size_t nelems
= (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
)
954 + ctype
->nr_charclass
+ ctype
->map_collection_nr
);
955 struct locale_file file
;
956 uint32_t default_missing_len
;
959 /* Now prepare the output: Find the sizes of the table we can use. */
960 allocate_arrays (ctype
, charmap
, ctype
->repertoire
);
962 default_missing_len
= (ctype
->default_missing
963 ? wcslen ((wchar_t *) ctype
->default_missing
)
966 init_locale_data (&file
, nelems
);
967 for (elem
= 0; elem
< nelems
; ++elem
)
969 if (elem
< _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
))
972 #define CTYPE_EMPTY(name) \
974 add_locale_empty (&file); \
977 CTYPE_EMPTY(_NL_CTYPE_GAP1
);
978 CTYPE_EMPTY(_NL_CTYPE_GAP2
);
979 CTYPE_EMPTY(_NL_CTYPE_GAP3
);
980 CTYPE_EMPTY(_NL_CTYPE_GAP4
);
981 CTYPE_EMPTY(_NL_CTYPE_GAP5
);
982 CTYPE_EMPTY(_NL_CTYPE_GAP6
);
984 #define CTYPE_RAW_DATA(name, base, size) \
985 case _NL_ITEM_INDEX (name): \
986 add_locale_raw_data (&file, base, size); \
989 CTYPE_RAW_DATA (_NL_CTYPE_CLASS
,
991 (256 + 128) * sizeof (char_class_t
));
993 #define CTYPE_UINT32_ARRAY(name, base, n_elems) \
994 case _NL_ITEM_INDEX (name): \
995 add_locale_uint32_array (&file, base, n_elems); \
998 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER
, ctype
->map_b
[0], 256 + 128);
999 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER
, ctype
->map_b
[1], 256 + 128);
1000 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32
, ctype
->map32_b
[0], 256);
1001 CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32
, ctype
->map32_b
[1], 256);
1002 CTYPE_RAW_DATA (_NL_CTYPE_CLASS32
,
1004 256 * sizeof (char_class32_t
));
1006 #define CTYPE_UINT32(name, value) \
1007 case _NL_ITEM_INDEX (name): \
1008 add_locale_uint32 (&file, value); \
1011 CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET
, ctype
->class_offset
);
1012 CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET
, ctype
->map_offset
);
1013 CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE
, ctype
->translit_idx_size
);
1015 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX
,
1016 ctype
->translit_from_idx
,
1017 ctype
->translit_idx_size
);
/* The *_tbl_size members are divided by sizeof (uint32_t) to obtain the
   element counts expected by add_locale_uint32_array.  */
1019 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL
,
1020 ctype
->translit_from_tbl
,
1021 ctype
->translit_from_tbl_size
1022 / sizeof (uint32_t));
1024 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX
,
1025 ctype
->translit_to_idx
,
1026 ctype
->translit_idx_size
);
1028 CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL
,
1029 ctype
->translit_to_tbl
,
1030 ctype
->translit_to_tbl_size
/ sizeof (uint32_t));
1032 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
1033 /* The class name array. */
1034 start_locale_structure (&file
);
1035 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1036 add_locale_string (&file
, ctype
->classnames
[cnt
]);
1037 add_locale_char (&file
, 0);
1038 align_locale_data (&file
, LOCFILE_ALIGN
);
1039 end_locale_structure (&file
);
1042 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
1043 /* The map name array. */
1044 start_locale_structure (&file
);
1045 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1046 add_locale_string (&file
, ctype
->mapnames
[cnt
]);
1047 add_locale_char (&file
, 0);
1048 align_locale_data (&file
, LOCFILE_ALIGN
);
1049 end_locale_structure (&file
);
1052 case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH
):
1053 add_locale_wcwidth_table (&file
, &ctype
->width
);
1056 CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX
, ctype
->mb_cur_max
);
1058 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
1059 add_locale_string (&file
, ctype
->codeset_name
);
1062 CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII
, ctype
->to_nonascii
);
1064 CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE
, ctype
->nonascii_case
);
/* The digit counts are written as numbers of groups of ten.  */
1066 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN
):
1067 add_locale_uint32 (&file
, ctype
->mbdigits_act
/ 10);
1070 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN
):
1071 add_locale_uint32 (&file
, ctype
->wcdigits_act
/ 10);
/* Item N of this range collects digit N of every group: the loop starts at
   the digit's offset and advances by ten.  */
1074 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB
):
1075 start_locale_structure (&file
);
1076 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
);
1077 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
1079 add_locale_raw_data (&file
, ctype
->mbdigits
[cnt
]->bytes
,
1080 ctype
->mbdigits
[cnt
]->nbytes
);
1081 add_locale_char (&file
, 0);
1083 end_locale_structure (&file
);
1086 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB
):
1087 start_locale_structure (&file
);
1088 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
);
1089 add_locale_raw_data (&file
, ctype
->mboutdigits
[cnt
]->bytes
,
1090 ctype
->mboutdigits
[cnt
]->nbytes
);
1091 add_locale_char (&file
, 0);
1092 end_locale_structure (&file
);
1095 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC
):
1096 start_locale_structure (&file
);
1097 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
);
1098 cnt
< ctype
->wcdigits_act
; cnt
+= 10)
1099 add_locale_uint32 (&file
, ctype
->wcdigits
[cnt
]);
1100 end_locale_structure (&file
);
1103 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC
):
1104 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
);
1105 add_locale_uint32 (&file
, ctype
->wcoutdigits
[cnt
]);
1108 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN
):
1109 add_locale_uint32 (&file
, default_missing_len
);
1112 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING
):
1113 add_locale_uint32_array (&file
, ctype
->default_missing
,
1114 default_missing_len
);
1117 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN
):
1118 add_locale_uint32 (&file
, ctype
->ntranslit_ignore
);
/* Each ignore entry is written as the three values from, to and step.  */
1121 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE
):
1122 start_locale_structure (&file
);
1124 struct translit_ignore_t
*runp
;
1125 for (runp
= ctype
->translit_ignore
; runp
!= NULL
;
1128 add_locale_uint32 (&file
, runp
->from
);
1129 add_locale_uint32 (&file
, runp
->to
);
1130 add_locale_uint32 (&file
, runp
->step
);
1133 end_locale_structure (&file
);
1137 assert (! "unknown CTYPE element");
1141 /* Handle extra maps. */
1142 size_t nr
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
/* The first nr_charclass extra items are class tables: a prelude of
   256 / 32 words from class_b followed by the class's wctype table.  */
1143 if (nr
< ctype
->nr_charclass
)
1145 start_locale_prelude (&file
);
1146 add_locale_uint32_array (&file
, ctype
->class_b
[nr
], 256 / 32);
1147 end_locale_prelude (&file
);
1148 add_locale_wctype_table (&file
, &ctype
->class_3level
[nr
]);
/* The remaining extra items are the wctrans tables of the maps.  */
1152 nr
-= ctype
->nr_charclass
;
1153 assert (nr
< ctype
->map_collection_nr
);
1154 add_locale_wctrans_table (&file
, &ctype
->map_3level
[nr
]);
1159 write_locale_data (output_path
, LC_CTYPE
, "LC_CTYPE", &file
);
1163 /* Local functions. */
/* Register NAME as a new character class of CTYPE.  A name that is already
   present is reported through lr_error on LR.  When MAX_NR_CHARCLASS
   classes exist already, error () is called with status 2.  Otherwise the
   NAME pointer itself is stored in the next free classnames slot and
   nr_charclass is incremented.
   NOTE(review): the statements between the visible lines (presumably a
   break and an early return) are not shown here -- confirm.  */
1165 ctype_class_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1170 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1171 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
1174 if (cnt
< ctype
->nr_charclass
)
1176 lr_error (lr
, _("character class `%s' already defined"), name
);
1180 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
1181 /* Exit code 2 is prescribed in P1003.2b. */
1182 WITH_CUR_LOCALE (error (2, 0, _("\
1183 implementation limit: no more than %Zd character classes allowed"),
1186 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
/* Register NAME as a new character map of CTYPE.  A name that is already
   present is reported through lr_error on LR; when MAX_NR_CHARMAP maps
   exist already, error () is called with status 2.  Otherwise the NAME
   pointer is stored in mapnames, a zero-filled uint32_t array is allocated
   for the map, its used length is set to 256 and map_collection_nr is
   incremented.  While scanning the existing maps the largest
   map_collection_max is remembered in max_chars.
   NOTE(review): the condition choosing between the 256/512 default size
   (depending on CHARMAP's mb_cur_max) and max_chars is not visible here --
   presumably max_chars == 0; confirm.  */
1191 ctype_map_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1192 const char *name
, const struct charmap_t
*charmap
)
1194 size_t max_chars
= 0;
1197 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1199 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
1202 if (max_chars
< ctype
->map_collection_max
[cnt
])
1203 max_chars
= ctype
->map_collection_max
[cnt
];
1206 if (cnt
< ctype
->map_collection_nr
)
1208 lr_error (lr
, _("character map `%s' already defined"), name
);
1212 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
1213 /* Exit code 2 is prescribed in P1003.2b. */
1214 WITH_CUR_LOCALE (error (2, 0, _("\
1215 implementation limit: no more than %d character maps allowed"),
1218 ctype
->mapnames
[cnt
] = name
;
1221 ctype
->map_collection_max
[cnt
] = charmap
->mb_cur_max
== 1 ? 256 : 512;
1223 ctype
->map_collection_max
[cnt
] = max_chars
;
1225 ctype
->map_collection
[cnt
] = (uint32_t *)
1226 xcalloc (sizeof (uint32_t), ctype
->map_collection_max
[cnt
]);
1227 ctype
->map_collection_act
[cnt
] = 256;
1229 ++ctype
->map_collection_nr
;
/* Return a pointer to the element of *TABLE that belongs to the
   character value IDX, or NULL.

   One return path indexes *TABLE directly with IDX (yielding NULL when
   TABLE itself is NULL); NOTE(review): the condition guarding it is
   not visible here -- presumably small IDX values.  Otherwise the
   position of IDX in ctype->charnames is looked up through
   idx_table_get on ctype->charnames_idx.  A value that is not yet
   known is appended to ctype->charnames (the array is doubled with
   xrealloc when full) and recorded with idx_table_add.  If the
   resulting position does not fit into the table, *MAX is enlarged
   until it exceeds the position, the table is reallocated with
   xrealloc and the newly added tail is cleared with memset.  */
1233 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1234 is possible if we only want to extend the name array. */
1236 find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
, size_t *max
,
1237 size_t *act
, uint32_t idx
)
1242 return table
== NULL
? NULL
: &(*table
)[idx
];
1244 /* Use the charnames_idx lookup table instead of the slow search loop. */
1246 cnt
= idx_table_get (&ctype
->charnames_idx
, idx
);
1249 cnt
= ctype
->charnames_act
;
/* Linear search over the names starting at position 256.
   NOTE(review): given the comment above, this loop is presumably the
   disabled alternative to the idx_table_get lookup -- confirm.  */
1251 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
1252 if (ctype
->charnames
[cnt
] == idx
)
1256 /* We have to distinguish two cases: the name is found or not. */
1257 if (cnt
== ctype
->charnames_act
)
1259 /* Extend the name array. */
1260 if (ctype
->charnames_act
== ctype
->charnames_max
)
1262 ctype
->charnames_max
*= 2;
1263 ctype
->charnames
= (uint32_t *)
1264 xrealloc (ctype
->charnames
,
1265 sizeof (uint32_t) * ctype
->charnames_max
);
1267 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
1268 idx_table_add (&ctype
->charnames_idx
, idx
, cnt
);
1272 /* We have done everything we are asked to do. */
1276 /* The caller does not want to extend the table. */
1277 return (cnt
>= *act
? NULL
: &(*table
)[cnt
]);
/* Remember the old capacity so that only the new part is cleared.  */
1283 size_t old_max
= *max
;
1286 while (*max
<= cnt
);
1289 (uint32_t *) xrealloc (*table
, *max
* sizeof (uint32_t));
1290 memset (&(*table
)[old_max
], '\0',
1291 (*max
- old_max
) * sizeof (uint32_t));
1297 return &(*table
)[cnt
];
/* Translate the token NOW into a charmap byte sequence, stored in
   *SEQP, and a UCS4 value, stored in *WCHP.

   tok_bsymbol: the value comes from repertoire_find_value and the
   sequence from charmap_find_value, both keyed by the symbol name.
   tok_ucs4: the sequence is searched, in order of appearance below,
   under the name formatted as "U%08X", with repertoire_find_seq, and
   under the symbol returned by repertoire_find_symbol; a negative
   entry may be inserted into repertoire->seq_table; *WCHP receives
   now->val.ucs4.
   tok_charcode: the sequence comes from charmap_find_symbol; *WCHP is
   either ILLEGAL_CHAR_VALUE or the sequence's ucs4 member, which is
   filled in with repertoire_find_value while it is still
   UNINITIALIZED_CHAR_VALUE.

   NOTE(review): the conditions between the lookup steps and the return
   statements are not visible here.  A caller in ctype_read tests the
   result in an `if', so presumably nonzero means the token could not
   be handled -- confirm.  */
1302 get_character (struct token
*now
, const struct charmap_t
*charmap
,
1303 struct repertoire_t
*repertoire
,
1304 struct charseq
**seqp
, uint32_t *wchp
)
1306 if (now
->tok
== tok_bsymbol
)
1308 /* This will hopefully be the normal case. */
1309 *wchp
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1310 now
->val
.str
.lenmb
);
1311 *seqp
= charmap_find_value (charmap
, now
->val
.str
.startmb
,
1312 now
->val
.str
.lenmb
);
1314 else if (now
->tok
== tok_ucs4
)
/* The name has the form U followed by eight hex digits, hence the
   length 9 passed to charmap_find_value.  */
1318 snprintf (utmp
, sizeof (utmp
), "U%08X", now
->val
.ucs4
);
1319 *seqp
= charmap_find_value (charmap
, utmp
, 9);
1322 *seqp
= repertoire_find_seq (repertoire
, now
->val
.ucs4
);
1326 /* Compute the value in the charmap from the UCS value. */
1327 const char *symbol
= repertoire_find_symbol (repertoire
,
1333 *seqp
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1337 if (repertoire
!= NULL
)
1339 /* Insert a negative entry. */
1340 static const struct charseq negative
1341 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1342 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1344 *newp
= now
->val
.ucs4
;
1346 insert_entry (&repertoire
->seq_table
, newp
,
1347 sizeof (uint32_t), (void *) &negative
);
1351 (*seqp
)->ucs4
= now
->val
.ucs4
;
1353 else if ((*seqp
)->ucs4
!= now
->val
.ucs4
)
1356 *wchp
= now
->val
.ucs4
;
1358 else if (now
->tok
== tok_charcode
)
1360 /* We must map from the byte code to UCS4. */
1361 *seqp
= charmap_find_symbol (charmap
, now
->val
.str
.startmb
,
1362 now
->val
.str
.lenmb
);
1365 *wchp
= ILLEGAL_CHAR_VALUE
;
/* Resolve the UCS4 value lazily the first time the sequence is used.  */
1368 if ((*seqp
)->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1369 (*seqp
)->ucs4
= repertoire_find_value (repertoire
, (*seqp
)->name
,
1370 strlen ((*seqp
)->name
));
1371 *wchp
= (*seqp
)->ucs4
;
/* Expand a range given by two symbolic names: LAST_STR and the name in
   the token NOW.

   Both names must have the same length, otherwise the "not valid names
   for symbolic range" message is produced.  Identical names need no
   work.  The common prefix is skipped and the remaining suffixes are
   parsed with strtoul in BASE, giving FROM and TO.  Unless
   IGNORE_CONTENT is set, FROM is advanced by STEP until it exceeds TO;
   each intermediate name is formatted into TMP (zero padded to the
   original suffix width; decimal when BASE is 10, upper-case hex
   otherwise) and resolved with get_character.  For every name
   CLASS256_BIT is or-ed into class256_collection when the sequence is
   a single byte, CLASS_BIT is or-ed in through find_idx when the wide
   value is valid and CLASS_BIT is nonzero, and the digit tables are
   extended when HANDLE_DIGITS is 1 (mbdigits/wcdigits) or 2
   (mboutdigits/wcoutdigits, limited to ten entries).  */
1381 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1382 the .(2). counterparts. */
1384 charclass_symbolic_ellipsis (struct linereader
*ldfile
,
1385 struct locale_ctype_t
*ctype
,
1386 const struct charmap_t
*charmap
,
1387 struct repertoire_t
*repertoire
,
1389 const char *last_str
,
1390 unsigned long int class256_bit
,
1391 unsigned long int class_bit
, int base
,
1392 int ignore_content
, int handle_digits
, int step
)
1394 const char *nowstr
= now
->val
.str
.startmb
;
1395 char tmp
[now
->val
.str
.lenmb
+ 1];
1398 unsigned long int from
;
1399 unsigned long int to
;
1401 /* We have to compute the ellipsis values using the symbolic names. */
1402 assert (last_str
!= NULL
);
1404 if (strlen (last_str
) != now
->val
.str
.lenmb
)
1408 _("`%s' and `%.*s' are not valid names for symbolic range"),
1409 last_str
, (int) now
->val
.str
.lenmb
, nowstr
);
1413 if (memcmp (last_str
, nowstr
, now
->val
.str
.lenmb
) == 0)
1414 /* Nothing to do, the names are the same. */
/* Skip the prefix both names have in common.  */
1417 for (cp
= last_str
; *cp
== *(nowstr
+ (cp
- last_str
)); ++cp
)
1421 from
= strtoul (cp
, &endp
, base
);
/* NOTE(review): strtoul reports overflow by returning ULONG_MAX, so
   the comparison with UINT_MAX only matches where unsigned long and
   unsigned int have the same width.  No reset of errno before the
   call is visible here either -- confirm.  */
1422 if ((from
== UINT_MAX
&& errno
== ERANGE
) || *endp
!= '\0')
1425 to
= strtoul (nowstr
+ (cp
- last_str
), &endp
, base
);
1426 if ((to
== UINT_MAX
&& errno
== ERANGE
)
1427 || (endp
- nowstr
) != now
->val
.str
.lenmb
|| from
>= to
)
1430 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1431 if (!ignore_content
)
/* From here on the token points at TMP, so get_character resolves the
   generated name.  */
1433 now
->val
.str
.startmb
= tmp
;
1434 while ((from
+= step
) <= to
)
1436 struct charseq
*seq
;
/* NOTE(review): the value formatted here is presumably FROM, an
   unsigned long, while %ld expects a signed long -- confirm.  */
1439 sprintf (tmp
, (base
== 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1440 (int) (cp
- last_str
), last_str
,
1441 (int) (now
->val
.str
.lenmb
- (cp
- last_str
)),
1444 get_character (now
, charmap
, repertoire
, &seq
, &wch
);
1446 if (seq
!= NULL
&& seq
->nbytes
== 1)
1447 /* Yep, we can store information about this byte sequence. */
1448 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1450 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1451 /* We have the UCS4 position. */
1452 *find_idx (ctype
, &ctype
->class_collection
,
1453 &ctype
->class_collection_max
,
1454 &ctype
->class_collection_act
, wch
) |= class_bit
;
1456 if (handle_digits
== 1)
1458 /* We must store the digit values. */
1459 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1461 ctype
->mbdigits_max
*= 2;
1462 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1463 (ctype
->mbdigits_max
1464 * sizeof (char *)));
1465 ctype
->wcdigits_max
*= 2;
1466 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1467 (ctype
->wcdigits_max
1468 * sizeof (uint32_t)));
1471 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1472 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1474 else if (handle_digits
== 2)
1476 /* We must store the digit values. */
1477 if (ctype
->outdigits_act
>= 10)
1479 lr_error (ldfile
, _("\
1480 %s: field `%s' does not contain exactly ten entries"),
1481 "LC_CTYPE", "outdigit");
1485 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1486 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1487 ++ctype
->outdigits_act
;
/* Expand a range whose end points are UCS4 values: LAST_WCH and
   now->val.ucs4.

   A from-value larger than the to-value is reported through lr_error.
   Unless IGNORE_CONTENT is set, LAST_WCH is advanced by STEP until it
   exceeds the to-value.  For each value a byte sequence is searched in
   the charmap under the names formatted as "U%08X" and "U%04X", then
   with repertoire_find_seq, then under the symbol from
   repertoire_find_symbol; a static `negative' sequence serves as a
   placeholder.  CLASS256_BIT is recorded for matching single-byte
   sequences, CLASS_BIT is or-ed in through find_idx, and the digit
   tables are extended when HANDLE_DIGITS is 1 or 2 (the latter limited
   to ten entries).

   NOTE(review): the conditions that chain the individual lookups are
   not visible here -- confirm the exact fallback order.  */
1494 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1496 charclass_ucs4_ellipsis (struct linereader
*ldfile
,
1497 struct locale_ctype_t
*ctype
,
1498 const struct charmap_t
*charmap
,
1499 struct repertoire_t
*repertoire
,
1500 struct token
*now
, uint32_t last_wch
,
1501 unsigned long int class256_bit
,
1502 unsigned long int class_bit
, int ignore_content
,
1503 int handle_digits
, int step
)
1505 if (last_wch
> now
->val
.ucs4
)
1507 lr_error (ldfile
, _("\
1508 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1509 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, now
->val
.ucs4
,
1510 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, last_wch
);
1514 if (!ignore_content
)
/* LAST_WCH is incremented before each test, so the from-value itself
   is not processed again here.  */
1515 while ((last_wch
+= step
) <= now
->val
.ucs4
)
1517 /* We have to find out whether there is a byte sequence corresponding
1518 to this UCS4 value. */
1519 struct charseq
*seq
;
1522 snprintf (utmp
, sizeof (utmp
), "U%08X", last_wch
);
1523 seq
= charmap_find_value (charmap
, utmp
, 9);
1526 snprintf (utmp
, sizeof (utmp
), "U%04X", last_wch
);
1527 seq
= charmap_find_value (charmap
, utmp
, 5);
1531 /* Try looking in the repertoire map. */
1532 seq
= repertoire_find_seq (repertoire
, last_wch
);
1534 /* If this is the first time we look for this sequence create a new
1538 static const struct charseq negative
1539 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1541 /* Find the symbolic name for this UCS4 value. */
1542 if (repertoire
!= NULL
)
1544 const char *symbol
= repertoire_find_symbol (repertoire
,
1546 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1551 /* We have a name, now search the multibyte value. */
1552 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1555 /* We have to create a fake entry. */
1556 seq
= (struct charseq
*) &negative
;
1558 seq
->ucs4
= last_wch
;
1560 insert_entry (&repertoire
->seq_table
, newp
, sizeof (uint32_t),
1564 /* We have to create a fake entry. */
1565 seq
= (struct charseq
*) &negative
;
1568 /* We have a name, now search the multibyte value. */
1569 if (seq
->ucs4
== last_wch
&& seq
->nbytes
== 1)
1570 /* Yep, we can store information about this byte sequence. */
1571 ctype
->class256_collection
[(size_t) seq
->bytes
[0]]
1574 /* And of course we have the UCS4 position. */
1576 *find_idx (ctype
, &ctype
->class_collection
,
1577 &ctype
->class_collection_max
,
1578 &ctype
->class_collection_act
, last_wch
) |= class_bit
;
1580 if (handle_digits
== 1)
1582 /* We must store the digit values. */
1583 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1585 ctype
->mbdigits_max
*= 2;
1586 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1587 (ctype
->mbdigits_max
1588 * sizeof (char *)));
1589 ctype
->wcdigits_max
*= 2;
1590 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1591 (ctype
->wcdigits_max
1592 * sizeof (uint32_t)));
1595 ctype
->mbdigits
[ctype
->mbdigits_act
++] = (seq
->ucs4
== last_wch
1597 ctype
->wcdigits
[ctype
->wcdigits_act
++] = last_wch
;
1599 else if (handle_digits
== 2)
1601 /* We must store the digit values. */
1602 if (ctype
->outdigits_act
>= 10)
1604 lr_error (ldfile
, _("\
1605 %s: field `%s' does not contain exactly ten entries"),
1606 "LC_CTYPE", "outdigit");
1610 ctype
->mboutdigits
[ctype
->outdigits_act
] = (seq
->ucs4
== last_wch
1612 ctype
->wcoutdigits
[ctype
->outdigits_act
] = last_wch
;
1613 ++ctype
->outdigits_act
;
/* Expand a range whose end points are raw byte sequences:
   LAST_CHARCODE (LAST_CHARCODE_LEN bytes) and now->val.charcode.

   Both sequences must have the same length and the to-value must not
   compare smaller (memcmp) than the from-value; violations are
   reported through lr_error.  Unless IGNORE_CONTENT is set,
   LAST_CHARCODE is incremented in place, starting at its last byte,
   until it equals the to-value.  For every step a single-byte code
   updates class256_collection, the symbol is looked up with
   charmap_find_symbol to obtain the wide value (resolved lazily with
   repertoire_find_value), CLASS_BIT is or-ed in through find_idx for
   valid wide values, and for HANDLE_DIGITS 1 or 2 a freshly allocated
   struct charseq carrying a copy of the bytes is stored in the digit
   tables.  */
1619 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1621 charclass_charcode_ellipsis (struct linereader
*ldfile
,
1622 struct locale_ctype_t
*ctype
,
1623 const struct charmap_t
*charmap
,
1624 struct repertoire_t
*repertoire
,
1625 struct token
*now
, char *last_charcode
,
1626 uint32_t last_charcode_len
,
1627 unsigned long int class256_bit
,
1628 unsigned long int class_bit
, int ignore_content
,
1631 /* First check whether the to-value is larger. */
1632 if (now
->val
.charcode
.nbytes
!= last_charcode_len
)
1634 lr_error (ldfile
, _("\
1635 start and end character sequence of range must have the same length"));
1639 if (memcmp (last_charcode
, now
->val
.charcode
.bytes
, last_charcode_len
) > 0)
1641 lr_error (ldfile
, _("\
1642 to-value character sequence is smaller than from-value sequence"));
1646 if (!ignore_content
)
1650 /* Increment the byte sequence value. */
1651 struct charseq
*seq
;
/* Increment from the last byte towards the first.
   NOTE(review): the declaration of i is not visible here; the test
   i >= 0 can only end the loop if i has a signed type -- confirm.  */
1655 for (i
= last_charcode_len
- 1; i
>= 0; --i
)
1656 if (++last_charcode
[i
] != 0)
/* NOTE(review): LAST_CHARCODE points to plain char.  Where char is
   signed, a byte value of 0x80 or above is negative and the cast to
   size_t turns it into a very large index -- confirm whether an
   unsigned char conversion is needed.  */
1659 if (last_charcode_len
== 1)
1660 /* Of course we have the charcode value. */
1661 ctype
->class256_collection
[(size_t) last_charcode
[0]]
1664 /* Find the symbolic name. */
1665 seq
= charmap_find_symbol (charmap
, last_charcode
,
1669 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1670 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1671 strlen (seq
->name
));
1672 wch
= seq
== NULL
? ILLEGAL_CHAR_VALUE
: seq
->ucs4
;
1674 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1675 *find_idx (ctype
, &ctype
->class_collection
,
1676 &ctype
->class_collection_max
,
1677 &ctype
->class_collection_act
, wch
) |= class_bit
;
1680 wch
= ILLEGAL_CHAR_VALUE
;
1682 if (handle_digits
== 1)
1684 /* We must store the digit values. */
1685 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1687 ctype
->mbdigits_max
*= 2;
1688 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1689 (ctype
->mbdigits_max
1690 * sizeof (char *)));
1691 ctype
->wcdigits_max
*= 2;
1692 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1693 (ctype
->wcdigits_max
1694 * sizeof (uint32_t)));
/* The bytes are copied into the storage directly behind the newly
   allocated struct charseq.  */
1697 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1698 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1699 seq
->nbytes
= last_charcode_len
;
1701 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1702 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1704 else if (handle_digits
== 2)
1706 struct charseq
*seq
;
1707 /* We must store the digit values. */
1708 if (ctype
->outdigits_act
>= 10)
1710 lr_error (ldfile
, _("\
1711 %s: field `%s' does not contain exactly ten entries"),
1712 "LC_CTYPE", "outdigit");
1716 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1717 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1718 seq
->nbytes
= last_charcode_len
;
1720 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1721 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1722 ++ctype
->outdigits_act
;
1725 while (memcmp (last_charcode
, now
->val
.charcode
.bytes
,
1726 last_charcode_len
) != 0);
/* Search the transliteration data of CTYPE itself for the character
   WCH.

   First the ctype->translit list is walked.  Only entries whose
   from-string consists of exactly the one character WCH are
   considered.  For such an entry every to-string is scanned character
   by character; each character is formatted as "U%08X" and looked up
   with charmap_find_value to see whether CHARMAP can represent it.  A
   to-string whose scan reaches the terminating zero is fully
   representable.  Afterwards the ctype->translit_ignore list is
   checked for a range [from, to] containing WCH, stepping through the
   range in increments of the entry's step.

   NOTE(review): the return statements are not visible here;
   presumably the representable to-string, a marker for ignored
   characters, or NULL is returned -- confirm.  */
1732 find_translit2 (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
1735 struct translit_t
*trunp
= ctype
->translit
;
1736 struct translit_ignore_t
*tirunp
= ctype
->translit_ignore
;
1738 while (trunp
!= NULL
)
1740 /* XXX We simplify things here. The transliterations we look
1741 for are only allowed to have one character. */
1742 if (trunp
->from
[0] == wch
&& trunp
->from
[1] == 0)
1744 /* Found it. Now look for a transliteration which can be
1745 represented with the character set. */
1746 struct translit_to_t
*torunp
= trunp
->to
;
1748 while (torunp
!= NULL
)
1752 for (i
= 0; torunp
->str
[i
] != 0; ++i
)
1756 snprintf (utmp
, sizeof (utmp
), "U%08X", torunp
->str
[i
]);
1757 if (charmap_find_value (charmap
, utmp
, 9) == NULL
)
1758 /* This character cannot be represented. */
/* Reaching the terminator means no character of this candidate was
   rejected by the loop above.  */
1762 if (torunp
->str
[i
] == 0)
1765 torunp
= torunp
->next
;
1771 trunp
= trunp
->next
;
1774 /* Check for ignored chars. */
1775 while (tirunp
!= NULL
)
1777 if (tirunp
->from
<= wch
&& tirunp
->to
>= wch
)
1781 for (wi
= tirunp
->from
; wi
<= wch
; wi
+= tirunp
->step
)
1787 /* Nothing found. */
/* Find a transliteration for WCH, starting with the LC_CTYPE data of
   LOCALE.

   LOCALE must not be NULL (asserted).  If the locale's ctype has its
   own translit list, find_translit2 is tried.  After that the
   translit_include list is walked as long as no result has been
   found; for each entry the function calls itself with the locale
   obtained from find_locale (CTYPE_LOCALE, ...).

   NOTE(review): the conditions around the include walk and the final
   return are not visible here -- presumably RESULT is returned.  */
1793 find_translit (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
1796 struct locale_ctype_t
*ctype
;
1797 uint32_t *result
= NULL
;
1799 assert (locale
!= NULL
);
1800 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
1805 if (ctype
->translit
!= NULL
)
1806 result
= find_translit2 (ctype
, charmap
, wch
);
1810 struct translit_include_t
*irunp
= ctype
->translit_include
;
/* Stop at the first included locale that yields a result.  */
1812 while (irunp
!= NULL
&& result
== NULL
)
1814 result
= find_translit (find_locale (CTYPE_LOCALE
,
1816 irunp
->copy_repertoire
,
1819 irunp
= irunp
->next
;
/* Convert the token NOW into a wide character string (uint32_t
   elements).

   tok_default_missing is handled as a special case.  tok_bsymbol,
   tok_ucs4 and tok_charcode each produce a newly allocated (xmalloc)
   array of two uint32_t whose first element is the character value,
   obtained from the repertoire, from now->val.ucs4, or through
   charmap_find_symbol plus a lazy repertoire_find_value respectively.
   tok_string uses now->val.str.startwc directly.  Any other token is a
   syntax error: the rest of the line is skipped unless the token is
   tok_eol or tok_eof, SYNTAX_ERROR is raised and (uint32_t *) -1l is
   returned.

   NOTE(review): the remaining return statements are not visible here;
   presumably NULL is returned when no UCS4 value can be determined and
   the second array element is set to zero -- confirm.  */
1827 /* Read one transliteration entry. */
1829 read_widestring (struct linereader
*ldfile
, struct token
*now
,
1830 const struct charmap_t
*charmap
,
1831 struct repertoire_t
*repertoire
)
1835 if (now
->tok
== tok_default_missing
)
1836 /* The special name "" will denote this case. */
1838 else if (now
->tok
== tok_bsymbol
)
1840 /* Get the value from the repertoire. */
1841 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1842 wstr
[0] = repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1843 now
->val
.str
.lenmb
);
1844 if (wstr
[0] == ILLEGAL_CHAR_VALUE
)
1846 /* We cannot proceed, we don't know the UCS4 value. */
1853 else if (now
->tok
== tok_ucs4
)
1855 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1856 wstr
[0] = now
->val
.ucs4
;
1859 else if (now
->tok
== tok_charcode
)
1861 /* Argh, we have to convert to the symbol name first and then to the
1863 struct charseq
*seq
= charmap_find_symbol (charmap
,
1864 now
->val
.str
.startmb
,
1865 now
->val
.str
.lenmb
);
1867 /* Cannot find the UCS4 value. */
1870 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1871 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1872 strlen (seq
->name
));
1873 if (seq
->ucs4
== ILLEGAL_CHAR_VALUE
)
1874 /* We cannot proceed, we don't know the UCS4 value. */
1877 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1878 wstr
[0] = seq
->ucs4
;
1881 else if (now
->tok
== tok_string
)
/* The wide string comes straight from the token; nothing is allocated
   in this branch.  */
1883 wstr
= now
->val
.str
.startwc
;
1884 if (wstr
== NULL
|| wstr
[0] == 0)
1889 if (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1890 lr_ignore_rest (ldfile
, 0);
1891 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1892 return (uint32_t *) -1l;
/* Parse one transliteration definition and prepend it to
   ctype->translit.

   The from-string is obtained by calling read_widestring on NOW.  A
   struct translit_t is allocated on the obstack ctype->mempool and
   records the from-string plus the file name and line number of
   LDFILE.  Then tokens are read with lr_token: each to-string is
   converted with read_widestring and appended to the obstack with
   obstack_grow.  At a semicolon or end of line (after at least one
   item) the collected data is terminated with a zero element,
   finished with obstack_finish and linked in as a struct
   translit_to_t.  At end of line the entry is put at the head of
   ctype->translit.  If read_widestring reports an error
   ((uint32_t *) -1l), everything allocated since RESULT is released
   with obstack_free.

   NOTE(review): loop structure and return statements are not visible
   here -- confirm the exact handling of a NULL from- or to-string.  */
1900 read_translit_entry (struct linereader
*ldfile
, struct locale_ctype_t
*ctype
,
1901 struct token
*now
, const struct charmap_t
*charmap
,
1902 struct repertoire_t
*repertoire
)
1904 uint32_t *from_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1905 struct translit_t
*result
;
1906 struct translit_to_t
**top
;
1907 struct obstack
*ob
= &ctype
->mempool
;
1911 if (from_wstr
== NULL
)
1912 /* There is no valid from string. */
1915 result
= (struct translit_t
*) obstack_alloc (ob
,
1916 sizeof (struct translit_t
));
1917 result
->from
= from_wstr
;
1918 result
->fname
= ldfile
->fname
;
1919 result
->lineno
= ldfile
->lineno
;
1920 result
->next
= NULL
;
1930 /* Next we have one or more transliterations. They are
1931 separated by semicolons. */
1932 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
1934 if (!first
&& (now
->tok
== tok_semicolon
|| now
->tok
== tok_eol
))
1936 /* One string read. */
1937 const uint32_t zero
= 0;
/* Terminate the collected characters with a zero element (4 bytes,
   the size of one uint32_t) before finishing the object.  */
1941 obstack_grow (ob
, &zero
, 4);
1942 to_wstr
= obstack_finish (ob
);
1944 *top
= obstack_alloc (ob
, sizeof (struct translit_to_t
));
1945 (*top
)->str
= to_wstr
;
1946 (*top
)->next
= NULL
;
1949 if (now
->tok
== tok_eol
)
1951 result
->next
= ctype
->translit
;
1952 ctype
->translit
= result
;
1957 top
= &(*top
)->next
;
1962 to_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1963 if (to_wstr
== (uint32_t *) -1l)
1965 /* An error occurred. */
1966 obstack_free (ob
, result
);
1970 if (to_wstr
== NULL
)
/* NOTE(review): the cast to wchar_t * and the factor 4 assume that
   wchar_t is a 32-bit type like uint32_t -- confirm for all supported
   hosts.  */
1973 /* This value is usable. */
1974 obstack_grow (ob
, to_wstr
, wcslen ((wchar_t *) to_wstr
) * 4);
/* Parse the operands of a `translit_ignore' definition and record them
   in ctype->translit_ignore.

   Each item must be a tok_bsymbol or a tok_ucs4 token.  A UCS4 token
   supplies its value directly; a symbol is resolved with
   repertoire_find_value and an ILLEGAL_CHAR_VALUE result is reported
   as an invalid character name.  For every item a struct
   translit_ignore_t is allocated on the obstack ctype->mempool and put
   at the head of ctype->translit_ignore.  An item may be followed by
   tok_ellipsis2 or tok_ellipsis2_2 (step 1 or 2) and a second
   character giving the end of a range; a to-value smaller than the
   from-value is reported through lr_error.  Items are separated by
   semicolons and the list ends at tok_eol or tok_eof; anything else is
   a syntax error and the rest of the line is skipped.

   NOTE(review): the assignments to the new entry's from/to/step
   members are not visible here -- confirm.  */
1983 read_translit_ignore_entry (struct linereader
*ldfile
,
1984 struct locale_ctype_t
*ctype
,
1985 const struct charmap_t
*charmap
,
1986 struct repertoire_t
*repertoire
)
1988 /* We expect a semicolon-separated list of characters we ignore. We are
1989 only interested in the wide character definitions. These must be
1990 single characters, possibly defining a range when an ellipsis is used. */
1993 struct token
*now
= lr_token (ldfile
, charmap
, NULL
, repertoire
,
1995 struct translit_ignore_t
*newp
;
1998 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2001 _("premature end of `translit_ignore' definition"));
2005 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2007 lr_error (ldfile
, _("syntax error"));
2008 lr_ignore_rest (ldfile
, 0);
2012 if (now
->tok
== tok_ucs4
)
2013 from
= now
->val
.ucs4
;
2015 /* Try to get the value. */
2016 from
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2017 now
->val
.str
.lenmb
);
/* NOTE(review): unlike the other messages in this function, this one
   is not wrapped in _() -- confirm whether that is intended.  */
2019 if (from
== ILLEGAL_CHAR_VALUE
)
2021 lr_error (ldfile
, "invalid character name");
2026 newp
= (struct translit_ignore_t
*)
2027 obstack_alloc (&ctype
->mempool
, sizeof (struct translit_ignore_t
));
/* The new entry becomes the head of the list.  */
2032 newp
->next
= ctype
->translit_ignore
;
2033 ctype
->translit_ignore
= newp
;
2036 /* Now we expect either a semicolon, an ellipsis, or the end of the
2038 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2040 if (now
->tok
== tok_ellipsis2
|| now
->tok
== tok_ellipsis2_2
)
2042 /* XXX Should we bother implementing `....'? `...' certainly
2043 will not be implemented. */
2045 int step
= now
->tok
== tok_ellipsis2_2
? 2 : 1;
2047 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2049 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2052 _("premature end of `translit_ignore' definition"));
2056 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2058 lr_error (ldfile
, _("syntax error"));
2059 lr_ignore_rest (ldfile
, 0);
2063 if (now
->tok
== tok_ucs4
)
2066 /* Try to get the value. */
2067 to
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2068 now
->val
.str
.lenmb
);
2070 if (to
== ILLEGAL_CHAR_VALUE
)
2071 lr_error (ldfile
, "invalid character name");
2074 /* Make sure the `to'-value is larger. */
2081 lr_error (ldfile
, _("\
2082 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2083 (to
| from
) < 65536 ? 4 : 8, to
,
2084 (to
| from
) < 65536 ? 4 : 8, from
);
2087 /* And the next token. */
2088 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2091 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2095 if (now
->tok
== tok_semicolon
)
2099 /* If we come here something is wrong. */
2100 lr_error (ldfile
, _("syntax error"));
2101 lr_ignore_rest (ldfile
, 0);
2107 /* The parser for the LC_CTYPE section of the locale definition. */
2109 ctype_read (struct linereader
*ldfile
, struct localedef_t
*result
,
2110 const struct charmap_t
*charmap
, const char *repertoire_name
,
2113 struct repertoire_t
*repertoire
= NULL
;
2114 struct locale_ctype_t
*ctype
;
2116 enum token_t nowtok
;
2118 uint32_t last_wch
= 0;
2119 enum token_t last_token
;
2120 enum token_t ellipsis_token
;
2122 char last_charcode
[16];
2123 size_t last_charcode_len
= 0;
2124 const char *last_str
= NULL
;
2126 struct localedef_t
*copy_locale
= NULL
;
2128 /* Get the repertoire we have to use. */
2129 if (repertoire_name
!= NULL
)
2130 repertoire
= repertoire_read (repertoire_name
);
2132 /* The rest of the line containing `LC_CTYPE' must be free. */
2133 lr_ignore_rest (ldfile
, 1);
2138 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2141 while (nowtok
== tok_eol
);
2143 /* If we see `copy' now we are almost done. */
2144 if (nowtok
== tok_copy
)
2146 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2147 if (now
->tok
!= tok_string
)
2149 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2153 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2154 while (now
->tok
!= tok_eof
&& now
->tok
!= tok_end
);
2156 if (now
->tok
!= tok_eof
2157 || (now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
),
2158 now
->tok
== tok_eof
))
2159 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2160 else if (now
->tok
!= tok_lc_ctype
)
2162 lr_error (ldfile
, _("\
2163 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2164 lr_ignore_rest (ldfile
, 0);
2167 lr_ignore_rest (ldfile
, 1);
2172 if (! ignore_content
)
2174 /* Get the locale definition. */
2175 copy_locale
= load_locale (LC_CTYPE
, now
->val
.str
.startmb
,
2176 repertoire_name
, charmap
, NULL
);
2177 if ((copy_locale
->avail
& CTYPE_LOCALE
) == 0)
2179 /* Not yet loaded. So do it now. */
2180 if (locfile_read (copy_locale
, charmap
) != 0)
2184 if (copy_locale
->categories
[LC_CTYPE
].ctype
== NULL
)
2188 lr_ignore_rest (ldfile
, 1);
2190 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2194 /* Prepare the data structures. */
2195 ctype_startup (ldfile
, result
, charmap
, copy_locale
, ignore_content
);
2196 ctype
= result
->categories
[LC_CTYPE
].ctype
;
2198 /* Remember the repertoire we use. */
2199 if (!ignore_content
)
2200 ctype
->repertoire
= repertoire
;
2204 unsigned long int class_bit
= 0;
2205 unsigned long int class256_bit
= 0;
2206 int handle_digits
= 0;
2208 /* Of course we don't proceed beyond the end of file. */
2209 if (nowtok
== tok_eof
)
2212 /* Ignore empty lines. */
2213 if (nowtok
== tok_eol
)
2215 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2223 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2224 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2226 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2227 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2228 if (now
->tok
!= tok_semicolon
)
2230 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2232 if (now
->tok
!= tok_eol
)
2234 %s: syntax error in definition of new character class"), "LC_CTYPE");
2238 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2239 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2241 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2242 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2243 if (now
->tok
!= tok_semicolon
)
2245 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2247 if (now
->tok
!= tok_eol
)
2249 %s: syntax error in definition of new character map"), "LC_CTYPE");
2253 /* Ignore the rest of the line if we don't need the input of
2257 lr_ignore_rest (ldfile
, 0);
2261 /* We simply forget the `class' keyword and use the following
2262 operand to determine the bit. */
2263 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2264 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2266 /* Must be one of the predefined class names. */
2267 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2268 if (strcmp (ctype
->classnames
[cnt
], now
->val
.str
.startmb
) == 0)
2270 if (cnt
>= ctype
->nr_charclass
)
2272 #ifdef PREDEFINED_CLASSES
2273 if (now
->val
.str
.lenmb
== 8
2274 && memcmp ("special1", now
->val
.str
.startmb
, 8) == 0)
2275 class_bit
= _ISwspecial1
;
2276 else if (now
->val
.str
.lenmb
== 8
2277 && memcmp ("special2", now
->val
.str
.startmb
, 8) == 0)
2278 class_bit
= _ISwspecial2
;
2279 else if (now
->val
.str
.lenmb
== 8
2280 && memcmp ("special3", now
->val
.str
.startmb
, 8) == 0)
2281 class_bit
= _ISwspecial3
;
2285 /* OK, it's a new class. */
2286 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2288 class_bit
= _ISwbit (ctype
->nr_charclass
- 1);
2293 class_bit
= _ISwbit (cnt
);
2295 free (now
->val
.str
.startmb
);
2298 else if (now
->tok
== tok_digit
)
2299 goto handle_tok_digit
;
2300 else if (now
->tok
< tok_upper
|| now
->tok
> tok_blank
)
2304 class_bit
= BITw (now
->tok
);
2305 class256_bit
= BIT (now
->tok
);
2308 /* The next character must be a semicolon. */
2309 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2310 if (now
->tok
!= tok_semicolon
)
2312 goto read_charclass
;
2325 /* Ignore the rest of the line if we don't need the input of
2329 lr_ignore_rest (ldfile
, 0);
2333 class_bit
= BITw (now
->tok
);
2334 class256_bit
= BIT (now
->tok
);
2337 ctype
->class_done
|= class_bit
;
2338 last_token
= tok_none
;
2339 ellipsis_token
= tok_none
;
2341 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2342 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2345 struct charseq
*seq
;
2347 if (ellipsis_token
== tok_none
)
2349 if (get_character (now
, charmap
, repertoire
, &seq
, &wch
))
2352 if (!ignore_content
&& seq
!= NULL
&& seq
->nbytes
== 1)
2353 /* Yep, we can store information about this byte
2355 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
2357 if (!ignore_content
&& wch
!= ILLEGAL_CHAR_VALUE
2359 /* We have the UCS4 position. */
2360 *find_idx (ctype
, &ctype
->class_collection
,
2361 &ctype
->class_collection_max
,
2362 &ctype
->class_collection_act
, wch
) |= class_bit
;
2364 last_token
= now
->tok
;
2365 /* Terminate the string. */
2366 if (last_token
== tok_bsymbol
)
2368 now
->val
.str
.startmb
[now
->val
.str
.lenmb
] = '\0';
2369 last_str
= now
->val
.str
.startmb
;
2374 memcpy (last_charcode
, now
->val
.charcode
.bytes
, 16);
2375 last_charcode_len
= now
->val
.charcode
.nbytes
;
2377 if (!ignore_content
&& handle_digits
== 1)
2379 /* We must store the digit values. */
2380 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
2382 ctype
->mbdigits_max
+= 10;
2383 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
2384 (ctype
->mbdigits_max
2385 * sizeof (char *)));
2386 ctype
->wcdigits_max
+= 10;
2387 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
2388 (ctype
->wcdigits_max
2389 * sizeof (uint32_t)));
2392 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
2393 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
2395 else if (!ignore_content
&& handle_digits
== 2)
2397 /* We must store the digit values. */
2398 if (ctype
->outdigits_act
>= 10)
2400 lr_error (ldfile
, _("\
2401 %s: field `%s' does not contain exactly ten entries"),
2402 "LC_CTYPE", "outdigit");
2403 lr_ignore_rest (ldfile
, 0);
2407 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
2408 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
2409 ++ctype
->outdigits_act
;
2414 /* Now it gets complicated. We have to resolve the
2415 ellipsis problem. First we must distinguish between
2416 the different kind of ellipsis and this must match the
2417 tokens we have seen. */
2418 assert (last_token
!= tok_none
);
2420 if (last_token
!= now
->tok
)
2422 lr_error (ldfile
, _("\
2423 ellipsis range must be marked by two operands of same type"));
2424 lr_ignore_rest (ldfile
, 0);
2428 if (last_token
== tok_bsymbol
)
2430 if (ellipsis_token
== tok_ellipsis3
)
2431 lr_error (ldfile
, _("with symbolic name range values \
2432 the absolute ellipsis `...' must not be used"));
2434 charclass_symbolic_ellipsis (ldfile
, ctype
, charmap
,
2435 repertoire
, now
, last_str
,
2436 class256_bit
, class_bit
,
2441 handle_digits
, step
);
2443 else if (last_token
== tok_ucs4
)
2445 if (ellipsis_token
!= tok_ellipsis2
)
2446 lr_error (ldfile
, _("\
2447 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2449 charclass_ucs4_ellipsis (ldfile
, ctype
, charmap
,
2450 repertoire
, now
, last_wch
,
2451 class256_bit
, class_bit
,
2452 ignore_content
, handle_digits
,
2457 assert (last_token
== tok_charcode
);
2459 if (ellipsis_token
!= tok_ellipsis3
)
2460 lr_error (ldfile
, _("\
2461 with character code range values one must use the absolute ellipsis `...'"));
2463 charclass_charcode_ellipsis (ldfile
, ctype
, charmap
,
2467 class256_bit
, class_bit
,
2472 /* Now we have used the last value. */
2473 last_token
= tok_none
;
2476 /* Next we expect a semicolon or the end of the line. */
2477 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2478 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2481 if (last_token
!= tok_none
2482 && now
->tok
>= tok_ellipsis2
&& now
->tok
<= tok_ellipsis4_2
)
2484 if (now
->tok
== tok_ellipsis2_2
)
2486 now
->tok
= tok_ellipsis2
;
2489 else if (now
->tok
== tok_ellipsis4_2
)
2491 now
->tok
= tok_ellipsis4
;
2495 ellipsis_token
= now
->tok
;
2497 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2501 if (now
->tok
!= tok_semicolon
)
2504 /* And get the next character. */
2505 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2507 ellipsis_token
= tok_none
;
2513 /* Ignore the rest of the line if we don't need the input of
2517 lr_ignore_rest (ldfile
, 0);
2522 class_bit
= _ISwdigit
;
2523 class256_bit
= _ISdigit
;
2525 goto read_charclass
;
2528 /* Ignore the rest of the line if we don't need the input of
2532 lr_ignore_rest (ldfile
, 0);
2536 if (ctype
->outdigits_act
!= 0)
2537 lr_error (ldfile
, _("\
2538 %s: field `%s' declared more than once"),
2539 "LC_CTYPE", "outdigit");
2543 goto read_charclass
;
2546 /* Ignore the rest of the line if we don't need the input of
2550 lr_ignore_rest (ldfile
, 0);
2558 /* Ignore the rest of the line if we don't need the input of
2562 lr_ignore_rest (ldfile
, 0);
2570 /* Ignore the rest of the line if we don't need the input of
2574 lr_ignore_rest (ldfile
, 0);
2578 /* We simply forget the `map' keyword and use the following
2579 operand to determine the mapping. */
2580 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2581 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2585 for (cnt
= 2; cnt
< ctype
->map_collection_nr
; ++cnt
)
2586 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2589 if (cnt
< ctype
->map_collection_nr
)
2590 free (now
->val
.str
.startmb
);
2592 /* OK, it's a new map. */
2593 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2597 else if (now
->tok
< tok_toupper
|| now
->tok
> tok_tolower
)
2600 mapidx
= now
->tok
- tok_toupper
;
2602 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2603 /* This better should be a semicolon. */
2604 if (now
->tok
!= tok_semicolon
)
2608 /* Test whether this mapping was already defined. */
2609 if (ctype
->tomap_done
[mapidx
])
2611 lr_error (ldfile
, _("duplicated definition for mapping `%s'"),
2612 ctype
->mapnames
[mapidx
]);
2613 lr_ignore_rest (ldfile
, 0);
2616 ctype
->tomap_done
[mapidx
] = 1;
2618 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2619 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2621 struct charseq
*from_seq
;
2623 struct charseq
*to_seq
;
2626 /* Every pair starts with an opening brace. */
2627 if (now
->tok
!= tok_open_brace
)
2630 /* Next comes the from-value. */
2631 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2632 if (get_character (now
, charmap
, repertoire
, &from_seq
,
2636 /* The next is a comma. */
2637 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2638 if (now
->tok
!= tok_comma
)
2641 /* And the other value. */
2642 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2643 if (get_character (now
, charmap
, repertoire
, &to_seq
,
2647 /* And the last thing is the closing brace. */
2648 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2649 if (now
->tok
!= tok_close_brace
)
2652 if (!ignore_content
)
2654 /* Check whether the mapping converts from an ASCII value
2655 to a non-ASCII value. */
2656 if (from_seq
!= NULL
&& from_seq
->nbytes
== 1
2657 && isascii (from_seq
->bytes
[0])
2658 && to_seq
!= NULL
&& (to_seq
->nbytes
!= 1
2659 || !isascii (to_seq
->bytes
[0])))
2660 ctype
->to_nonascii
= 1;
2662 if (mapidx
< 2 && from_seq
!= NULL
&& to_seq
!= NULL
2663 && from_seq
->nbytes
== 1 && to_seq
->nbytes
== 1)
2664 /* We can use this value. */
2665 ctype
->map256_collection
[mapidx
][from_seq
->bytes
[0]]
2668 if (from_wch
!= ILLEGAL_CHAR_VALUE
2669 && to_wch
!= ILLEGAL_CHAR_VALUE
)
2670 /* Both correct values. */
2671 *find_idx (ctype
, &ctype
->map_collection
[mapidx
],
2672 &ctype
->map_collection_max
[mapidx
],
2673 &ctype
->map_collection_act
[mapidx
],
2677 /* Now comes a semicolon or the end of the line/file. */
2678 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2679 if (now
->tok
== tok_semicolon
)
2680 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2684 case tok_translit_start
:
2685 /* Ignore the entire translit section with its peculiar syntax
2686 if we don't need the input. */
2691 lr_ignore_rest (ldfile
, 0);
2692 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2694 while (now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
);
2696 if (now
->tok
== tok_eof
)
2697 lr_error (ldfile
, _(\
2698 "%s: `translit_start' section does not end with `translit_end'"),
2704 /* The rest of the line better should be empty. */
2705 lr_ignore_rest (ldfile
, 1);
2707 /* We count here the number of allocated entries in the `translit'
2711 ldfile
->translate_strings
= 1;
2712 ldfile
->return_widestr
= 1;
2714 /* We proceed until we see the `translit_end' token. */
2715 while (now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
),
2716 now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
)
2718 if (now
->tok
== tok_eol
)
2719 /* Ignore empty lines. */
2722 if (now
->tok
== tok_include
)
2724 /* We have to include locale. */
2725 const char *locale_name
;
2726 const char *repertoire_name
;
2727 struct translit_include_t
*include_stmt
, **include_ptr
;
2729 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2730 /* This should be a string or an identifier. In any
2731 case something to name a locale. */
2732 if (now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2735 lr_error (ldfile
, _("%s: syntax error"), "LC_CTYPE");
2736 lr_ignore_rest (ldfile
, 0);
2739 locale_name
= now
->val
.str
.startmb
;
2741 /* Next should be a semicolon. */
2742 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2743 if (now
->tok
!= tok_semicolon
)
2744 goto translit_syntax
;
2746 /* Now the repertoire name. */
2747 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2748 if ((now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2749 || now
->val
.str
.startmb
== NULL
)
2750 goto translit_syntax
;
2751 repertoire_name
= now
->val
.str
.startmb
;
2752 if (repertoire_name
[0] == '\0')
2753 /* Ignore the empty string. */
2754 repertoire_name
= NULL
;
2756 /* Save the include statement for later processing. */
2757 include_stmt
= (struct translit_include_t
*)
2758 xmalloc (sizeof (struct translit_include_t
));
2759 include_stmt
->copy_locale
= locale_name
;
2760 include_stmt
->copy_repertoire
= repertoire_name
;
2761 include_stmt
->next
= NULL
;
2763 include_ptr
= &ctype
->translit_include
;
2764 while (*include_ptr
!= NULL
)
2765 include_ptr
= &(*include_ptr
)->next
;
2766 *include_ptr
= include_stmt
;
2768 /* The rest of the line must be empty. */
2769 lr_ignore_rest (ldfile
, 1);
2771 /* Make sure the locale is read. */
2772 add_to_readlist (LC_CTYPE
, locale_name
, repertoire_name
,
2776 else if (now
->tok
== tok_default_missing
)
2782 /* We expect a single character or string as the
2784 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2785 wstr
= read_widestring (ldfile
, now
, charmap
,
2790 if (ctype
->default_missing
!= NULL
)
2792 lr_error (ldfile
, _("\
2793 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2794 WITH_CUR_LOCALE (error_at_line (0, 0,
2795 ctype
->default_missing_file
,
2796 ctype
->default_missing_lineno
,
2798 previous definition was here")));
2802 ctype
->default_missing
= wstr
;
2803 ctype
->default_missing_file
= ldfile
->fname
;
2804 ctype
->default_missing_lineno
= ldfile
->lineno
;
2806 /* We can have more entries, ignore them. */
2807 lr_ignore_rest (ldfile
, 0);
2810 else if (wstr
== (uint32_t *) -1l)
2811 /* This was an syntax error. */
2814 /* Maybe there is another replacement we can use. */
2815 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2816 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2818 /* Nothing found. We tell the user. */
2819 lr_error (ldfile
, _("\
2820 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2823 if (now
->tok
!= tok_semicolon
)
2824 goto translit_syntax
;
2829 else if (now
->tok
== tok_translit_ignore
)
2831 read_translit_ignore_entry (ldfile
, ctype
, charmap
,
2836 read_translit_entry (ldfile
, ctype
, now
, charmap
, repertoire
);
2838 ldfile
->return_widestr
= 0;
2840 if (now
->tok
== tok_eof
)
2841 lr_error (ldfile
, _(\
2842 "%s: `translit_start' section does not end with `translit_end'"),
2848 /* Ignore the rest of the line if we don't need the input of
2852 lr_ignore_rest (ldfile
, 0);
2856 /* This could mean one of several things. First test whether
2857 it's a character class name. */
2858 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2859 if (strcmp (now
->val
.str
.startmb
, ctype
->classnames
[cnt
]) == 0)
2861 if (cnt
< ctype
->nr_charclass
)
2863 class_bit
= _ISwbit (cnt
);
2864 class256_bit
= cnt
<= 11 ? _ISbit (cnt
) : 0;
2865 free (now
->val
.str
.startmb
);
2866 goto read_charclass
;
2868 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
2869 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2871 if (cnt
< ctype
->map_collection_nr
)
2874 free (now
->val
.str
.startmb
);
2877 #ifdef PREDEFINED_CLASSES
2878 if (strcmp (now
->val
.str
.startmb
, "special1") == 0)
2880 class_bit
= _ISwspecial1
;
2881 free (now
->val
.str
.startmb
);
2882 goto read_charclass
;
2884 if (strcmp (now
->val
.str
.startmb
, "special2") == 0)
2886 class_bit
= _ISwspecial2
;
2887 free (now
->val
.str
.startmb
);
2888 goto read_charclass
;
2890 if (strcmp (now
->val
.str
.startmb
, "special3") == 0)
2892 class_bit
= _ISwspecial3
;
2893 free (now
->val
.str
.startmb
);
2894 goto read_charclass
;
2896 if (strcmp (now
->val
.str
.startmb
, "tosymmetric") == 0)
2905 /* Next we assume `LC_CTYPE'. */
2906 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2907 if (now
->tok
== tok_eof
)
2909 if (now
->tok
== tok_eol
)
2910 lr_error (ldfile
, _("%s: incomplete `END' line"),
2912 else if (now
->tok
!= tok_lc_ctype
)
2913 lr_error (ldfile
, _("\
2914 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2915 lr_ignore_rest (ldfile
, now
->tok
== tok_lc_ctype
);
2920 if (now
->tok
!= tok_eof
)
2921 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2924 /* Prepare for the next round. */
2925 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2929 /* When we come here we reached the end of the file. */
2930 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2934 /* Subroutine of set_class_defaults, below. */
2936 set_one_default (struct locale_ctype_t
*ctype
,
2937 const struct charmap_t
*charmap
,
2938 int bitpos
, int from
, int to
)
2942 int bit
= _ISbit (bitpos
);
2943 int bitw
= _ISwbit (bitpos
);
2944 /* Define string. */
2947 for (ch
= from
; ch
<= to
; ++ch
)
2949 struct charseq
*seq
;
2952 seq
= charmap_find_value (charmap
, tmp
, 1);
2956 sprintf (buf
, "U%08X", ch
);
2957 seq
= charmap_find_value (charmap
, buf
, 9);
2962 WITH_CUR_LOCALE (error (0, 0, _("\
2963 %s: character `%s' not defined while needed as default value"),
2966 else if (seq
->nbytes
!= 1)
2967 WITH_CUR_LOCALE (error (0, 0, _("\
2968 %s: character `%s' in charmap not representable with one byte"),
2971 ctype
->class256_collection
[seq
->bytes
[0]] |= bit
;
2973 /* No need to search here, the ASCII value is also the Unicode
2975 ELEM (ctype
, class_collection
, , ch
) |= bitw
;
2980 set_class_defaults (struct locale_ctype_t
*ctype
,
2981 const struct charmap_t
*charmap
,
2982 struct repertoire_t
*repertoire
)
2984 #define set_default(bitpos, from, to) \
2985 set_one_default (ctype, charmap, bitpos, from, to)
2987 /* These function defines the default values for the classes and conversions
2988 according to POSIX.2 2.5.2.1.
2989 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2990 Don't move them unless you know what you do! */
2992 /* Set default values if keyword was not present. */
2993 if ((ctype
->class_done
& BITw (tok_upper
)) == 0)
2994 /* "If this keyword [lower] is not specified, the lowercase letters
2995 `A' through `Z', ..., shall automatically belong to this class,
2996 with implementation defined character values." [P1003.2, 2.5.2.1] */
2997 set_default (BITPOS (tok_upper
), 'A', 'Z');
2999 if ((ctype
->class_done
& BITw (tok_lower
)) == 0)
3000 /* "If this keyword [lower] is not specified, the lowercase letters
3001 `a' through `z', ..., shall automatically belong to this class,
3002 with implementation defined character values." [P1003.2, 2.5.2.1] */
3003 set_default (BITPOS (tok_lower
), 'a', 'z');
3005 if ((ctype
->class_done
& BITw (tok_alpha
)) == 0)
3007 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3008 class `lower' *must* be in class `alpha'. */
3009 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
3010 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
);
3012 for (size_t cnt
= 0; cnt
< 256; ++cnt
)
3013 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3014 ctype
->class256_collection
[cnt
] |= BIT (tok_alpha
);
3016 for (size_t cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3017 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3018 ctype
->class_collection
[cnt
] |= BITw (tok_alpha
);
3021 if ((ctype
->class_done
& BITw (tok_digit
)) == 0)
3022 /* "If this keyword [digit] is not specified, the digits `0' through
3023 `9', ..., shall automatically belong to this class, with
3024 implementation-defined character values." [P1003.2, 2.5.2.1] */
3025 set_default (BITPOS (tok_digit
), '0', '9');
3027 /* "Only characters specified for the `alpha' and `digit' keyword
3028 shall be specified. Characters specified for the keyword `alpha'
3029 and `digit' are automatically included in this class. */
3031 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
3032 unsigned long int maskw
= BITw (tok_alpha
) | BITw (tok_digit
);
3034 for (size_t cnt
= 0; cnt
< 256; ++cnt
)
3035 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3036 ctype
->class256_collection
[cnt
] |= BIT (tok_alnum
);
3038 for (size_t cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3039 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3040 ctype
->class_collection
[cnt
] |= BITw (tok_alnum
);
3043 if ((ctype
->class_done
& BITw (tok_space
)) == 0)
3044 /* "If this keyword [space] is not specified, the characters <space>,
3045 <form-feed>, <newline>, <carriage-return>, <tab>, and
3046 <vertical-tab>, ..., shall automatically belong to this class,
3047 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3049 struct charseq
*seq
;
3051 seq
= charmap_find_value (charmap
, "space", 5);
3053 seq
= charmap_find_value (charmap
, "SP", 2);
3055 seq
= charmap_find_value (charmap
, "U00000020", 9);
3059 WITH_CUR_LOCALE (error (0, 0, _("\
3060 %s: character `%s' not defined while needed as default value"),
3061 "LC_CTYPE", "<space>"));
3063 else if (seq
->nbytes
!= 1)
3064 WITH_CUR_LOCALE (error (0, 0, _("\
3065 %s: character `%s' in charmap not representable with one byte"),
3066 "LC_CTYPE", "<space>"));
3068 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3070 /* No need to search. */
3071 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_space
);
3073 seq
= charmap_find_value (charmap
, "form-feed", 9);
3075 seq
= charmap_find_value (charmap
, "U0000000C", 9);
3079 WITH_CUR_LOCALE (error (0, 0, _("\
3080 %s: character `%s' not defined while needed as default value"),
3081 "LC_CTYPE", "<form-feed>"));
3083 else if (seq
->nbytes
!= 1)
3084 WITH_CUR_LOCALE (error (0, 0, _("\
3085 %s: character `%s' in charmap not representable with one byte"),
3086 "LC_CTYPE", "<form-feed>"));
3088 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3090 /* No need to search. */
3091 ELEM (ctype
, class_collection
, , L
'\f') |= BITw (tok_space
);
3094 seq
= charmap_find_value (charmap
, "newline", 7);
3096 seq
= charmap_find_value (charmap
, "U0000000A", 9);
3100 WITH_CUR_LOCALE (error (0, 0, _("\
3101 %s: character `%s' not defined while needed as default value"),
3102 "LC_CTYPE", "<newline>"));
3104 else if (seq
->nbytes
!= 1)
3105 WITH_CUR_LOCALE (error (0, 0, _("\
3106 %s: character `%s' in charmap not representable with one byte"),
3107 "LC_CTYPE", "<newline>"));
3109 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3111 /* No need to search. */
3112 ELEM (ctype
, class_collection
, , L
'\n') |= BITw (tok_space
);
3115 seq
= charmap_find_value (charmap
, "carriage-return", 15);
3117 seq
= charmap_find_value (charmap
, "U0000000D", 9);
3121 WITH_CUR_LOCALE (error (0, 0, _("\
3122 %s: character `%s' not defined while needed as default value"),
3123 "LC_CTYPE", "<carriage-return>"));
3125 else if (seq
->nbytes
!= 1)
3126 WITH_CUR_LOCALE (error (0, 0, _("\
3127 %s: character `%s' in charmap not representable with one byte"),
3128 "LC_CTYPE", "<carriage-return>"));
3130 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3132 /* No need to search. */
3133 ELEM (ctype
, class_collection
, , L
'\r') |= BITw (tok_space
);
3136 seq
= charmap_find_value (charmap
, "tab", 3);
3138 seq
= charmap_find_value (charmap
, "U00000009", 9);
3142 WITH_CUR_LOCALE (error (0, 0, _("\
3143 %s: character `%s' not defined while needed as default value"),
3144 "LC_CTYPE", "<tab>"));
3146 else if (seq
->nbytes
!= 1)
3147 WITH_CUR_LOCALE (error (0, 0, _("\
3148 %s: character `%s' in charmap not representable with one byte"),
3149 "LC_CTYPE", "<tab>"));
3151 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3153 /* No need to search. */
3154 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_space
);
3157 seq
= charmap_find_value (charmap
, "vertical-tab", 12);
3159 seq
= charmap_find_value (charmap
, "U0000000B", 9);
3163 WITH_CUR_LOCALE (error (0, 0, _("\
3164 %s: character `%s' not defined while needed as default value"),
3165 "LC_CTYPE", "<vertical-tab>"));
3167 else if (seq
->nbytes
!= 1)
3168 WITH_CUR_LOCALE (error (0, 0, _("\
3169 %s: character `%s' in charmap not representable with one byte"),
3170 "LC_CTYPE", "<vertical-tab>"));
3172 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3174 /* No need to search. */
3175 ELEM (ctype
, class_collection
, , L
'\v') |= BITw (tok_space
);
3178 if ((ctype
->class_done
& BITw (tok_xdigit
)) == 0)
3179 /* "If this keyword is not specified, the digits `0' to `9', the
3180 uppercase letters `A' through `F', and the lowercase letters `a'
3181 through `f', ..., shell automatically belong to this class, with
3182 implementation defined character values." [P1003.2, 2.5.2.1] */
3184 set_default (BITPOS (tok_xdigit
), '0', '9');
3185 set_default (BITPOS (tok_xdigit
), 'A', 'F');
3186 set_default (BITPOS (tok_xdigit
), 'a', 'f');
3189 if ((ctype
->class_done
& BITw (tok_blank
)) == 0)
3190 /* "If this keyword [blank] is unspecified, the characters <space> and
3191 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3193 struct charseq
*seq
;
3195 seq
= charmap_find_value (charmap
, "space", 5);
3197 seq
= charmap_find_value (charmap
, "SP", 2);
3199 seq
= charmap_find_value (charmap
, "U00000020", 9);
3203 WITH_CUR_LOCALE (error (0, 0, _("\
3204 %s: character `%s' not defined while needed as default value"),
3205 "LC_CTYPE", "<space>"));
3207 else if (seq
->nbytes
!= 1)
3208 WITH_CUR_LOCALE (error (0, 0, _("\
3209 %s: character `%s' in charmap not representable with one byte"),
3210 "LC_CTYPE", "<space>"));
3212 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3214 /* No need to search. */
3215 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_blank
);
3218 seq
= charmap_find_value (charmap
, "tab", 3);
3220 seq
= charmap_find_value (charmap
, "U00000009", 9);
3224 WITH_CUR_LOCALE (error (0, 0, _("\
3225 %s: character `%s' not defined while needed as default value"),
3226 "LC_CTYPE", "<tab>"));
3228 else if (seq
->nbytes
!= 1)
3229 WITH_CUR_LOCALE (error (0, 0, _("\
3230 %s: character `%s' in charmap not representable with one byte"),
3231 "LC_CTYPE", "<tab>"));
3233 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3235 /* No need to search. */
3236 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_blank
);
3239 if ((ctype
->class_done
& BITw (tok_graph
)) == 0)
3240 /* "If this keyword [graph] is not specified, characters specified for
3241 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3242 shall belong to this character class." [P1003.2, 2.5.2.1] */
3244 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3245 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3246 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3247 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3250 for (size_t cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3251 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3252 ctype
->class_collection
[cnt
] |= BITw (tok_graph
);
3254 for (size_t cnt
= 0; cnt
< 256; ++cnt
)
3255 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3256 ctype
->class256_collection
[cnt
] |= BIT (tok_graph
);
3259 if ((ctype
->class_done
& BITw (tok_print
)) == 0)
3260 /* "If this keyword [print] is not provided, characters specified for
3261 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3262 and the <space> character shall belong to this character class."
3263 [P1003.2, 2.5.2.1] */
3265 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3266 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3267 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3268 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3270 struct charseq
*seq
;
3272 for (size_t cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3273 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3274 ctype
->class_collection
[cnt
] |= BITw (tok_print
);
3276 for (size_t cnt
= 0; cnt
< 256; ++cnt
)
3277 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3278 ctype
->class256_collection
[cnt
] |= BIT (tok_print
);
3281 seq
= charmap_find_value (charmap
, "space", 5);
3283 seq
= charmap_find_value (charmap
, "SP", 2);
3285 seq
= charmap_find_value (charmap
, "U00000020", 9);
3289 WITH_CUR_LOCALE (error (0, 0, _("\
3290 %s: character `%s' not defined while needed as default value"),
3291 "LC_CTYPE", "<space>"));
3293 else if (seq
->nbytes
!= 1)
3294 WITH_CUR_LOCALE (error (0, 0, _("\
3295 %s: character `%s' in charmap not representable with one byte"),
3296 "LC_CTYPE", "<space>"));
3298 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_print
);
3300 /* No need to search. */
3301 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_print
);
3304 if (ctype
->tomap_done
[0] == 0)
3305 /* "If this keyword [toupper] is not specified, the lowercase letters
3306 `a' through `z', and their corresponding uppercase letters `A' to
3307 `Z', ..., shall automatically be included, with implementation-
3308 defined character values." [P1003.2, 2.5.2.1] */
3313 strcpy (tmp
, "<?>");
3315 for (ch
= 'a'; ch
<= 'z'; ++ch
)
3317 struct charseq
*seq_from
, *seq_to
;
3321 seq_from
= charmap_find_value (charmap
, &tmp
[1], 1);
3322 if (seq_from
== NULL
)
3325 sprintf (buf
, "U%08X", ch
);
3326 seq_from
= charmap_find_value (charmap
, buf
, 9);
3328 if (seq_from
== NULL
)
3331 WITH_CUR_LOCALE (error (0, 0, _("\
3332 %s: character `%s' not defined while needed as default value"),
3335 else if (seq_from
->nbytes
!= 1)
3338 WITH_CUR_LOCALE (error (0, 0, _("\
3339 %s: character `%s' needed as default value not representable with one byte"),
3344 /* This conversion is implementation defined. */
3345 tmp
[1] = (char) (ch
+ ('A' - 'a'));
3346 seq_to
= charmap_find_value (charmap
, &tmp
[1], 1);
3350 sprintf (buf
, "U%08X", ch
+ ('A' - 'a'));
3351 seq_to
= charmap_find_value (charmap
, buf
, 9);
3356 WITH_CUR_LOCALE (error (0, 0, _("\
3357 %s: character `%s' not defined while needed as default value"),
3360 else if (seq_to
->nbytes
!= 1)
3363 WITH_CUR_LOCALE (error (0, 0, _("\
3364 %s: character `%s' needed as default value not representable with one byte"),
3368 /* The index [0] is determined by the order of the
3369 `ctype_map_newP' calls in `ctype_startup'. */
3370 ctype
->map256_collection
[0][seq_from
->bytes
[0]]
3374 /* No need to search. */
3375 ELEM (ctype
, map_collection
, [0], ch
) = ch
+ ('A' - 'a');
3379 if (ctype
->tomap_done
[1] == 0)
3380 /* "If this keyword [tolower] is not specified, the mapping shall be
3381 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3383 for (size_t cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
3384 if (ctype
->map_collection
[0][cnt
] != 0)
3385 ELEM (ctype
, map_collection
, [1],
3386 ctype
->map_collection
[0][cnt
])
3387 = ctype
->charnames
[cnt
];
3389 for (size_t cnt
= 0; cnt
< 256; ++cnt
)
3390 if (ctype
->map256_collection
[0][cnt
] != 0)
3391 ctype
->map256_collection
[1][ctype
->map256_collection
[0][cnt
]] = cnt
;
3394 if (ctype
->outdigits_act
!= 10)
3396 if (ctype
->outdigits_act
!= 0)
3397 WITH_CUR_LOCALE (error (0, 0, _("\
3398 %s: field `%s' does not contain exactly ten entries"),
3399 "LC_CTYPE", "outdigit"));
3401 for (size_t cnt
= ctype
->outdigits_act
; cnt
< 10; ++cnt
)
3403 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3404 (char *) digits
+ cnt
,
3407 if (ctype
->mboutdigits
[cnt
] == NULL
)
3408 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3410 strlen (longnames
[cnt
]));
3412 if (ctype
->mboutdigits
[cnt
] == NULL
)
3413 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3416 if (ctype
->mboutdigits
[cnt
] == NULL
)
3418 /* Provide a replacement. */
3419 WITH_CUR_LOCALE (error (0, 0, _("\
3420 no output digits defined and none of the standard names in the charmap")));
3422 ctype
->mboutdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
3423 sizeof (struct charseq
)
3426 /* This is better than nothing. */
3427 ctype
->mboutdigits
[cnt
]->bytes
[0] = digits
[cnt
];
3428 ctype
->mboutdigits
[cnt
]->nbytes
= 1;
3431 ctype
->wcoutdigits
[cnt
] = L
'0' + cnt
;
3434 ctype
->outdigits_act
= 10;
3441 /* Initialize. Assumes t->p and t->q have already been set. */
3443 wctype_table_init (struct wctype_table
*t
)
3446 t
->level1_alloc
= t
->level1_size
= 0;
3448 t
->level2_alloc
= t
->level2_size
= 0;
3450 t
->level3_alloc
= t
->level3_size
= 0;
3453 /* Retrieve an entry. */
3455 wctype_table_get (struct wctype_table
*t
, uint32_t wc
)
3457 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3458 if (index1
< t
->level1_size
)
3460 uint32_t lookup1
= t
->level1
[index1
];
3461 if (lookup1
!= EMPTY
)
3463 uint32_t index2
= ((wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1))
3464 + (lookup1
<< t
->q
);
3465 uint32_t lookup2
= t
->level2
[index2
];
3466 if (lookup2
!= EMPTY
)
3468 uint32_t index3
= ((wc
>> 5) & ((1 << t
->p
) - 1))
3469 + (lookup2
<< t
->p
);
3470 uint32_t lookup3
= t
->level3
[index3
];
3471 uint32_t index4
= wc
& 0x1f;
3473 return (lookup3
>> index4
) & 1;
3480 /* Add one entry. */
3482 wctype_table_add (struct wctype_table
*t
, uint32_t wc
)
3484 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3485 uint32_t index2
= (wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1);
3486 uint32_t index3
= (wc
>> 5) & ((1 << t
->p
) - 1);
3487 uint32_t index4
= wc
& 0x1f;
3490 if (index1
>= t
->level1_size
)
3492 if (index1
>= t
->level1_alloc
)
3494 size_t alloc
= 2 * t
->level1_alloc
;
3495 if (alloc
<= index1
)
3497 t
->level1
= (uint32_t *) xrealloc ((char *) t
->level1
,
3498 alloc
* sizeof (uint32_t));
3499 t
->level1_alloc
= alloc
;
3501 while (index1
>= t
->level1_size
)
3502 t
->level1
[t
->level1_size
++] = EMPTY
;
3505 if (t
->level1
[index1
] == EMPTY
)
3507 if (t
->level2_size
== t
->level2_alloc
)
3509 size_t alloc
= 2 * t
->level2_alloc
+ 1;
3510 t
->level2
= (uint32_t *) xrealloc ((char *) t
->level2
,
3511 (alloc
<< t
->q
) * sizeof (uint32_t));
3512 t
->level2_alloc
= alloc
;
3514 i1
= t
->level2_size
<< t
->q
;
3515 i2
= (t
->level2_size
+ 1) << t
->q
;
3516 for (i
= i1
; i
< i2
; i
++)
3517 t
->level2
[i
] = EMPTY
;
3518 t
->level1
[index1
] = t
->level2_size
++;
3521 index2
+= t
->level1
[index1
] << t
->q
;
3523 if (t
->level2
[index2
] == EMPTY
)
3525 if (t
->level3_size
== t
->level3_alloc
)
3527 size_t alloc
= 2 * t
->level3_alloc
+ 1;
3528 t
->level3
= (uint32_t *) xrealloc ((char *) t
->level3
,
3529 (alloc
<< t
->p
) * sizeof (uint32_t));
3530 t
->level3_alloc
= alloc
;
3532 i1
= t
->level3_size
<< t
->p
;
3533 i2
= (t
->level3_size
+ 1) << t
->p
;
3534 for (i
= i1
; i
< i2
; i
++)
3536 t
->level2
[index2
] = t
->level3_size
++;
3539 index3
+= t
->level2
[index2
] << t
->p
;
3541 t
->level3
[index3
] |= (uint32_t)1 << index4
;
3544 /* Finalize and shrink. */
3546 add_locale_wctype_table (struct locale_file
*file
, struct wctype_table
*t
)
3549 uint32_t reorder3
[t
->level3_size
];
3550 uint32_t reorder2
[t
->level2_size
];
3551 uint32_t level2_offset
, level3_offset
;
3553 /* Uniquify level3 blocks. */
3555 for (j
= 0; j
< t
->level3_size
; j
++)
3557 for (i
= 0; i
< k
; i
++)
3558 if (memcmp (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3559 (1 << t
->p
) * sizeof (uint32_t)) == 0)
3561 /* Relocate block j to block i. */
3566 memcpy (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3567 (1 << t
->p
) * sizeof (uint32_t));
3573 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3574 if (t
->level2
[i
] != EMPTY
)
3575 t
->level2
[i
] = reorder3
[t
->level2
[i
]];
3577 /* Uniquify level2 blocks. */
3579 for (j
= 0; j
< t
->level2_size
; j
++)
3581 for (i
= 0; i
< k
; i
++)
3582 if (memcmp (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3583 (1 << t
->q
) * sizeof (uint32_t)) == 0)
3585 /* Relocate block j to block i. */
3590 memcpy (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3591 (1 << t
->q
) * sizeof (uint32_t));
3597 for (i
= 0; i
< t
->level1_size
; i
++)
3598 if (t
->level1
[i
] != EMPTY
)
3599 t
->level1
[i
] = reorder2
[t
->level1
[i
]];
3602 5 * sizeof (uint32_t)
3603 + t
->level1_size
* sizeof (uint32_t)
3604 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t)
3605 + (t
->level3_size
<< t
->p
) * sizeof (uint32_t);
3608 5 * sizeof (uint32_t)
3609 + t
->level1_size
* sizeof (uint32_t);
3611 5 * sizeof (uint32_t)
3612 + t
->level1_size
* sizeof (uint32_t)
3613 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t);
3615 start_locale_structure (file
);
3616 add_locale_uint32 (file
, t
->q
+ t
->p
+ 5);
3617 add_locale_uint32 (file
, t
->level1_size
);
3618 add_locale_uint32 (file
, t
->p
+ 5);
3619 add_locale_uint32 (file
, (1 << t
->q
) - 1);
3620 add_locale_uint32 (file
, (1 << t
->p
) - 1);
3622 for (i
= 0; i
< t
->level1_size
; i
++)
3625 t
->level1
[i
] == EMPTY
3627 : (t
->level1
[i
] << t
->q
) * sizeof (uint32_t) + level2_offset
);
3629 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3632 t
->level2
[i
] == EMPTY
3634 : (t
->level2
[i
] << t
->p
) * sizeof (uint32_t) + level3_offset
);
3636 add_locale_uint32_array (file
, t
->level3
, t
->level3_size
<< t
->p
);
3637 end_locale_structure (file
);
3639 if (t
->level1_alloc
> 0)
3641 if (t
->level2_alloc
> 0)
3643 if (t
->level3_alloc
> 0)
3647 /* Flattens the included transliterations into a translit list.
3648 Inserts them in the list at `cursor', and returns the new cursor. */
3649 static struct translit_t
**
3650 translit_flatten (struct locale_ctype_t
*ctype
,
3651 const struct charmap_t
*charmap
,
3652 struct translit_t
**cursor
)
3654 while (ctype
->translit_include
!= NULL
)
3656 const char *copy_locale
= ctype
->translit_include
->copy_locale
;
3657 const char *copy_repertoire
= ctype
->translit_include
->copy_repertoire
;
3658 struct localedef_t
*other
;
3660 /* Unchain the include statement. During the depth-first traversal
3661 we don't want to visit any locale more than once. */
3662 ctype
->translit_include
= ctype
->translit_include
->next
;
3664 other
= find_locale (LC_CTYPE
, copy_locale
, copy_repertoire
, charmap
);
3666 if (other
== NULL
|| other
->categories
[LC_CTYPE
].ctype
== NULL
)
3668 WITH_CUR_LOCALE (error (0, 0, _("\
3669 %s: transliteration data from locale `%s' not available"),
3670 "LC_CTYPE", copy_locale
));
3674 struct locale_ctype_t
*other_ctype
=
3675 other
->categories
[LC_CTYPE
].ctype
;
3677 cursor
= translit_flatten (other_ctype
, charmap
, cursor
);
3678 assert (other_ctype
->translit_include
== NULL
);
3680 if (other_ctype
->translit
!= NULL
)
3682 /* Insert the other_ctype->translit list at *cursor. */
3683 struct translit_t
*endp
= other_ctype
->translit
;
3684 while (endp
->next
!= NULL
)
3687 endp
->next
= *cursor
;
3688 *cursor
= other_ctype
->translit
;
3690 /* Avoid any risk of circular lists. */
3691 other_ctype
->translit
= NULL
;
3693 cursor
= &endp
->next
;
3696 if (ctype
->default_missing
== NULL
)
3697 ctype
->default_missing
= other_ctype
->default_missing
;
3705 allocate_arrays (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
3706 struct repertoire_t
*repertoire
)
3714 /* You wonder about this amount of memory? This is only because some
3715 users do not manage to address the array with unsigned values or
3716 data types with range >= 256. '\200' would result in the array
3717 index -128. To help these poor people we duplicate the entries for
3718 128 up to 255 below the entry for \0. */
3719 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128, sizeof (char_class_t
));
3720 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (256, sizeof (char_class32_t
));
3721 ctype
->class_b
= (uint32_t **)
3722 xmalloc (ctype
->nr_charclass
* sizeof (uint32_t *));
3723 ctype
->class_3level
= (struct wctype_table
*)
3724 xmalloc (ctype
->nr_charclass
* sizeof (struct wctype_table
));
3726 /* This is the array accessed using the multibyte string elements. */
3727 for (idx
= 0; idx
< 256; ++idx
)
3728 ctype
->ctype_b
[128 + idx
] = ctype
->class256_collection
[idx
];
3730 /* Mirror first 127 entries. We must take care that entry -1 is not
3731 mirrored because EOF == -1. */
3732 for (idx
= 0; idx
< 127; ++idx
)
3733 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
3735 /* The 32 bit array contains all characters < 0x100. */
3736 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3737 if (ctype
->charnames
[idx
] < 0x100)
3738 ctype
->ctype32_b
[ctype
->charnames
[idx
]] = ctype
->class_collection
[idx
];
3740 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3742 ctype
->class_b
[nr
] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3744 /* We only set CLASS_B for the bits in the ISO C classes, not
3745 the user defined classes. The number should not change but
3747 #define LAST_ISO_C_BIT 11
3748 if (nr
<= LAST_ISO_C_BIT
)
3749 for (idx
= 0; idx
< 256; ++idx
)
3750 if (ctype
->class256_collection
[idx
] & _ISbit (nr
))
3751 ctype
->class_b
[nr
][idx
>> 5] |= (uint32_t) 1 << (idx
& 0x1f);
3754 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3756 struct wctype_table
*t
;
3758 t
= &ctype
->class_3level
[nr
];
3759 t
->p
= 4; /* or: 5 */
3760 t
->q
= 7; /* or: 6 */
3761 wctype_table_init (t
);
3763 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3764 if (ctype
->class_collection
[idx
] & _ISwbit (nr
))
3765 wctype_table_add (t
, ctype
->charnames
[idx
]);
3768 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3769 %s: table for class \"%s\": %lu bytes\n"),
3770 "LC_CTYPE", ctype
->classnames
[nr
],
3771 (unsigned long int) t
->result_size
));
3774 /* Room for table of mappings. */
3775 ctype
->map_b
= (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3776 ctype
->map32_b
= (uint32_t **) xmalloc (ctype
->map_collection_nr
3777 * sizeof (uint32_t *));
3778 ctype
->map_3level
= (struct wctrans_table
*)
3779 xmalloc (ctype
->map_collection_nr
* sizeof (struct wctrans_table
));
3781 /* Fill in all mappings. */
3782 for (idx
= 0; idx
< 2; ++idx
)
3786 /* Allocate table. */
3787 ctype
->map_b
[idx
] = (uint32_t *)
3788 xmalloc ((256 + 128) * sizeof (uint32_t));
3790 /* Copy values from collection. */
3791 for (idx2
= 0; idx2
< 256; ++idx2
)
3792 ctype
->map_b
[idx
][128 + idx2
] = ctype
->map256_collection
[idx
][idx2
];
3794 /* Mirror first 127 entries. We must take care not to map entry
3795 -1 because EOF == -1. */
3796 for (idx2
= 0; idx2
< 127; ++idx2
)
3797 ctype
->map_b
[idx
][idx2
] = ctype
->map_b
[idx
][256 + idx2
];
3799 /* EOF must map to EOF. */
3800 ctype
->map_b
[idx
][127] = EOF
;
3803 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
3807 /* Allocate table. */
3808 ctype
->map32_b
[idx
] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3810 /* Copy values from collection. Default is identity mapping. */
3811 for (idx2
= 0; idx2
< 256; ++idx2
)
3812 ctype
->map32_b
[idx
][idx2
] =
3813 (ctype
->map_collection
[idx
][idx2
] != 0
3814 ? ctype
->map_collection
[idx
][idx2
]
3818 for (nr
= 0; nr
< ctype
->map_collection_nr
; nr
++)
3820 struct wctrans_table
*t
;
3822 t
= &ctype
->map_3level
[nr
];
3825 wctrans_table_init (t
);
3827 for (idx
= 0; idx
< ctype
->map_collection_act
[nr
]; ++idx
)
3828 if (ctype
->map_collection
[nr
][idx
] != 0)
3829 wctrans_table_add (t
, ctype
->charnames
[idx
],
3830 ctype
->map_collection
[nr
][idx
]);
3833 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3834 %s: table for map \"%s\": %lu bytes\n"),
3835 "LC_CTYPE", ctype
->mapnames
[nr
],
3836 (unsigned long int) t
->result_size
));
3839 /* Extra array for class and map names. */
3840 ctype
->class_name_ptr
= (uint32_t *) xmalloc (ctype
->nr_charclass
3841 * sizeof (uint32_t));
3842 ctype
->map_name_ptr
= (uint32_t *) xmalloc (ctype
->map_collection_nr
3843 * sizeof (uint32_t));
3845 ctype
->class_offset
= _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
3846 ctype
->map_offset
= ctype
->class_offset
+ ctype
->nr_charclass
;
3848 /* Array for width information. Because the expected widths are very
3849 small (never larger than 2) we use only one single byte. This
3851 We put only printable characters in the table. wcwidth is specified
3852 to return -1 for non-printable characters. Doing the check here
3853 saves a run-time check.
3854 But we put L'\0' in the table. This again saves a run-time check. */
3856 struct wcwidth_table
*t
;
3861 wcwidth_table_init (t
);
3863 /* First set all the printable characters of the character set to
3864 the default width. */
3866 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
3868 struct charseq
*data
= (struct charseq
*) vdata
;
3870 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
3871 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
3874 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
3876 uint32_t *class_bits
=
3877 find_idx (ctype
, &ctype
->class_collection
, NULL
,
3878 &ctype
->class_collection_act
, data
->ucs4
);
3880 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
3881 wcwidth_table_add (t
, data
->ucs4
, charmap
->width_default
);
3885 /* Now add the explicitly specified widths. */
3886 if (charmap
->width_rules
!= NULL
)
3887 for (size_t cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
3889 unsigned char bytes
[charmap
->mb_cur_max
];
3890 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
3892 /* We have the range of characters for which the width is
3893 specified, described using byte sequences of the multibyte
3894 charset. We have to convert this to UCS4 now. And we
3895 cannot simply convert the beginning and the end of the
3896 sequence; we have to iterate over the byte sequence and
3897 convert it for every single character. */
3898 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
3900 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
3901 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
3904 /* Find the UCS value for `bytes'. */
3907 struct charseq
*seq
=
3908 charmap_find_symbol (charmap
, (char *) bytes
, nbytes
);
3911 wch
= ILLEGAL_CHAR_VALUE
;
3912 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
3915 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
3916 strlen (seq
->name
));
3918 if (wch
!= ILLEGAL_CHAR_VALUE
)
3920 /* Store the value. */
3921 uint32_t *class_bits
=
3922 find_idx (ctype
, &ctype
->class_collection
, NULL
,
3923 &ctype
->class_collection_act
, wch
);
3925 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
3926 wcwidth_table_add (t
, wch
,
3927 charmap
->width_rules
[cnt
].width
);
3930 /* "Increment" the bytes sequence. */
3932 while (inner
>= 0 && bytes
[inner
] == 0xff)
3937 /* We have to extend the byte sequence. */
3938 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
3942 memset (&bytes
[1], 0, nbytes
);
3948 while (++inner
< nbytes
)
3954 /* Set the width of L'\0' to 0. */
3955 wcwidth_table_add (t
, 0, 0);
3958 WITH_CUR_LOCALE (fprintf (stderr
, _("%s: table for width: %lu bytes\n"),
3959 "LC_CTYPE", (unsigned long int) t
->result_size
));
3962 /* Set MB_CUR_MAX. */
3963 ctype
->mb_cur_max
= charmap
->mb_cur_max
;
3965 /* Now determine the table for the transliteration information.
3967 XXX It is not yet clear to me whether it is worth implementing a
3968 complicated algorithm which uses a hash table to locate the entries.
3969 For now I'll use a simple array which can be searched using binary
3971 if (ctype
->translit_include
!= NULL
)
3972 /* Traverse the locales mentioned in the `include' statements in a
3973 depth-first way and fold in their transliteration information. */
3974 translit_flatten (ctype
, charmap
, &ctype
->translit
);
3976 if (ctype
->translit
!= NULL
)
3978 /* First count how many entries we have. This is the upper limit
3979 since some entries from the included files might be overwritten. */
3981 struct translit_t
*runp
= ctype
->translit
;
3982 struct translit_t
**sorted
;
3983 size_t from_len
, to_len
;
3985 while (runp
!= NULL
)
3991 /* Next we allocate an array large enough and fill in the values. */
3992 sorted
= (struct translit_t
**) alloca (number
3993 * sizeof (struct translit_t
**));
3994 runp
= ctype
->translit
;
3998 /* Search for the place where to insert this string.
3999 XXX Better use a real sorting algorithm later. */
4003 while (idx
< number
)
4005 int res
= wcscmp ((const wchar_t *) sorted
[idx
]->from
,
4006 (const wchar_t *) runp
->from
);
4021 memmove (&sorted
[idx
+ 1], &sorted
[idx
],
4022 (number
- idx
) * sizeof (struct translit_t
*));
4029 while (runp
!= NULL
);
4031 /* The next step is putting all the possible transliteration
4032 strings in one memory block so that we can write it out.
4033 We need several different blocks:
4034 - index to the from-string array
4036 - index to the to-string array
4039 from_len
= to_len
= 0;
4040 for (size_t cnt
= 0; cnt
< number
; ++cnt
)
4042 struct translit_to_t
*srunp
;
4043 from_len
+= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4044 srunp
= sorted
[cnt
]->to
;
4045 while (srunp
!= NULL
)
4047 to_len
+= wcslen ((const wchar_t *) srunp
->str
) + 1;
4048 srunp
= srunp
->next
;
4050 /* Plus one for the extra NUL character marking the end of
4051 the list for the current entry. */
4055 /* We can allocate the arrays for the results. */
4056 ctype
->translit_from_idx
= xmalloc (number
* sizeof (uint32_t));
4057 ctype
->translit_from_tbl
= xmalloc (from_len
* sizeof (uint32_t));
4058 ctype
->translit_to_idx
= xmalloc (number
* sizeof (uint32_t));
4059 ctype
->translit_to_tbl
= xmalloc (to_len
* sizeof (uint32_t));
4063 for (size_t cnt
= 0; cnt
< number
; ++cnt
)
4066 struct translit_to_t
*srunp
;
4068 ctype
->translit_from_idx
[cnt
] = from_len
;
4069 ctype
->translit_to_idx
[cnt
] = to_len
;
4071 len
= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4072 wmemcpy ((wchar_t *) &ctype
->translit_from_tbl
[from_len
],
4073 (const wchar_t *) sorted
[cnt
]->from
, len
);
4076 ctype
->translit_to_idx
[cnt
] = to_len
;
4077 srunp
= sorted
[cnt
]->to
;
4078 while (srunp
!= NULL
)
4080 len
= wcslen ((const wchar_t *) srunp
->str
) + 1;
4081 wmemcpy ((wchar_t *) &ctype
->translit_to_tbl
[to_len
],
4082 (const wchar_t *) srunp
->str
, len
);
4084 srunp
= srunp
->next
;
4086 ctype
->translit_to_tbl
[to_len
++] = L
'\0';
4089 /* Store the information about the length. */
4090 ctype
->translit_idx_size
= number
;
4091 ctype
->translit_from_tbl_size
= from_len
* sizeof (uint32_t);
4092 ctype
->translit_to_tbl_size
= to_len
* sizeof (uint32_t);
4096 ctype
->translit_from_idx
= no_str
;
4097 ctype
->translit_from_tbl
= no_str
;
4098 ctype
->translit_to_tbl
= no_str
;
4099 ctype
->translit_idx_size
= 0;
4100 ctype
->translit_from_tbl_size
= 0;
4101 ctype
->translit_to_tbl_size
= 0;