2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * LC_CTYPE database generation routines for localedef.
42 #include <sys/types.h>
47 #include "localedef.h"
52 /* Needed for bootstrapping, _CTYPE_N not available before 1 Sep 2015 */
54 #define _CTYPE_N 0x00400000L
57 #define _ISUPPER _CTYPE_U
58 #define _ISLOWER _CTYPE_L
59 #define _ISDIGIT _CTYPE_D
60 #define _ISXDIGIT _CTYPE_X
61 #define _ISSPACE _CTYPE_S
62 #define _ISBLANK _CTYPE_B
63 #define _ISALPHA _CTYPE_A
64 #define _ISPUNCT _CTYPE_P
65 #define _ISGRAPH _CTYPE_G
66 #define _ISPRINT _CTYPE_R
67 #define _ISCNTRL _CTYPE_C
74 static wchar_t last_ctype
;
75 static int ctype_compare(const void *n1
, const void *n2
);
77 typedef struct ctype_node
{
82 RB_ENTRY(ctype_node
) entry
;
85 static RB_HEAD(ctypes
, ctype_node
) ctypes
;
86 RB_PROTOTYPE_STATIC(ctypes
, ctype_node
, entry
, ctype_compare
);
87 RB_GENERATE(ctypes
, ctype_node
, entry
, ctype_compare
);
90 ctype_compare(const void *n1
, const void *n2
)
92 const ctype_node_t
*c1
= n1
;
93 const ctype_node_t
*c2
= n2
;
95 return (c1
->wc
< c2
->wc
? -1 : c1
->wc
> c2
->wc
? 1 : 0);
106 add_ctype_impl(ctype_node_t
*ctn
)
110 ctn
->ctype
|= (_ISUPPER
| _ISALPHA
| _ISGRAPH
| _ISPRINT
);
113 ctn
->ctype
|= (_ISLOWER
| _ISALPHA
| _ISGRAPH
| _ISPRINT
);
116 ctn
->ctype
|= (_ISALPHA
| _ISGRAPH
| _ISPRINT
);
119 ctn
->ctype
|= (_ISDIGIT
| _ISGRAPH
| _ISPRINT
| _ISXDIGIT
| _E4
);
122 ctn
->ctype
|= _ISSPACE
;
125 ctn
->ctype
|= _ISCNTRL
;
128 ctn
->ctype
|= (_ISGRAPH
| _ISPRINT
);
131 ctn
->ctype
|= _ISPRINT
;
134 ctn
->ctype
|= (_ISPUNCT
| _ISGRAPH
| _ISPRINT
);
137 ctn
->ctype
|= (_ISXDIGIT
| _ISPRINT
);
140 ctn
->ctype
|= (_ISBLANK
| _ISSPACE
);
143 ctn
->ctype
|= (_E1
| _ISPRINT
| _ISGRAPH
);
146 ctn
->ctype
|= (_E2
| _ISPRINT
| _ISGRAPH
);
149 ctn
->ctype
|= (_E3
| _ISPRINT
| _ISGRAPH
);
152 ctn
->ctype
|= (_E4
| _ISPRINT
| _ISGRAPH
);
155 ctn
->ctype
|= (_E5
| _ISPRINT
| _ISGRAPH
);
159 * We can't do anything with this. The character
160 * should already be specified as a digit or alpha.
164 errf("not a valid character class");
168 static ctype_node_t
*
169 get_ctype(wchar_t wc
)
175 if ((ctn
= RB_FIND(ctypes
, &ctypes
, &srch
)) == NULL
) {
176 if ((ctn
= calloc(1, sizeof (*ctn
))) == NULL
) {
177 errf("out of memory");
182 RB_INSERT(ctypes
, &ctypes
, ctn
);
192 if ((ctn
= get_ctype(val
)) == NULL
) {
197 last_ctype
= ctn
->wc
;
201 add_ctype_range(wchar_t end
)
206 if (end
< last_ctype
) {
207 errf("malformed character range (%u ... %u))",
211 for (cur
= last_ctype
+ 1; cur
<= end
; cur
++) {
212 if ((ctn
= get_ctype(cur
)) == NULL
) {
223 * A word about widths: if the width mask is specified, then libc
224 * unconditionally honors it. Otherwise, it assumes printable
225 * characters have width 1, and non-printable characters have width
226 * -1 (except for NULL which is special with width 0). Hence, we have
227 * no need to inject defaults here -- the "default" unset value of 0
228 * indicates that libc should use its own logic in wcwidth as described.
231 add_width(int wc
, int width
)
235 if ((ctn
= get_ctype(wc
)) == NULL
) {
239 ctn
->ctype
&= ~(_CTYPE_SWM
);
242 ctn
->ctype
|= _CTYPE_SW0
;
245 ctn
->ctype
|= _CTYPE_SW1
;
248 ctn
->ctype
|= _CTYPE_SW2
;
251 ctn
->ctype
|= _CTYPE_SW3
;
257 add_width_range(int start
, int end
, int width
)
259 for (; start
<= end
; start
++) {
260 add_width(start
, width
);
265 add_caseconv(int val
, int wc
)
269 ctn
= get_ctype(val
);
293 ctype_node_t
*ctn
, *last_ct
, *last_lo
, *last_up
;
294 _FileRuneEntry
*ct
= NULL
;
295 _FileRuneEntry
*lo
= NULL
;
296 _FileRuneEntry
*up
= NULL
;
299 (void) memset(&rl
, 0, sizeof (rl
));
304 if ((f
= open_category()) == NULL
)
307 (void) memcpy(rl
.magic
, _FILE_RUNE_MAGIC_1
, 8);
308 (void) strncpy(rl
.encoding
, get_wide_encoding(), sizeof (rl
.encoding
));
311 * Initialize the identity map.
313 for (wc
= 0; (unsigned)wc
< _CACHED_RUNES
; wc
++) {
314 rl
.maplower
[wc
] = wc
;
315 rl
.mapupper
[wc
] = wc
;
318 RB_FOREACH(ctn
, ctypes
, &ctypes
) {
324 * POSIX requires certain portable characters have
325 * certain types. Add them if they are missing.
327 if ((wc
>= 1) && (wc
<= 127)) {
328 if ((wc
>= 'A') && (wc
<= 'Z'))
329 ctn
->ctype
|= _ISUPPER
;
330 if ((wc
>= 'a') && (wc
<= 'z'))
331 ctn
->ctype
|= _ISLOWER
;
332 if ((wc
>= '0') && (wc
<= '9'))
333 ctn
->ctype
|= _ISDIGIT
;
334 if (strchr(" \f\n\r\t\v", (char)wc
) != NULL
)
335 ctn
->ctype
|= _ISSPACE
;
336 if (strchr("0123456789ABCDEFabcdef", (char)wc
) != NULL
)
337 ctn
->ctype
|= _ISXDIGIT
;
338 if (strchr(" \t", (char)wc
))
339 ctn
->ctype
|= _ISBLANK
;
341 ctn
->ctype
|= _ISPRINT
;
344 * Technically these settings are only
345 * required for the C locale. However, it
346 * turns out that because of the historical
347 * version of isprint(), we need them for all
348 * locales as well. Note that these are not
349 * necessarily valid punctuation characters in
350 * the current language, but ispunct() needs
351 * to return TRUE for them.
353 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
355 ctn
->ctype
|= _ISPUNCT
;
359 * POSIX also requires that certain types imply
360 * others. Add any inferred types here.
362 if (ctn
->ctype
& (_ISUPPER
|_ISLOWER
))
363 ctn
->ctype
|= _ISALPHA
;
364 if (ctn
->ctype
& _ISDIGIT
)
365 ctn
->ctype
|= _ISXDIGIT
;
366 if (ctn
->ctype
& _ISBLANK
)
367 ctn
->ctype
|= _ISSPACE
;
368 if (ctn
->ctype
& (_ISALPHA
|_ISDIGIT
|_ISXDIGIT
))
369 ctn
->ctype
|= _ISGRAPH
;
370 if (ctn
->ctype
& _ISGRAPH
)
371 ctn
->ctype
|= _ISPRINT
;
374 * Finally, POSIX requires that certain combinations
375 * are invalid. We don't flag this as a fatal error,
376 * but we will warn about them.
378 if ((ctn
->ctype
& _ISALPHA
) &&
379 (ctn
->ctype
& (_ISPUNCT
|_ISDIGIT
)))
381 if ((ctn
->ctype
& _ISPUNCT
) &
382 (ctn
->ctype
& (_ISDIGIT
|_ISALPHA
|_ISXDIGIT
)))
384 if ((ctn
->ctype
& _ISSPACE
) && (ctn
->ctype
& _ISGRAPH
))
386 if ((ctn
->ctype
& _ISCNTRL
) & _ISPRINT
)
388 if ((wc
== ' ') && (ctn
->ctype
& (_ISPUNCT
|_ISGRAPH
)))
392 warn("conflicting classes for character 0x%x (%x)",
396 * Handle the lower 256 characters using the simple
397 * optimization. Note that if we have not defined the
398 * upper/lower case, then we identity map it.
400 if ((unsigned)wc
< _CACHED_RUNES
) {
401 rl
.runetype
[wc
] = ctn
->ctype
;
403 rl
.maplower
[wc
] = ctn
->tolower
;
405 rl
.mapupper
[wc
] = ctn
->toupper
;
409 if ((last_ct
!= NULL
) && (last_ct
->ctype
== ctn
->ctype
) &&
410 (last_ct
->wc
+ 1 == wc
)) {
411 ct
[rl
.runetype_ext_nranges
-1].max
= wc
;
413 rl
.runetype_ext_nranges
++;
415 sizeof (*ct
) * rl
.runetype_ext_nranges
);
416 ct
[rl
.runetype_ext_nranges
- 1].min
= wc
;
417 ct
[rl
.runetype_ext_nranges
- 1].max
= wc
;
418 ct
[rl
.runetype_ext_nranges
- 1].map
= ctn
->ctype
;
421 if (ctn
->tolower
== 0) {
423 } else if ((last_lo
!= NULL
) &&
424 (last_lo
->tolower
+ 1 == ctn
->tolower
)) {
425 lo
[rl
.maplower_ext_nranges
-1].max
= wc
;
428 rl
.maplower_ext_nranges
++;
430 sizeof (*lo
) * rl
.maplower_ext_nranges
);
431 lo
[rl
.maplower_ext_nranges
- 1].min
= wc
;
432 lo
[rl
.maplower_ext_nranges
- 1].max
= wc
;
433 lo
[rl
.maplower_ext_nranges
- 1].map
= ctn
->tolower
;
437 if (ctn
->toupper
== 0) {
439 } else if ((last_up
!= NULL
) &&
440 (last_up
->toupper
+ 1 == ctn
->toupper
)) {
441 up
[rl
.mapupper_ext_nranges
-1].max
= wc
;
444 rl
.mapupper_ext_nranges
++;
446 sizeof (*up
) * rl
.mapupper_ext_nranges
);
447 up
[rl
.mapupper_ext_nranges
- 1].min
= wc
;
448 up
[rl
.mapupper_ext_nranges
- 1].max
= wc
;
449 up
[rl
.mapupper_ext_nranges
- 1].map
= ctn
->toupper
;
454 if ((wr_category(&rl
, sizeof (rl
), f
) < 0) ||
455 (wr_category(ct
, sizeof (*ct
) * rl
.runetype_ext_nranges
, f
) < 0) ||
456 (wr_category(lo
, sizeof (*lo
) * rl
.maplower_ext_nranges
, f
) < 0) ||
457 (wr_category(up
, sizeof (*up
) * rl
.mapupper_ext_nranges
, f
) < 0)) {