libc/nls: Sync with FreeBSD.
[dragonfly.git] / usr.bin / localedef / ctype.c
blob433cc33a667872cd7e2c6e1396855b01968d10e8
1 /*
2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * LC_CTYPE database generation routines for localedef.
36 #include <sys/tree.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stddef.h>
41 #include <string.h>
42 #include <sys/types.h>
43 #include <wchar.h>
44 #include <ctype.h>
45 #include <wctype.h>
46 #include <unistd.h>
47 #include "localedef.h"
48 #include "parser.h"
49 #include "runefile.h"
/*
 * Bootstrapping aid: _CTYPE_N was not available before 1 Sep 2015, so
 * supply a fallback value when the system headers do not define it.
 */
#if !defined(_CTYPE_N)
#define	_CTYPE_N	0x00400000L
#endif

/*
 * Local aliases for the _CTYPE_* class bits used throughout this file.
 * _E1 .. _E5 are the extension classes assigned by add_ctype_impl();
 * _E3 deliberately contributes no bit.
 */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
74 static wchar_t last_ctype;
75 static int ctype_compare(const void *n1, const void *n2);
77 typedef struct ctype_node {
78 wchar_t wc;
79 int32_t ctype;
80 int32_t toupper;
81 int32_t tolower;
82 RB_ENTRY(ctype_node) entry;
83 } ctype_node_t;
85 static RB_HEAD(ctypes, ctype_node) ctypes;
86 RB_PROTOTYPE_STATIC(ctypes, ctype_node, entry, ctype_compare);
87 RB_GENERATE(ctypes, ctype_node, entry, ctype_compare);
89 static int
90 ctype_compare(const void *n1, const void *n2)
92 const ctype_node_t *c1 = n1;
93 const ctype_node_t *c2 = n2;
95 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
98 void
99 init_ctype(void)
101 RB_INIT(&ctypes);
105 static void
106 add_ctype_impl(ctype_node_t *ctn)
108 switch (last_kw) {
109 case T_ISUPPER:
110 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
111 break;
112 case T_ISLOWER:
113 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
114 break;
115 case T_ISALPHA:
116 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
117 break;
118 case T_ISDIGIT:
119 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
120 break;
121 case T_ISSPACE:
122 ctn->ctype |= _ISSPACE;
123 break;
124 case T_ISCNTRL:
125 ctn->ctype |= _ISCNTRL;
126 break;
127 case T_ISGRAPH:
128 ctn->ctype |= (_ISGRAPH | _ISPRINT);
129 break;
130 case T_ISPRINT:
131 ctn->ctype |= _ISPRINT;
132 break;
133 case T_ISPUNCT:
134 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
135 break;
136 case T_ISXDIGIT:
137 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
138 break;
139 case T_ISBLANK:
140 ctn->ctype |= (_ISBLANK | _ISSPACE);
141 break;
142 case T_ISPHONOGRAM:
143 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
144 break;
145 case T_ISIDEOGRAM:
146 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
147 break;
148 case T_ISENGLISH:
149 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
150 break;
151 case T_ISNUMBER:
152 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
153 break;
154 case T_ISSPECIAL:
155 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
156 break;
157 case T_ISALNUM:
159 * We can't do anything with this. The character
160 * should already be specified as a digit or alpha.
162 break;
163 default:
164 errf("not a valid character class");
168 static ctype_node_t *
169 get_ctype(wchar_t wc)
171 ctype_node_t srch;
172 ctype_node_t *ctn;
174 srch.wc = wc;
175 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
176 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
177 errf("out of memory");
178 return (NULL);
180 ctn->wc = wc;
182 RB_INSERT(ctypes, &ctypes, ctn);
184 return (ctn);
187 void
188 add_ctype(int val)
190 ctype_node_t *ctn;
192 if ((ctn = get_ctype(val)) == NULL) {
193 INTERR;
194 return;
196 add_ctype_impl(ctn);
197 last_ctype = ctn->wc;
200 void
201 add_ctype_range(wchar_t end)
203 ctype_node_t *ctn;
204 wchar_t cur;
206 if (end < last_ctype) {
207 errf("malformed character range (%u ... %u))",
208 last_ctype, end);
209 return;
211 for (cur = last_ctype + 1; cur <= end; cur++) {
212 if ((ctn = get_ctype(cur)) == NULL) {
213 INTERR;
214 return;
216 add_ctype_impl(ctn);
218 last_ctype = end;
223 * A word about widths: if the width mask is specified, then libc
224 * unconditionally honors it. Otherwise, it assumes printable
225 * characters have width 1, and non-printable characters have width
226 * -1 (except for NULL which is special with with 0). Hence, we have
227 * no need to inject defaults here -- the "default" unset value of 0
228 * indicates that libc should use its own logic in wcwidth as described.
230 void
231 add_width(int wc, int width)
233 ctype_node_t *ctn;
235 if ((ctn = get_ctype(wc)) == NULL) {
236 INTERR;
237 return;
239 ctn->ctype &= ~(_CTYPE_SWM);
240 switch (width) {
241 case 0:
242 ctn->ctype |= _CTYPE_SW0;
243 break;
244 case 1:
245 ctn->ctype |= _CTYPE_SW1;
246 break;
247 case 2:
248 ctn->ctype |= _CTYPE_SW2;
249 break;
250 case 3:
251 ctn->ctype |= _CTYPE_SW3;
252 break;
/*
 * Record the same display width for every character in the inclusive
 * range start..end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
264 void
265 add_caseconv(int val, int wc)
267 ctype_node_t *ctn;
269 ctn = get_ctype(val);
270 if (ctn == NULL) {
271 INTERR;
272 return;
275 switch (last_kw) {
276 case T_TOUPPER:
277 ctn->toupper = wc;
278 break;
279 case T_TOLOWER:
280 ctn->tolower = wc;
281 break;
282 default:
283 INTERR;
284 break;
288 void
289 dump_ctype(void)
291 FILE *f;
292 _FileRuneLocale rl;
293 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
294 _FileRuneEntry *ct = NULL;
295 _FileRuneEntry *lo = NULL;
296 _FileRuneEntry *up = NULL;
297 wchar_t wc;
299 (void) memset(&rl, 0, sizeof (rl));
300 last_ct = NULL;
301 last_lo = NULL;
302 last_up = NULL;
304 if ((f = open_category()) == NULL)
305 return;
307 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
308 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
311 * Initialize the identity map.
313 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
314 rl.maplower[wc] = wc;
315 rl.mapupper[wc] = wc;
318 RB_FOREACH(ctn, ctypes, &ctypes) {
319 int conflict = 0;
321 wc = ctn->wc;
324 * POSIX requires certain portable characters have
325 * certain types. Add them if they are missing.
327 if ((wc >= 1) && (wc <= 127)) {
328 if ((wc >= 'A') && (wc <= 'Z'))
329 ctn->ctype |= _ISUPPER;
330 if ((wc >= 'a') && (wc <= 'z'))
331 ctn->ctype |= _ISLOWER;
332 if ((wc >= '0') && (wc <= '9'))
333 ctn->ctype |= _ISDIGIT;
334 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
335 ctn->ctype |= _ISSPACE;
336 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
337 ctn->ctype |= _ISXDIGIT;
338 if (strchr(" \t", (char)wc))
339 ctn->ctype |= _ISBLANK;
340 if (wc == ' ')
341 ctn->ctype |= _ISPRINT;
344 * Technically these settings are only
345 * required for the C locale. However, it
346 * turns out that because of the historical
347 * version of isprint(), we need them for all
348 * locales as well. Note that these are not
349 * necessarily valid punctation characters in
350 * the current language, but ispunct() needs
351 * to return TRUE for them.
353 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
354 (char)wc))
355 ctn->ctype |= _ISPUNCT;
359 * POSIX also requires that certain types imply
360 * others. Add any inferred types here.
362 if (ctn->ctype & (_ISUPPER |_ISLOWER))
363 ctn->ctype |= _ISALPHA;
364 if (ctn->ctype & _ISDIGIT)
365 ctn->ctype |= _ISXDIGIT;
366 if (ctn->ctype & _ISBLANK)
367 ctn->ctype |= _ISSPACE;
368 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
369 ctn->ctype |= _ISGRAPH;
370 if (ctn->ctype & _ISGRAPH)
371 ctn->ctype |= _ISPRINT;
374 * Finally, POSIX requires that certain combinations
375 * are invalid. We don't flag this as a fatal error,
376 * but we will warn about.
378 if ((ctn->ctype & _ISALPHA) &&
379 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
380 conflict++;
381 if ((ctn->ctype & _ISPUNCT) &
382 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
383 conflict++;
384 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
385 conflict++;
386 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
387 conflict++;
388 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
389 conflict++;
391 if (conflict) {
392 warn("conflicting classes for character 0x%x (%x)",
393 wc, ctn->ctype);
396 * Handle the lower 256 characters using the simple
397 * optimization. Note that if we have not defined the
398 * upper/lower case, then we identity map it.
400 if ((unsigned)wc < _CACHED_RUNES) {
401 rl.runetype[wc] = ctn->ctype;
402 if (ctn->tolower)
403 rl.maplower[wc] = ctn->tolower;
404 if (ctn->toupper)
405 rl.mapupper[wc] = ctn->toupper;
406 continue;
409 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
410 (last_ct->wc + 1 == wc)) {
411 ct[rl.runetype_ext_nranges-1].max = wc;
412 } else {
413 rl.runetype_ext_nranges++;
414 ct = realloc(ct,
415 sizeof (*ct) * rl.runetype_ext_nranges);
416 ct[rl.runetype_ext_nranges - 1].min = wc;
417 ct[rl.runetype_ext_nranges - 1].max = wc;
418 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
420 last_ct = ctn;
421 if (ctn->tolower == 0) {
422 last_lo = NULL;
423 } else if ((last_lo != NULL) &&
424 (last_lo->tolower + 1 == ctn->tolower)) {
425 lo[rl.maplower_ext_nranges-1].max = wc;
426 last_lo = ctn;
427 } else {
428 rl.maplower_ext_nranges++;
429 lo = realloc(lo,
430 sizeof (*lo) * rl.maplower_ext_nranges);
431 lo[rl.maplower_ext_nranges - 1].min = wc;
432 lo[rl.maplower_ext_nranges - 1].max = wc;
433 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
434 last_lo = ctn;
437 if (ctn->toupper == 0) {
438 last_up = NULL;
439 } else if ((last_up != NULL) &&
440 (last_up->toupper + 1 == ctn->toupper)) {
441 up[rl.mapupper_ext_nranges-1].max = wc;
442 last_up = ctn;
443 } else {
444 rl.mapupper_ext_nranges++;
445 up = realloc(up,
446 sizeof (*up) * rl.mapupper_ext_nranges);
447 up[rl.mapupper_ext_nranges - 1].min = wc;
448 up[rl.mapupper_ext_nranges - 1].max = wc;
449 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
450 last_up = ctn;
454 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
455 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
456 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
457 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
458 return;
461 close_category(f);