exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / localeinfo.c
blob67af44a6fff40776bb95f9a44af3dd4bc2a1f377
1 /* locale information
3 Copyright 2016-2024 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18 02110-1301, USA. */
20 /* Written by Paul Eggert. */
22 #include <config.h>
24 #include <localeinfo.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #if GAWK
31 /* Use ISO C 99 API. */
32 # include <wctype.h>
33 # define char32_t wchar_t
34 # define mbrtoc32 mbrtowc
35 # define c32tolower towlower
36 # define c32toupper towupper
37 #else
38 /* Use ISO C 11 + gnulib API. */
39 # include <uchar.h>
40 #endif
/* The sbclen implementation relies on this: a multibyte character
   length, which is at most MB_LEN_MAX, must also fit in the range of
   a signed char.  NOTE(review): presumably sbclen's element type is
   signed char; confirm against localeinfo.h.  */
static_assert (MB_LEN_MAX <= SCHAR_MAX);
/* Return true if the locale uses UTF-8.

   The probe decodes the two bytes 0xC4 0x80, which form the UTF-8
   encoding of U+0100, starting from a zero-initialized conversion
   state.  The locale counts as UTF-8 only if the decoder consumes
   exactly those two bytes and produces the value 0x100.  */

static bool
is_using_utf8 (void)
{
  char32_t wc;
  mbstate_t mbs = {0};
  return mbrtoc32 (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
}
/* Return true if the locale is compatible enough with the C locale so
   that the locale is single-byte, bytes are in collating-sequence
   order, and there are no multi-character collating elements.

   MULTIBYTE tells whether the locale is multibyte; if it is true the
   result is always false.  */

static bool
using_simple_locale (bool multibyte)
{
  /* The native character set is known to be compatible with
     the C locale.  The following test isn't perfect, but it's good
     enough in practice, as only ASCII and EBCDIC are in common use
     and this test correctly accepts ASCII and rejects EBCDIC.  */
  enum { native_c_charset =
    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
     && '}' == 125 && '~' == 126)
  };

  if (!native_c_charset || multibyte)
    return false;

  /* As a heuristic, use strcoll to compare native character order.
     If this agrees with byte order the locale should be simple.
     This heuristic should work for all known practical locales,
     although it would be invalid for artificially-constructed locales
     where the native order is the collating-sequence order but there
     are multi-character collating elements.  */

  /* Compare each one-byte string with the one-byte string holding the
     next byte value; any pair that does not collate as strictly
     ascending disqualifies the locale.  */
  for (int i = 0; i < UCHAR_MAX; i++)
    if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
      return false;

  return true;
}
94 /* Initialize *LOCALEINFO from the current locale. */
96 void
97 init_localeinfo (struct localeinfo *localeinfo)
99 localeinfo->multibyte = MB_CUR_MAX > 1;
100 localeinfo->simple = using_simple_locale (localeinfo->multibyte);
101 localeinfo->using_utf8 = is_using_utf8 ();
103 for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
105 char c = i;
106 unsigned char uc = i;
107 mbstate_t s = {0};
108 char32_t wc;
109 size_t len = mbrtoc32 (&wc, &c, 1, &s);
110 localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
111 localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
/* The set of char32_t values C such that there's a useful locale
   somewhere where C != towupper (C) && C != towlower (towupper (C)).
   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
static unsigned short int const lonesome_lower[] =
  {
    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5, 0x1E9B, 0x1FBE,
  };
132 /* Verify that the worst case fits. This is 1 for towupper, 1 for
133 towlower, and 1 for each entry in LONESOME_LOWER. */
134 static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
135 <= CASE_FOLDED_BUFSIZE);
137 /* Find the characters equal to C after case-folding, other than C
138 itself, and store them into FOLDED. Return the number of characters
139 stored; this is zero if C is WEOF. */
142 case_folded_counterparts (wint_t c, char32_t folded[CASE_FOLDED_BUFSIZE])
144 int i;
145 int n = 0;
146 wint_t uc = c32toupper (c);
147 wint_t lc = c32tolower (uc);
148 if (uc != c)
149 folded[n++] = uc;
150 if (lc != uc && lc != c && c32toupper (lc) == uc)
151 folded[n++] = lc;
152 for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
154 wint_t li = lonesome_lower[i];
155 if (li != lc && li != uc && li != c && c32toupper (li) == uc)
156 folded[n++] = li;
158 return n;