lib/localeinfo.c

   1 /* locale information
   2
   3    Copyright 2016-2024 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  18    02110-1301, USA.  */
  19
  20 /* Written by Paul Eggert.  */
  21
  22 #include <config.h>
  23
  24 #include <localeinfo.h>
  25
  26 #include <limits.h>
  27 #include <locale.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #if GAWK
  31 /* Use ISO C 99 API.  */
  32 # include <wctype.h>
  33 # define char32_t wchar_t
  34 # define mbrtoc32 mbrtowc
  35 # define c32tolower towlower
  36 # define c32toupper towupper
  37 #else
  38 /* Use ISO C 11 + gnulib API.  */
  39 # include <uchar.h>
  40 #endif
  41
  42 /* The sbclen implementation relies on this.  */
  43 static_assert (MB_LEN_MAX <= SCHAR_MAX);
  44
  45 /* Return true if the locale uses UTF-8.  */
  46
  47 static bool
  48 is_using_utf8 (void)
  49 {
  50   char32_t wc;
  51   mbstate_t mbs = {0};
  52   return mbrtoc32 (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
  53 }
  54
  55 /* Return true if the locale is compatible enough with the C locale so
  56    that the locale is single-byte, bytes are in collating-sequence
  57    order, and there are no multi-character collating elements.  */
  58
  59 static bool
  60 using_simple_locale (bool multibyte)
  61 {
  62   /* The native character set is known to be compatible with
  63      the C locale.  The following test isn't perfect, but it's good
  64      enough in practice, as only ASCII and EBCDIC are in common use
  65      and this test correctly accepts ASCII and rejects EBCDIC.  */
  66   enum { native_c_charset =
  67     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
  68      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
  69      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
  70      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
  71      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
  72      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
  73      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
  74      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
  75      && '}' == 125 && '~' == 126)
  76   };
  77
  78   if (!native_c_charset || multibyte)
  79     return false;
  80
  81   /* As a heuristic, use strcoll to compare native character order.
  82      If this agrees with byte order the locale should be simple.
  83      This heuristic should work for all known practical locales,
  84      although it would be invalid for artificially-constructed locales
  85      where the native order is the collating-sequence order but there
  86      are multi-character collating elements.  */
  87   for (int i = 0; i < UCHAR_MAX; i++)
  88     if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
  89       return false;
  90
  91   return true;
  92 }
  93
  94 /* Initialize *LOCALEINFO from the current locale.  */
  95
  96 void
  97 init_localeinfo (struct localeinfo *localeinfo)
  98 {
  99   localeinfo->multibyte = MB_CUR_MAX > 1;
 100   localeinfo->simple = using_simple_locale (localeinfo->multibyte);
 101   localeinfo->using_utf8 = is_using_utf8 ();
 102
 103   for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
 104     {
 105       char c = i;
 106       unsigned char uc = i;
 107       mbstate_t s = {0};
 108       char32_t wc;
 109       size_t len = mbrtoc32 (&wc, &c, 1, &s);
 110       localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
 111       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
 112     }
 113 }
 114
 115 /* The set of char32_t values C such that there's a useful locale
 116    somewhere where C != towupper (C) && C != towlower (towupper (C)).
 117    For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
 118    towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
 119    towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
 120 static unsigned short int const lonesome_lower[] =
 121   {
 122     0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
 123     0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
 124
 125     /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
 126        counterpart in locales predating Unicode 4.0.0 (April 2003).  */
 127     0x03F2,
 128
 129     0x03F5, 0x1E9B, 0x1FBE,
 130   };
 131
 132 /* Verify that the worst case fits.  This is 1 for towupper, 1 for
 133    towlower, and 1 for each entry in LONESOME_LOWER.  */
 134 static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
 135                <= CASE_FOLDED_BUFSIZE);
 136
 137 /* Find the characters equal to C after case-folding, other than C
 138    itself, and store them into FOLDED.  Return the number of characters
 139    stored; this is zero if C is WEOF.  */
 140
 141 int
 142 case_folded_counterparts (wint_t c, char32_t folded[CASE_FOLDED_BUFSIZE])
 143 {
 144   int i;
 145   int n = 0;
 146   wint_t uc = c32toupper (c);
 147   wint_t lc = c32tolower (uc);
 148   if (uc != c)
 149     folded[n++] = uc;
 150   if (lc != uc && lc != c && c32toupper (lc) == uc)
 151     folded[n++] = lc;
 152   for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
 153     {
 154       wint_t li = lonesome_lower[i];
 155       if (li != lc && li != uc && li != c && c32toupper (li) == uc)
 156         folded[n++] = li;
 157     }
 158   return n;
 159 }