lib/localeinfo.c

   1 /* locale information
   2
   3    Copyright 2016-2020 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  18    02110-1301, USA.  */
  19
  20 /* Written by Paul Eggert.  */
  21
  22 #include <config.h>
  23
  24 #include <localeinfo.h>
  25
  26 #include <verify.h>
  27
  28 #include <limits.h>
  29 #include <locale.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32 #include <wctype.h>
  33
  34 /* The sbclen implementation relies on this.  */
  35 verify (MB_LEN_MAX <= SCHAR_MAX);
  36
  37 /* Return true if the locale uses UTF-8.  */
  38
  39 static bool
  40 is_using_utf8 (void)
  41 {
  42   wchar_t wc;
  43   mbstate_t mbs = {0};
  44   return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
  45 }
  46
  47 /* Return true if the locale is compatible enough with the C locale so
  48    that the locale is single-byte, bytes are in collating-sequence
  49    order, and there are no multi-character collating elements.  */
  50
  51 static bool
  52 using_simple_locale (bool multibyte)
  53 {
  54   /* The native character set is known to be compatible with
  55      the C locale.  The following test isn't perfect, but it's good
  56      enough in practice, as only ASCII and EBCDIC are in common use
  57      and this test correctly accepts ASCII and rejects EBCDIC.  */
  58   enum { native_c_charset =
  59     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
  60      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
  61      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
  62      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
  63      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
  64      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
  65      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
  66      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
  67      && '}' == 125 && '~' == 126)
  68   };
  69
  70   if (!native_c_charset || multibyte)
  71     return false;
  72
  73   /* As a heuristic, use strcoll to compare native character order.
  74      If this agrees with byte order the locale should be simple.
  75      This heuristic should work for all known practical locales,
  76      although it would be invalid for artificially-constructed locales
  77      where the native order is the collating-sequence order but there
  78      are multi-character collating elements.  */
  79   for (int i = 0; i < UCHAR_MAX; i++)
  80     if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
  81       return false;
  82
  83   return true;
  84 }
  85
  86 /* Initialize *LOCALEINFO from the current locale.  */
  87
  88 void
  89 init_localeinfo (struct localeinfo *localeinfo)
  90 {
  91   localeinfo->multibyte = MB_CUR_MAX > 1;
  92   localeinfo->simple = using_simple_locale (localeinfo->multibyte);
  93   localeinfo->using_utf8 = is_using_utf8 ();
  94
  95   for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
  96     {
  97       char c = i;
  98       unsigned char uc = i;
  99       mbstate_t s = {0};
 100       wchar_t wc;
 101       size_t len = mbrtowc (&wc, &c, 1, &s);
 102       localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
 103       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
 104     }
 105 }
 106
 107 /* The set of wchar_t values C such that there's a useful locale
 108    somewhere where C != towupper (C) && C != towlower (towupper (C)).
 109    For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
 110    towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
 111    towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
 112 static short const lonesome_lower[] =
 113   {
 114     0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
 115     0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
 116
 117     /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
 118        counterpart in locales predating Unicode 4.0.0 (April 2003).  */
 119     0x03F2,
 120
 121     0x03F5, 0x1E9B, 0x1FBE,
 122   };
 123
 124 /* Verify that the worst case fits.  This is 1 for towupper, 1 for
 125    towlower, and 1 for each entry in LONESOME_LOWER.  */
 126 verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
 127         <= CASE_FOLDED_BUFSIZE);
 128
 129 /* Find the characters equal to C after case-folding, other than C
 130    itself, and store them into FOLDED.  Return the number of characters
 131    stored; this is zero if C is WEOF.  */
 132
 133 int
 134 case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
 135 {
 136   int i;
 137   int n = 0;
 138   wint_t uc = towupper (c);
 139   wint_t lc = towlower (uc);
 140   if (uc != c)
 141     folded[n++] = uc;
 142   if (lc != uc && lc != c && towupper (lc) == uc)
 143     folded[n++] = lc;
 144   for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
 145     {
 146       wint_t li = lonesome_lower[i];
 147       if (li != lc && li != uc && li != c && towupper (li) == uc)
 148         folded[n++] = li;
 149     }
 150   return n;
 151 }