lib/localcharset.c

   1 /* Determine a canonical name for the current locale's character encoding.
   2
   3    Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License along
  16    with this program; if not, see <https://www.gnu.org/licenses/>.  */
  17
  18 /* Written by Bruno Haible <bruno@clisp.org>.  */
  19
  20 #include <config.h>
  21
  22 /* Specification.  */
  23 #include "localcharset.h"
  24
  25 #include <stddef.h>
  26 #include <stdio.h>
  27 #include <string.h>
  28 #include <stdlib.h>
  29
  30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
  31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
  32 #endif
  33
  34 #if defined _WIN32 && !defined __CYGWIN__
  35 # define WINDOWS_NATIVE
  36 # include <locale.h>
  37 #endif
  38
  39 #if defined __EMX__
  40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
  41 # ifndef OS2
  42 #  define OS2
  43 # endif
  44 #endif
  45
  46 #if !defined WINDOWS_NATIVE
  47 # if HAVE_LANGINFO_CODESET
  48 #  include <langinfo.h>
  49 # else
  50 #  if 0 /* see comment regarding use of setlocale(), below */
  51 #   include <locale.h>
  52 #  endif
  53 # endif
  54 # ifdef __CYGWIN__
  55 #  define WIN32_LEAN_AND_MEAN
  56 #  include <windows.h>
  57 # endif
  58 #elif defined WINDOWS_NATIVE
  59 # define WIN32_LEAN_AND_MEAN
  60 # include <windows.h>
  61 #endif
  62 #if defined OS2
  63 # define INCL_DOS
  64 # include <os2.h>
  65 #endif
  66
  67 /* For MB_CUR_MAX_L */
  68 #if defined DARWIN7
  69 # include <xlocale.h>
  70 #endif
  71
  72
  73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
  74
  75 /* On these platforms, we use a mapping from non-canonical encoding name
  76    to GNU canonical encoding name.  */
  77
  78 /* With glibc-2.1 or newer, we don't need any canonicalization,
  79    because glibc has iconv and both glibc and libiconv support all
  80    GNU canonical names directly.  */
  81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
  82
  83 struct table_entry /* One mapping from a non-canonical encoding name to its GNU canonical name.  */
  84 {
  85   const char alias[11+1];     /* Non-canonical (platform) encoding name: at most 11 characters plus the terminating NUL.  */
  86   const char canonical[11+1]; /* GNU canonical encoding name: at most 11 characters plus the terminating NUL.  */
  87 };
  88
  89 /* Table of platform-dependent alias -> canonical name mappings.  Within each platform section the entries are sorted in ascending strcmp order of the alias field, because locale_charset looks them up with a binary search.  */
  90 static const struct table_entry alias_table[] =
  91   {
  92 #  if defined __FreeBSD__                                   /* FreeBSD */
  93   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
  94     { "Big5",       "BIG5" },
  95     { "C",          "ASCII" },
  96   /*{ "CP1131",     "CP1131" },*/
  97   /*{ "CP1251",     "CP1251" },*/
  98   /*{ "CP866",      "CP866" },*/
  99   /*{ "GB18030",    "GB18030" },*/
 100   /*{ "GB2312",     "GB2312" },*/
 101   /*{ "GBK",        "GBK" },*/
 102   /*{ "ISCII-DEV",  "?" },*/
 103     { "ISO8859-1",  "ISO-8859-1" },
 104     { "ISO8859-13", "ISO-8859-13" },
 105     { "ISO8859-15", "ISO-8859-15" },
 106     { "ISO8859-2",  "ISO-8859-2" },
 107     { "ISO8859-5",  "ISO-8859-5" },
 108     { "ISO8859-7",  "ISO-8859-7" },
 109     { "ISO8859-9",  "ISO-8859-9" },
 110   /*{ "KOI8-R",     "KOI8-R" },*/
 111   /*{ "KOI8-U",     "KOI8-U" },*/
 112     { "SJIS",       "SHIFT_JIS" },
 113     { "US-ASCII",   "ASCII" },
 114     { "eucCN",      "GB2312" },
 115     { "eucJP",      "EUC-JP" },
 116     { "eucKR",      "EUC-KR" }
 117 #   define alias_table_defined
 118 #  endif
 119 #  if defined __NetBSD__                                    /* NetBSD */
 120     { "646",        "ASCII" },
 121   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
 122   /*{ "BIG5",       "BIG5" },*/
 123     { "Big5-HKSCS", "BIG5-HKSCS" },
 124   /*{ "CP1251",     "CP1251" },*/
 125   /*{ "CP866",      "CP866" },*/
 126   /*{ "GB18030",    "GB18030" },*/
 127   /*{ "GB2312",     "GB2312" },*/
 128     { "ISO8859-1",  "ISO-8859-1" },
 129     { "ISO8859-13", "ISO-8859-13" },
 130     { "ISO8859-15", "ISO-8859-15" },
 131     { "ISO8859-2",  "ISO-8859-2" },
 132     { "ISO8859-4",  "ISO-8859-4" },
 133     { "ISO8859-5",  "ISO-8859-5" },
 134     { "ISO8859-7",  "ISO-8859-7" },
 135   /*{ "KOI8-R",     "KOI8-R" },*/
 136   /*{ "KOI8-U",     "KOI8-U" },*/
 137   /*{ "PT154",      "PT154" },*/
 138     { "SJIS",       "SHIFT_JIS" },
 139     { "eucCN",      "GB2312" },
 140     { "eucJP",      "EUC-JP" },
 141     { "eucKR",      "EUC-KR" },
 142     { "eucTW",      "EUC-TW" }
 143 #   define alias_table_defined
 144 #  endif
 145 #  if defined __OpenBSD__                                   /* OpenBSD */
 146     { "646",        "ASCII" },
 147     { "ISO8859-1",  "ISO-8859-1" },
 148     { "ISO8859-13", "ISO-8859-13" },
 149     { "ISO8859-15", "ISO-8859-15" },
 150     { "ISO8859-2",  "ISO-8859-2" },
 151     { "ISO8859-4",  "ISO-8859-4" },
 152     { "ISO8859-5",  "ISO-8859-5" },
 153     { "ISO8859-7",  "ISO-8859-7" }
 154 #   define alias_table_defined
 155 #  endif
 156 #  if defined __APPLE__ && defined __MACH__                 /* Mac OS X */
 157     /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
 158        useless:
 159        - It returns the empty string when LANG is set to a locale of the
 160          form ll_CC, although ll_CC/LC_CTYPE is a symlink to a UTF-8
 161          LC_CTYPE file.
 162        - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
 163          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
 164        - The documentation says:
 165            "... all code that calls BSD system routines should ensure
 166             that the const *char parameters of these routines are in UTF-8
 167             encoding. All BSD system functions expect their string
 168             parameters to be in UTF-8 encoding and nothing else."
 169          It also says
 170            "An additional caveat is that string parameters for files,
 171             paths, and other file-system entities must be in canonical
 172             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
 173             characters are decomposed ..."
 174          but this is not true: You can pass non-decomposed UTF-8 strings
 175          to file system functions, and it is the OS which will convert
 176          them to decomposed UTF-8 before accessing the file system.
 177        - The Apple Terminal application displays UTF-8 by default.
 178        - However, other applications are free to use different encodings:
 179          - xterm uses ISO-8859-1 by default.
 180          - TextEdit uses MacRoman by default.
 181        We prefer UTF-8 over decomposed UTF-8-MAC because one should
 182        minimize the use of decomposed Unicode. Unfortunately, through the
 183        Darwin file system, decomposed UTF-8 strings are leaked into user
 184        space nevertheless.
 185        Then there are also the locales with encodings other than US-ASCII
 186        and UTF-8. These locales can be occasionally useful to users (e.g.
 187        when grepping through ISO-8859-1 encoded text files), when all their
 188        file names are in US-ASCII.
 189      */
 190     { "ARMSCII-8",  "ARMSCII-8" },
 191     { "Big5",       "BIG5" },
 192     { "Big5HKSCS",  "BIG5-HKSCS" },
 193     { "CP1131",     "CP1131" },
 194     { "CP1251",     "CP1251" },
 195     { "CP866",      "CP866" },
 196     { "CP949",      "CP949" },
 197     { "GB18030",    "GB18030" },
 198     { "GB2312",     "GB2312" },
 199     { "GBK",        "GBK" },
 200   /*{ "ISCII-DEV",  "?" },*/
 201     { "ISO8859-1",  "ISO-8859-1" },
 202     { "ISO8859-13", "ISO-8859-13" },
 203     { "ISO8859-15", "ISO-8859-15" },
 204     { "ISO8859-2",  "ISO-8859-2" },
 205     { "ISO8859-4",  "ISO-8859-4" },
 206     { "ISO8859-5",  "ISO-8859-5" },
 207     { "ISO8859-7",  "ISO-8859-7" },
 208     { "ISO8859-9",  "ISO-8859-9" },
 209     { "KOI8-R",     "KOI8-R" },
 210     { "KOI8-U",     "KOI8-U" },
 211     { "PT154",      "PT154" },
 212     { "SJIS",       "SHIFT_JIS" },
 213     { "eucCN",      "GB2312" },
 214     { "eucJP",      "EUC-JP" },
 215     { "eucKR",      "EUC-KR" }
 216 #   define alias_table_defined
 217 #  endif
 218 #  if defined _AIX                                          /* AIX */
 219   /*{ "GBK",        "GBK" },*/
 220     { "IBM-1046",   "CP1046" },
 221     { "IBM-1124",   "CP1124" },
 222     { "IBM-1129",   "CP1129" },
 223     { "IBM-1252",   "CP1252" },
 224     { "IBM-850",    "CP850" },
 225     { "IBM-856",    "CP856" },
 226     { "IBM-921",    "ISO-8859-13" },
 227     { "IBM-922",    "CP922" },
 228     { "IBM-932",    "CP932" },
 229     { "IBM-943",    "CP943" },
 230     { "IBM-eucCN",  "GB2312" },
 231     { "IBM-eucJP",  "EUC-JP" },
 232     { "IBM-eucKR",  "EUC-KR" },
 233     { "IBM-eucTW",  "EUC-TW" },
 234     { "ISO8859-1",  "ISO-8859-1" },
 235     { "ISO8859-15", "ISO-8859-15" },
 236     { "ISO8859-2",  "ISO-8859-2" },
 237     { "ISO8859-5",  "ISO-8859-5" },
 238     { "ISO8859-6",  "ISO-8859-6" },
 239     { "ISO8859-7",  "ISO-8859-7" },
 240     { "ISO8859-8",  "ISO-8859-8" },
 241     { "ISO8859-9",  "ISO-8859-9" },
 242     { "TIS-620",    "TIS-620" },
 243   /*{ "UTF-8",      "UTF-8" },*/
 244     { "big5",       "BIG5" }
 245 #   define alias_table_defined
 246 #  endif
 247 #  if defined __hpux                                        /* HP-UX */
 248     { "SJIS",      "SHIFT_JIS" },
 249     { "arabic8",   "HP-ARABIC8" },
 250     { "big5",      "BIG5" },
 251     { "cp1251",    "CP1251" },
 252     { "eucJP",     "EUC-JP" },
 253     { "eucKR",     "EUC-KR" },
 254     { "eucTW",     "EUC-TW" },
 255     { "gb18030",   "GB18030" },
 256     { "greek8",    "HP-GREEK8" },
 257     { "hebrew8",   "HP-HEBREW8" },
 258     { "hkbig5",    "BIG5-HKSCS" },
 259     { "hp15CN",    "GB2312" },
 260     { "iso88591",  "ISO-8859-1" },
 261     { "iso885913", "ISO-8859-13" },
 262     { "iso885915", "ISO-8859-15" },
 263     { "iso88592",  "ISO-8859-2" },
 264     { "iso88594",  "ISO-8859-4" },
 265     { "iso88595",  "ISO-8859-5" },
 266     { "iso88596",  "ISO-8859-6" },
 267     { "iso88597",  "ISO-8859-7" },
 268     { "iso88598",  "ISO-8859-8" },
 269     { "iso88599",  "ISO-8859-9" },
 270     { "kana8",     "HP-KANA8" },
 271     { "koi8r",     "KOI8-R" },
 272     { "roman8",    "HP-ROMAN8" },
 273     { "tis620",    "TIS-620" },
 274     { "turkish8",  "HP-TURKISH8" },
 275     { "utf8",      "UTF-8" }
 276 #   define alias_table_defined
 277 #  endif
 278 #  if defined __sgi                                         /* IRIX */
 279     { "ISO8859-1",  "ISO-8859-1" },
 280     { "ISO8859-15", "ISO-8859-15" },
 281     { "ISO8859-2",  "ISO-8859-2" },
 282     { "ISO8859-5",  "ISO-8859-5" },
 283     { "ISO8859-7",  "ISO-8859-7" },
 284     { "ISO8859-9",  "ISO-8859-9" },
 285     { "eucCN",      "GB2312" },
 286     { "eucJP",      "EUC-JP" },
 287     { "eucKR",      "EUC-KR" },
 288     { "eucTW",      "EUC-TW" }
 289 #   define alias_table_defined
 290 #  endif
 291 #  if defined __osf__                                       /* OSF/1 */
 292   /*{ "GBK",        "GBK" },*/
 293     { "ISO8859-1",  "ISO-8859-1" },
 294     { "ISO8859-15", "ISO-8859-15" },
 295     { "ISO8859-2",  "ISO-8859-2" },
 296     { "ISO8859-4",  "ISO-8859-4" },
 297     { "ISO8859-5",  "ISO-8859-5" },
 298     { "ISO8859-7",  "ISO-8859-7" },
 299     { "ISO8859-8",  "ISO-8859-8" },
 300     { "ISO8859-9",  "ISO-8859-9" },
 301     { "KSC5601",    "CP949" },
 302     { "SJIS",       "SHIFT_JIS" },
 303     { "TACTIS",     "TIS-620" },
 304   /*{ "UTF-8",      "UTF-8" },*/
 305     { "big5",       "BIG5" },
 306     { "cp850",      "CP850" },
 307     { "dechanyu",   "DEC-HANYU" },
 308     { "dechanzi",   "GB2312" },
 309     { "deckanji",   "DEC-KANJI" },
 310     { "deckorean",  "EUC-KR" },
 311     { "eucJP",      "EUC-JP" },
 312     { "eucKR",      "EUC-KR" },
 313     { "eucTW",      "EUC-TW" },
 314     { "sdeckanji",  "EUC-JP" }
 315 #   define alias_table_defined
 316 #  endif
 317 #  if defined __sun                                         /* Solaris */
 318     { "5601",        "EUC-KR" },
 319     { "646",         "ASCII" },
 320   /*{ "BIG5",        "BIG5" },*/
 321     { "Big5-HKSCS",  "BIG5-HKSCS" },
 322     { "GB18030",     "GB18030" },
 323   /*{ "GBK",         "GBK" },*/
 324     { "ISO8859-1",   "ISO-8859-1" },
 325     { "ISO8859-11",  "TIS-620" },
 326     { "ISO8859-13",  "ISO-8859-13" },
 327     { "ISO8859-15",  "ISO-8859-15" },
 328     { "ISO8859-2",   "ISO-8859-2" },
 329     { "ISO8859-3",   "ISO-8859-3" },
 330     { "ISO8859-4",   "ISO-8859-4" },
 331     { "ISO8859-5",   "ISO-8859-5" },
 332     { "ISO8859-6",   "ISO-8859-6" },
 333     { "ISO8859-7",   "ISO-8859-7" },
 334     { "ISO8859-8",   "ISO-8859-8" },
 335     { "ISO8859-9",   "ISO-8859-9" },
 336     { "PCK",         "SHIFT_JIS" },
 337     { "TIS620.2533", "TIS-620" },
 338   /*{ "UTF-8",       "UTF-8" },*/
 339     { "ansi-1251",   "CP1251" },
 340     { "cns11643",    "EUC-TW" },
 341     { "eucJP",       "EUC-JP" },
 342     { "gb2312",      "GB2312" },
 343     { "koi8-r",      "KOI8-R" }
 344 #   define alias_table_defined
 345 #  endif
 346 #  if defined __minix                                       /* Minix */
 347     { "646", "ASCII" }
 348 #   define alias_table_defined
 349 #  endif
 350 #  if defined WINDOWS_NATIVE || defined __CYGWIN__          /* Windows */
 351     { "CP1361",  "JOHAB" },
 352     { "CP20127", "ASCII" },
 353     { "CP20866", "KOI8-R" },
 354     { "CP20936", "GB2312" },
 355     { "CP21866", "KOI8-RU" },
 356     { "CP28591", "ISO-8859-1" },
 357     { "CP28592", "ISO-8859-2" },
 358     { "CP28593", "ISO-8859-3" },
 359     { "CP28594", "ISO-8859-4" },
 360     { "CP28595", "ISO-8859-5" },
 361     { "CP28596", "ISO-8859-6" },
 362     { "CP28597", "ISO-8859-7" },
 363     { "CP28598", "ISO-8859-8" },
 364     { "CP28599", "ISO-8859-9" },
 365     { "CP28605", "ISO-8859-15" },
 366     { "CP38598", "ISO-8859-8" },
 367     { "CP51932", "EUC-JP" },
 368     { "CP51936", "GB2312" },
 369     { "CP51949", "EUC-KR" },
 370     { "CP51950", "EUC-TW" },
 371     { "CP54936", "GB18030" },
 372     { "CP65001", "UTF-8" },
 373     { "CP936",   "GBK" }
 374 #   define alias_table_defined
 375 #  endif
 376 #  if defined OS2                                           /* OS/2 */
 377     /* The list of encodings is taken from "List of OS/2 Codepages"
 378        by Alex Taylor:
 379        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
 380        See also "IBM Globalization - Code page identifiers":
 381        <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>.  */
 382     { "CP1089", "ISO-8859-6" },
 383     { "CP1208", "UTF-8" },
 384     { "CP1381", "GB2312" },
 385     { "CP1386", "GBK" },
 386     { "CP3372", "EUC-JP" },
 387     { "CP813",  "ISO-8859-7" },
 388     { "CP819",  "ISO-8859-1" },
 389     { "CP878",  "KOI8-R" },
 390     { "CP912",  "ISO-8859-2" },
 391     { "CP913",  "ISO-8859-3" },
 392     { "CP914",  "ISO-8859-4" },
 393     { "CP915",  "ISO-8859-5" },
 394     { "CP916",  "ISO-8859-8" },
 395     { "CP920",  "ISO-8859-9" },
 396     { "CP921",  "ISO-8859-13" },
 397     { "CP923",  "ISO-8859-15" },
 398     { "CP954",  "EUC-JP" },
 399     { "CP964",  "EUC-TW" },
 400     { "CP970",  "EUC-KR" }
 401 #   define alias_table_defined
 402 #  endif
 403 #  if defined VMS                                           /* OpenVMS */
 404     /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
 405        "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
 406        section 10.7 "Handling Different Character Sets".  */
 407     { "DECHANYU",  "DEC-HANYU" },
 408     { "DECHANZI",  "GB2312" },
 409     { "DECKANJI",  "DEC-KANJI" },
 410     { "DECKOREAN", "EUC-KR" },
 411     { "ISO8859-1", "ISO-8859-1" },
 412     { "ISO8859-2", "ISO-8859-2" },
 413     { "ISO8859-5", "ISO-8859-5" },
 414     { "ISO8859-7", "ISO-8859-7" },
 415     { "ISO8859-8", "ISO-8859-8" },
 416     { "ISO8859-9", "ISO-8859-9" },
 417     { "SDECKANJI", "EUC-JP" },
 418     { "SJIS",      "SHIFT_JIS" },
 419     { "eucJP",     "EUC-JP" },
 420     { "eucTW",     "EUC-TW" }
 421 #   define alias_table_defined
 422 #  endif
 423 #  ifndef alias_table_defined
 424     /* No platform section above contributed entries: emit a single dummy entry, to avoid a C syntax error.  */
 425     { "", "" }
 426 #  endif
 427   };
 428
 429 # endif
 430
 431 #else
 432
 433 /* On these platforms, we use a mapping from locale name to GNU canonical
 434    encoding name.  */
 435
 436 struct table_entry /* One mapping from a locale name to a GNU canonical encoding name.  */
 437 {
 438   const char locale[17+1];    /* Locale name: at most 17 characters plus the terminating NUL.  */
 439   const char canonical[11+1]; /* GNU canonical encoding name: at most 11 characters plus the terminating NUL.  */
 440 };
 441
 442 /* Table of platform-dependent locale name -> canonical encoding name mappings.  Within each platform section the entries are sorted in ascending byte order of the locale field.  */
 443 static const struct table_entry locale_table[] =
 444   {
 445 # if defined __FreeBSD__                                    /* FreeBSD 4.2 */
 446     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
 447     { "da_DK.DIS_8859-15", "ISO-8859-15" },
 448     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
 449     { "de_AT.DIS_8859-15", "ISO-8859-15" },
 450     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
 451     { "de_CH.DIS_8859-15", "ISO-8859-15" },
 452     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
 453     { "de_DE.DIS_8859-15", "ISO-8859-15" },
 454     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
 455     { "en_AU.DIS_8859-15", "ISO-8859-15" },
 456     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
 457     { "en_CA.DIS_8859-15", "ISO-8859-15" },
 458     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
 459     { "en_GB.DIS_8859-15", "ISO-8859-15" },
 460     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
 461     { "en_US.DIS_8859-15", "ISO-8859-15" },
 462     { "en_US.ISO_8859-1",  "ISO-8859-1" },
 463     { "es_ES.DIS_8859-15", "ISO-8859-15" },
 464     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
 465     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
 466     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
 467     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
 468     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
 469     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
 470     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
 471     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
 472     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
 473     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
 474     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
 475     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
 476     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
 477     { "is_IS.DIS_8859-15", "ISO-8859-15" },
 478     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
 479     { "it_CH.DIS_8859-15", "ISO-8859-15" },
 480     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
 481     { "it_IT.DIS_8859-15", "ISO-8859-15" },
 482     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
 483     { "ja_JP.EUC",         "EUC-JP" },
 484     { "ja_JP.SJIS",        "SHIFT_JIS" },
 485     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
 486     { "ko_KR.EUC",         "EUC-KR" },
 487     { "la_LN.ASCII",       "ASCII" },
 488     { "la_LN.DIS_8859-15", "ISO-8859-15" },
 489     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
 490     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
 491     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
 492     { "lt_LN.ASCII",       "ASCII" },
 493     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
 494     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
 495     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
 496     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
 497     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
 498     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
 499     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
 500     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
 501     { "no_NO.DIS_8859-15", "ISO-8859-15" },
 502     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
 503     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
 504     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
 505     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
 506     { "ru_RU.CP866",       "CP866" },
 507     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
 508     { "ru_RU.KOI8-R",      "KOI8-R" },
 509     { "ru_SU.CP866",       "CP866" },
 510     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
 511     { "ru_SU.KOI8-R",      "KOI8-R" },
 512     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
 513     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
 514     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
 515     { "uk_UA.KOI8-U",      "KOI8-U" },
 516     { "zh_CN.EUC",         "GB2312" },
 517     { "zh_TW.BIG5",        "BIG5" },
 518     { "zh_TW.Big5",        "BIG5" }
 519 #  define locale_table_defined
 520 # endif
 521 # if defined __DJGPP__                                      /* DOS / DJGPP 2.03 */
 522     /* The encodings given here may not all be correct.
 523        If you find that the encoding given for your language and
 524        country is not the one your DOS machine actually uses, just
 525        correct it in this file, and send a mail to
 526        Juan Manuel Guerrero <juan.guerrero@gmx.de>
 527        and <bug-gnulib@gnu.org>.  */
 528     { "C",     "ASCII" },
 529     { "ar",    "CP864" },
 530     { "ar_AE", "CP864" },
 531     { "ar_DZ", "CP864" },
 532     { "ar_EG", "CP864" },
 533     { "ar_IQ", "CP864" },
 534     { "ar_IR", "CP864" },
 535     { "ar_JO", "CP864" },
 536     { "ar_KW", "CP864" },
 537     { "ar_MA", "CP864" },
 538     { "ar_OM", "CP864" },
 539     { "ar_QA", "CP864" },
 540     { "ar_SA", "CP864" },
 541     { "ar_SY", "CP864" },
 542     { "be",    "CP866" },
 543     { "be_BE", "CP866" },
 544     { "bg",    "CP866" }, /* not CP855 ?? */
 545     { "bg_BG", "CP866" }, /* not CP855 ?? */
 546     { "ca",    "CP850" },
 547     { "ca_ES", "CP850" },
 548     { "cs",    "CP852" },
 549     { "cs_CZ", "CP852" },
 550     { "da",    "CP865" }, /* not CP850 ?? */
 551     { "da_DK", "CP865" }, /* not CP850 ?? */
 552     { "de",    "CP850" },
 553     { "de_AT", "CP850" },
 554     { "de_CH", "CP850" },
 555     { "de_DE", "CP850" },
 556     { "el",    "CP869" },
 557     { "el_GR", "CP869" },
 558     { "en",    "CP850" },
 559     { "en_AU", "CP850" }, /* not CP437 ?? */
 560     { "en_CA", "CP850" },
 561     { "en_GB", "CP850" },
 562     { "en_NZ", "CP437" },
 563     { "en_US", "CP437" },
 564     { "en_ZA", "CP850" }, /* not CP437 ?? */
 565     { "eo",    "CP850" },
 566     { "eo_EO", "CP850" },
 567     { "es",    "CP850" },
 568     { "es_AR", "CP850" },
 569     { "es_BO", "CP850" },
 570     { "es_CL", "CP850" },
 571     { "es_CO", "CP850" },
 572     { "es_CR", "CP850" },
 573     { "es_CU", "CP850" },
 574     { "es_DO", "CP850" },
 575     { "es_EC", "CP850" },
 576     { "es_ES", "CP850" },
 577     { "es_GT", "CP850" },
 578     { "es_HN", "CP850" },
 579     { "es_MX", "CP850" },
 580     { "es_NI", "CP850" },
 581     { "es_PA", "CP850" },
 582     { "es_PE", "CP850" },
 583     { "es_PY", "CP850" },
 584     { "es_SV", "CP850" },
 585     { "es_UY", "CP850" },
 586     { "es_VE", "CP850" },
 587     { "et",    "CP850" },
 588     { "et_EE", "CP850" },
 589     { "eu",    "CP850" },
 590     { "eu_ES", "CP850" },
 591     { "fi",    "CP850" },
 592     { "fi_FI", "CP850" },
 593     { "fr",    "CP850" },
 594     { "fr_BE", "CP850" },
 595     { "fr_CA", "CP850" },
 596     { "fr_CH", "CP850" },
 597     { "fr_FR", "CP850" },
 598     { "ga",    "CP850" },
 599     { "ga_IE", "CP850" },
 600     { "gd",    "CP850" },
 601     { "gd_GB", "CP850" },
 602     { "gl",    "CP850" },
 603     { "gl_ES", "CP850" },
 604     { "he",    "CP862" },
 605     { "he_IL", "CP862" },
 606     { "hr",    "CP852" },
 607     { "hr_HR", "CP852" },
 608     { "hu",    "CP852" },
 609     { "hu_HU", "CP852" },
 610     { "id",    "CP850" }, /* not CP437 ?? */
 611     { "id_ID", "CP850" }, /* not CP437 ?? */
 612     { "is",    "CP861" }, /* not CP850 ?? */
 613     { "is_IS", "CP861" }, /* not CP850 ?? */
 614     { "it",    "CP850" },
 615     { "it_CH", "CP850" },
 616     { "it_IT", "CP850" },
 617     { "ja",    "CP932" },
 618     { "ja_JP", "CP932" },
 619     { "kr",    "CP949" }, /* not CP934 ?? */
 620     { "kr_KR", "CP949" }, /* not CP934 ?? */
 621     { "lt",    "CP775" },
 622     { "lt_LT", "CP775" },
 623     { "lv",    "CP775" },
 624     { "lv_LV", "CP775" },
 625     { "mk",    "CP866" }, /* not CP855 ?? */
 626     { "mk_MK", "CP866" }, /* not CP855 ?? */
 627     { "mt",    "CP850" },
 628     { "mt_MT", "CP850" },
 629     { "nb",    "CP865" }, /* not CP850 ?? */
 630     { "nb_NO", "CP865" }, /* not CP850 ?? */
 631     { "nl",    "CP850" },
 632     { "nl_BE", "CP850" },
 633     { "nl_NL", "CP850" },
 634     { "nn",    "CP865" }, /* not CP850 ?? */
 635     { "nn_NO", "CP865" }, /* not CP850 ?? */
 636     { "no",    "CP865" }, /* not CP850 ?? */
 637     { "no_NO", "CP865" }, /* not CP850 ?? */
 638     { "pl",    "CP852" },
 639     { "pl_PL", "CP852" },
 640     { "pt",    "CP850" },
 641     { "pt_BR", "CP850" },
 642     { "pt_PT", "CP850" },
 643     { "ro",    "CP852" },
 644     { "ro_RO", "CP852" },
 645     { "ru",    "CP866" },
 646     { "ru_RU", "CP866" },
 647     { "sk",    "CP852" },
 648     { "sk_SK", "CP852" },
 649     { "sl",    "CP852" },
 650     { "sl_SI", "CP852" },
 651     { "sq",    "CP852" },
 652     { "sq_AL", "CP852" },
 653     { "sr",    "CP852" }, /* CP852 or CP866 or CP855 ?? */
 654     { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
 655     { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
 656     { "sv",    "CP850" },
 657     { "sv_SE", "CP850" },
 658     { "th",    "CP874" },
 659     { "th_TH", "CP874" },
 660     { "tr",    "CP857" },
 661     { "tr_TR", "CP857" },
 662     { "uk",    "CP1125" },
 663     { "uk_UA", "CP1125" },
 664     { "zh_CN", "GBK" },
 665     { "zh_TW", "CP950" } /* not CP938 ?? */
 666 #  define locale_table_defined
 667 # endif
 668 # ifndef locale_table_defined
 669     /* No platform section above contributed entries: emit a single dummy entry, to avoid a C syntax error.  */
 670     { "", "" }
 671 # endif
 672   };
 673
 674 #endif
 675
 676
 677 /* Determine the current locale's character encoding, and canonicalize it
 678    into one of the canonical names listed in localcharset.h.
 679    The result must not be freed; it is statically allocated.
 680    If the canonical name cannot be determined, the result is a non-canonical
 681    name.  */
 682
 683 #ifdef STATIC
 684 STATIC
 685 #endif
 686 const char *
 687 locale_charset (void)
 688 {
 689   const char *codeset;
 690
 691 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
 692
 693 # if HAVE_LANGINFO_CODESET
 694
 695   /* Most systems support nl_langinfo (CODESET) nowadays.  */
 696   codeset = nl_langinfo (CODESET);
 697
 698 #  ifdef __CYGWIN__
 699   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
 700      returns "US-ASCII".  Return the suffix of the locale name from the
 701      environment variables (if present) or the codepage as a number.  */
 702   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
 703     {
 704       const char *locale;
 705       static char buf[2 + 10 + 1];
 706
 707       locale = getenv ("LC_ALL");
 708       if (locale == NULL || locale[0] == '\0')
 709         {
 710           locale = getenv ("LC_CTYPE");
 711           if (locale == NULL || locale[0] == '\0')
 712             locale = getenv ("LANG");
 713         }
 714       if (locale != NULL && locale[0] != '\0')
 715         {
 716           /* If the locale name contains an encoding after the dot, return
 717              it.  */
 718           const char *dot = strchr (locale, '.');
 719
 720           if (dot != NULL)
 721             {
 722               const char *modifier;
 723
 724               dot++;
 725               /* Look for the possible @... trailer and remove it, if any.  */
 726               modifier = strchr (dot, '@');
 727               if (modifier == NULL)
 728                 return dot;
 729               if (modifier - dot < sizeof (buf))
 730                 {
 731                   memcpy (buf, dot, modifier - dot);
 732                   buf [modifier - dot] = '\0';
 733                   return buf;
 734                 }
 735             }
 736         }
 737
 738       /* The Windows API has a function returning the locale's codepage as a
 739          number: GetACP().  This encoding is used by Cygwin, unless the user
 740          has set the environment variable CYGWIN=codepage:oem (which very few
 741          people do).
 742          Output directed to console windows needs to be converted (to
 743          GetOEMCP() if the console is using a raster font, or to
 744          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
 745          this conversion transparently (see winsup/cygwin/fhandler_console.cc),
 746          converting to GetConsoleOutputCP().  This leads to correct results,
 747          except when SetConsoleOutputCP has been called and a raster font is
 748          in use.  */
 749       sprintf (buf, "CP%u", GetACP ());
 750       codeset = buf;
 751     }
 752 #  endif
 753
 754   if (codeset == NULL)
 755     /* The canonical name cannot be determined.  */
 756     codeset = "";
 757
 758 # elif defined WINDOWS_NATIVE
 759
 760   static char buf[2 + 10 + 1];
 761
 762   /* The Windows API has a function returning the locale's codepage as
 763      a number, but the value doesn't change according to what the
 764      'setlocale' call specified.  So we use it as a last resort, in
 765      case the string returned by 'setlocale' doesn't specify the
 766      codepage.  */
 767   char *current_locale = setlocale (LC_ALL, NULL);
 768   char *pdot;
 769
 770   /* If they set different locales for different categories,
 771      'setlocale' will return a semi-colon separated list of locale
 772      values.  To make sure we use the correct one, we choose LC_CTYPE.  */
 773   if (strchr (current_locale, ';'))
 774     current_locale = setlocale (LC_CTYPE, NULL);
 775
 776   pdot = strrchr (current_locale, '.');
 777   if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
 778     sprintf (buf, "CP%s", pdot + 1);
 779   else
 780     {
 781       /* The Windows API has a function returning the locale's codepage as a
 782         number: GetACP().
 783         When the output goes to a console window, it needs to be provided in
 784         GetOEMCP() encoding if the console is using a raster font, or in
 785         GetConsoleOutputCP() encoding if it is using a TrueType font.
 786         But in GUI programs and for output sent to files and pipes, GetACP()
 787         encoding is the best bet.  */
 788       sprintf (buf, "CP%u", GetACP ());
 789     }
 790   codeset = buf;
 791
 792 # elif defined OS2
 793
 794   const char *locale;
 795   static char buf[2 + 10 + 1];
 796   ULONG cp[3];
 797   ULONG cplen;
 798
 799   codeset = NULL;
 800
 801   /* Allow user to override the codeset, as set in the operating system,
 802      with standard language environment variables.  */
 803   locale = getenv ("LC_ALL");
 804   if (locale == NULL || locale[0] == '\0')
 805     {
 806       locale = getenv ("LC_CTYPE");
 807       if (locale == NULL || locale[0] == '\0')
 808         locale = getenv ("LANG");
 809     }
 810   if (locale != NULL && locale[0] != '\0')
 811     {
 812       /* If the locale name contains an encoding after the dot, return it.  */
 813       const char *dot = strchr (locale, '.');
 814
 815       if (dot != NULL)
 816         {
 817           const char *modifier;
 818
 819           dot++;
 820           /* Look for the possible @... trailer and remove it, if any.  */
 821           modifier = strchr (dot, '@');
 822           if (modifier == NULL)
 823             return dot;
 824           if (modifier - dot < sizeof (buf))
 825             {
 826               memcpy (buf, dot, modifier - dot);
 827               buf [modifier - dot] = '\0';
 828               return buf;
 829             }
 830         }
 831
 832       /* For the POSIX locale, don't use the system's codepage.  */
 833       if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
 834         codeset = "";
 835     }
 836
 837   if (codeset == NULL)
 838     {
 839       /* OS/2 has a function returning the locale's codepage as a number.  */
 840       if (DosQueryCp (sizeof (cp), cp, &cplen))
 841         codeset = "";
 842       else
 843         {
 844           sprintf (buf, "CP%u", cp[0]);
 845           codeset = buf;
 846         }
 847     }
 848
 849 # else
 850
 851 #  error "Add code for other platforms here."
 852
 853 # endif
 854
 855   /* Resolve alias.  */
 856   {
 857 # ifdef alias_table_defined
 858     /* On some platforms, UTF-8 locales are the most frequently used ones.
 859        Speed up the common case and slow down the less common cases by
 860        testing for this case first.  */
 861 #  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
 862     if (strcmp (codeset, "UTF-8") == 0)
 863       goto done_table_lookup;
 864     else
 865 #  endif
 866       {
 867         const struct table_entry * const table = alias_table;
 868         size_t const table_size =
 869           sizeof (alias_table) / sizeof (struct table_entry);
 870         /* The table is sorted.  Perform a binary search.  */
 871         size_t hi = table_size;
 872         size_t lo = 0;
 873         while (lo < hi)
 874           {
 875             /* Invariant:
 876                for i < lo, strcmp (table[i].alias, codeset) < 0,
 877                for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
 878             size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
 879             int cmp = strcmp (table[mid].alias, codeset);
 880             if (cmp < 0)
 881               lo = mid + 1;
 882             else if (cmp > 0)
 883               hi = mid;
 884             else
 885               {
 886                 /* Found an i with
 887                      strcmp (table[i].alias, codeset) == 0.  */
 888                 codeset = table[mid].canonical;
 889                 goto done_table_lookup;
 890               }
 891           }
 892       }
 893     if (0)
 894       done_table_lookup: ;
 895     else
 896 # endif
 897       {
 898         /* Did not find it in the table.  */
 899         /* On Mac OS X, all modern locales use the UTF-8 encoding.
 900            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
 901 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
 902         codeset = "UTF-8";
 903 # else
 904         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
 905            the empty string as denoting "the locale's character encoding",
 906            thus GNU libiconv would call this function a second time.  */
 907         if (codeset[0] == '\0')
 908           codeset = "ASCII";
 909 # endif
 910       }
 911   }
 912
 913 #else
 914
 915   /* On old systems which lack it, use setlocale or getenv.  */
 916   const char *locale = NULL;
 917
 918   /* But most old systems don't have a complete set of locales.  Some
 919      (like DJGPP) have only the C locale.  Therefore we don't use setlocale
 920      here; it would return "C" when it doesn't support the locale name the
 921      user has set.  */
 922 # if 0
 923   locale = setlocale (LC_CTYPE, NULL);
 924 # endif
 925   if (locale == NULL || locale[0] == '\0')
 926     {
 927       locale = getenv ("LC_ALL");
 928       if (locale == NULL || locale[0] == '\0')
 929         {
 930           locale = getenv ("LC_CTYPE");
 931           if (locale == NULL || locale[0] == '\0')
 932             locale = getenv ("LANG");
 933             if (locale == NULL)
 934               locale = "";
 935         }
 936     }
 937
 938   /* Map locale name to canonical encoding name.  */
 939   {
 940 # ifdef locale_table_defined
 941     const struct table_entry * const table = locale_table;
 942     size_t const table_size =
 943       sizeof (locale_table) / sizeof (struct table_entry);
 944     /* The table is sorted.  Perform a binary search.  */
 945     size_t hi = table_size;
 946     size_t lo = 0;
 947     while (lo < hi)
 948       {
 949         /* Invariant:
 950            for i < lo, strcmp (table[i].locale, locale) < 0,
 951            for i >= hi, strcmp (table[i].locale, locale) > 0.  */
 952         size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
 953         int cmp = strcmp (table[mid].locale, locale);
 954         if (cmp < 0)
 955           lo = mid + 1;
 956         else if (cmp > 0)
 957           hi = mid;
 958         else
 959           {
 960             /* Found an i with
 961                  strcmp (table[i].locale, locale) == 0.  */
 962             codeset = table[mid].canonical;
 963             goto done_table_lookup;
 964           }
 965       }
 966     if (0)
 967       done_table_lookup: ;
 968     else
 969 # endif
 970       {
 971         /* Did not find it in the table.  */
 972         /* On Mac OS X, all modern locales use the UTF-8 encoding.
 973            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
 974 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
 975         codeset = "UTF-8";
 976 # else
 977         /* The canonical name cannot be determined.  */
 978         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
 979            the empty string as denoting "the locale's character encoding",
 980            thus GNU libiconv would call this function a second time.  */
 981         codeset = "ASCII";
 982 # endif
 983       }
 984   }
 985
 986 #endif
 987
 988 #ifdef DARWIN7
 989   /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
 990      (the default codeset) does not work when MB_CUR_MAX is 1.  */
 991   if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
 992     codeset = "ASCII";
 993 #endif
 994
 995   return codeset;
 996 }