fts: simplify fts_build
[gnulib.git] / lib / localcharset.c
blob6b4f5ae9a43ea0f5c6d82405a1e92009fb9f975e
1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2017 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, see <http://www.gnu.org/licenses/>. */
18 /* Written by Bruno Haible <bruno@clisp.org>. */
20 #include <config.h>
22 /* Specification. */
23 #include "localcharset.h"
25 #include <fcntl.h>
26 #include <stddef.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdlib.h>
31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
33 #endif
35 #if (defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__
36 # define WINDOWS_NATIVE
37 # include <locale.h>
38 #endif
40 #if defined __EMX__
41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
42 # ifndef OS2
43 # define OS2
44 # endif
45 #endif
47 #if !defined WINDOWS_NATIVE
48 # include <unistd.h>
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
51 # else
52 # if 0 /* see comment below */
53 # include <locale.h>
54 # endif
55 # endif
56 # ifdef __CYGWIN__
57 # define WIN32_LEAN_AND_MEAN
58 # include <windows.h>
59 # endif
60 #elif defined WINDOWS_NATIVE
61 # define WIN32_LEAN_AND_MEAN
62 # include <windows.h>
63 #endif
64 #if defined OS2
65 # define INCL_DOS
66 # include <os2.h>
67 #endif
69 /* For MB_CUR_MAX_L */
70 #if defined DARWIN7
71 # include <xlocale.h>
72 #endif
74 #if ENABLE_RELOCATABLE
75 # include "relocatable.h"
76 #else
77 # define relocate(pathname) (pathname)
78 # define relocate2(pathname,allocatedp) (*(allocatedp) = NULL, (pathname))
79 #endif
81 /* Get LIBDIR. */
82 #ifndef LIBDIR
83 # include "configmake.h"
84 #endif
/* Platforms that lack O_NOFOLLOW get the value 0, so that OR-ing it into
   the flags passed to open() has no effect there.  */
#if !defined O_NOFOLLOW
# define O_NOFOLLOW 0
#endif

/* ISSLASH (C) is true if the character C separates file name components.
   On native Windows, Cygwin, OS/2 and DOS both '/' and '\\' do.  */
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  /* Native Windows, Cygwin, OS/2, DOS */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

/* The character inserted between a directory name and a file name, unless
   the including environment has already chosen one.  */
#if !defined DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

/* Everywhere else, only DIRECTORY_SEPARATOR counts as a slash.  */
#if !defined ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Read characters through getc_unlocked when it is declared.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif

/* The following static variable is declared 'volatile' to avoid a
   possible multithread problem in the function get_charset_aliases.  If we
   are running in a threaded environment, and if two threads initialize
   'charset_aliases' simultaneously, both will produce the same value,
   and everything will be ok if the two assignments to 'charset_aliases'
   are atomic.  But I don't know what will happen if the two assignments
   mix.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases;
123 /* Return a pointer to the contents of the charset.alias file. */
124 static const char *
125 get_charset_aliases (void)
127 const char *cp;
129 cp = charset_aliases;
130 if (cp == NULL)
132 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__ || defined OS2)
133 char *malloc_dir = NULL;
134 const char *dir;
135 const char *base = "charset.alias";
136 char *file_name;
138 /* Make it possible to override the charset.alias location. This is
139 necessary for running the testsuite before "make install". */
140 dir = getenv ("CHARSETALIASDIR");
141 if (dir == NULL || dir[0] == '\0')
142 dir = relocate2 (LIBDIR, &malloc_dir);
144 /* Concatenate dir and base into freshly allocated file_name. */
146 size_t dir_len = strlen (dir);
147 size_t base_len = strlen (base);
148 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
149 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
150 if (file_name != NULL)
152 memcpy (file_name, dir, dir_len);
153 if (add_slash)
154 file_name[dir_len] = DIRECTORY_SEPARATOR;
155 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
159 free (malloc_dir);
161 if (file_name == NULL)
162 /* Out of memory. Treat the file as empty. */
163 cp = "";
164 else
166 int fd;
168 /* Open the file. Reject symbolic links on platforms that support
169 O_NOFOLLOW. This is a security feature. Without it, an attacker
170 could retrieve parts of the contents (namely, the tail of the
171 first line that starts with "* ") of an arbitrary file by placing
172 a symbolic link to that file under the name "charset.alias" in
173 some writable directory and defining the environment variable
174 CHARSETALIASDIR to point to that directory. */
175 fd = open (file_name,
176 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
177 if (fd < 0)
178 /* File not found. Treat it as empty. */
179 cp = "";
180 else
182 FILE *fp;
184 fp = fdopen (fd, "r");
185 if (fp == NULL)
187 /* Out of memory. Treat the file as empty. */
188 close (fd);
189 cp = "";
191 else
193 /* Parse the file's contents. */
194 char *res_ptr = NULL;
195 size_t res_size = 0;
197 for (;;)
199 int c;
200 char buf1[50+1];
201 char buf2[50+1];
202 size_t l1, l2;
203 char *old_res_ptr;
205 c = getc (fp);
206 if (c == EOF)
207 break;
208 if (c == '\n' || c == ' ' || c == '\t')
209 continue;
210 if (c == '#')
212 /* Skip comment, to end of line. */
214 c = getc (fp);
215 while (!(c == EOF || c == '\n'));
216 if (c == EOF)
217 break;
218 continue;
220 ungetc (c, fp);
221 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
222 break;
223 l1 = strlen (buf1);
224 l2 = strlen (buf2);
225 old_res_ptr = res_ptr;
226 if (res_size == 0)
228 res_size = l1 + 1 + l2 + 1;
229 res_ptr = (char *) malloc (res_size + 1);
231 else
233 res_size += l1 + 1 + l2 + 1;
234 res_ptr = (char *) realloc (res_ptr, res_size + 1);
236 if (res_ptr == NULL)
238 /* Out of memory. */
239 res_size = 0;
240 free (old_res_ptr);
241 break;
243 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
244 strcpy (res_ptr + res_size - (l2 + 1), buf2);
246 fclose (fp);
247 if (res_size == 0)
248 cp = "";
249 else
251 *(res_ptr + res_size) = '\0';
252 cp = res_ptr;
257 free (file_name);
260 #else
262 # if defined DARWIN7
263 /* To avoid the trouble of installing a file that is shared by many
264 GNU packages -- many packaging systems have problems with this --,
265 simply inline the aliases here. */
266 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
267 "ISO8859-2" "\0" "ISO-8859-2" "\0"
268 "ISO8859-4" "\0" "ISO-8859-4" "\0"
269 "ISO8859-5" "\0" "ISO-8859-5" "\0"
270 "ISO8859-7" "\0" "ISO-8859-7" "\0"
271 "ISO8859-9" "\0" "ISO-8859-9" "\0"
272 "ISO8859-13" "\0" "ISO-8859-13" "\0"
273 "ISO8859-15" "\0" "ISO-8859-15" "\0"
274 "KOI8-R" "\0" "KOI8-R" "\0"
275 "KOI8-U" "\0" "KOI8-U" "\0"
276 "CP866" "\0" "CP866" "\0"
277 "CP949" "\0" "CP949" "\0"
278 "CP1131" "\0" "CP1131" "\0"
279 "CP1251" "\0" "CP1251" "\0"
280 "eucCN" "\0" "GB2312" "\0"
281 "GB2312" "\0" "GB2312" "\0"
282 "eucJP" "\0" "EUC-JP" "\0"
283 "eucKR" "\0" "EUC-KR" "\0"
284 "Big5" "\0" "BIG5" "\0"
285 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
286 "GBK" "\0" "GBK" "\0"
287 "GB18030" "\0" "GB18030" "\0"
288 "SJIS" "\0" "SHIFT_JIS" "\0"
289 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
290 "PT154" "\0" "PT154" "\0"
291 /*"ISCII-DEV" "\0" "?" "\0"*/
292 "*" "\0" "UTF-8" "\0";
293 # endif
295 # if defined VMS
296 /* To avoid the troubles of an extra file charset.alias_vms in the
297 sources of many GNU packages, simply inline the aliases here. */
298 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
299 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
300 section 10.7 "Handling Different Character Sets". */
301 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
302 "ISO8859-2" "\0" "ISO-8859-2" "\0"
303 "ISO8859-5" "\0" "ISO-8859-5" "\0"
304 "ISO8859-7" "\0" "ISO-8859-7" "\0"
305 "ISO8859-8" "\0" "ISO-8859-8" "\0"
306 "ISO8859-9" "\0" "ISO-8859-9" "\0"
307 /* Japanese */
308 "eucJP" "\0" "EUC-JP" "\0"
309 "SJIS" "\0" "SHIFT_JIS" "\0"
310 "DECKANJI" "\0" "DEC-KANJI" "\0"
311 "SDECKANJI" "\0" "EUC-JP" "\0"
312 /* Chinese */
313 "eucTW" "\0" "EUC-TW" "\0"
314 "DECHANYU" "\0" "DEC-HANYU" "\0"
315 "DECHANZI" "\0" "GB2312" "\0"
316 /* Korean */
317 "DECKOREAN" "\0" "EUC-KR" "\0";
318 # endif
320 # if defined WINDOWS_NATIVE || defined __CYGWIN__
321 /* To avoid the troubles of installing a separate file in the same
322 directory as the DLL and of retrieving the DLL's directory at
323 runtime, simply inline the aliases here. */
325 cp = "CP936" "\0" "GBK" "\0"
326 "CP1361" "\0" "JOHAB" "\0"
327 "CP20127" "\0" "ASCII" "\0"
328 "CP20866" "\0" "KOI8-R" "\0"
329 "CP20936" "\0" "GB2312" "\0"
330 "CP21866" "\0" "KOI8-RU" "\0"
331 "CP28591" "\0" "ISO-8859-1" "\0"
332 "CP28592" "\0" "ISO-8859-2" "\0"
333 "CP28593" "\0" "ISO-8859-3" "\0"
334 "CP28594" "\0" "ISO-8859-4" "\0"
335 "CP28595" "\0" "ISO-8859-5" "\0"
336 "CP28596" "\0" "ISO-8859-6" "\0"
337 "CP28597" "\0" "ISO-8859-7" "\0"
338 "CP28598" "\0" "ISO-8859-8" "\0"
339 "CP28599" "\0" "ISO-8859-9" "\0"
340 "CP28605" "\0" "ISO-8859-15" "\0"
341 "CP38598" "\0" "ISO-8859-8" "\0"
342 "CP51932" "\0" "EUC-JP" "\0"
343 "CP51936" "\0" "GB2312" "\0"
344 "CP51949" "\0" "EUC-KR" "\0"
345 "CP51950" "\0" "EUC-TW" "\0"
346 "CP54936" "\0" "GB18030" "\0"
347 "CP65001" "\0" "UTF-8" "\0";
348 # endif
349 # if defined OS2
350 /* To avoid the troubles of installing a separate file in the same
351 directory as the DLL and of retrieving the DLL's directory at
352 runtime, simply inline the aliases here. */
354 /* The list of encodings is taken from "List of OS/2 Codepages"
355 by Alex Taylor:
356 <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
357 See also "IBM Globalization - Code page identifiers":
358 <http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
359 cp = "CP813" "\0" "ISO-8859-7" "\0"
360 "CP878" "\0" "KOI8-R" "\0"
361 "CP819" "\0" "ISO-8859-1" "\0"
362 "CP912" "\0" "ISO-8859-2" "\0"
363 "CP913" "\0" "ISO-8859-3" "\0"
364 "CP914" "\0" "ISO-8859-4" "\0"
365 "CP915" "\0" "ISO-8859-5" "\0"
366 "CP916" "\0" "ISO-8859-8" "\0"
367 "CP920" "\0" "ISO-8859-9" "\0"
368 "CP921" "\0" "ISO-8859-13" "\0"
369 "CP923" "\0" "ISO-8859-15" "\0"
370 "CP954" "\0" "EUC-JP" "\0"
371 "CP964" "\0" "EUC-TW" "\0"
372 "CP970" "\0" "EUC-KR" "\0"
373 "CP1089" "\0" "ISO-8859-6" "\0"
374 "CP1208" "\0" "UTF-8" "\0"
375 "CP1381" "\0" "GB2312" "\0"
376 "CP1386" "\0" "GBK" "\0"
377 "CP3372" "\0" "EUC-JP" "\0";
378 # endif
379 #endif
381 charset_aliases = cp;
384 return cp;
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.

   The result is never NULL and never the empty string: an encoding that
   cannot be determined is reported as "ASCII".  On Cygwin and OS/2, a
   non-empty encoding that is spelled out in LC_ALL, LC_CTYPE or LANG
   (the part after the '.', without any '@modifier') is returned as is,
   without alias resolution.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.
                 An empty encoding ("xx_YY." or "xx_YY.@mod") is treated
                 like a missing one, so that this function never returns
                 an empty string.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier != dot
                       && (size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  pdot = strrchr (current_locale, '.');
  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }

  /* Newer C runtimes report a UTF-8 locale as e.g. "French_France.utf8"
     rather than "French_France.65001".  "CPutf8" is not an encoding name,
     so map both spellings to "UTF-8" directly.  */
  if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.
             An empty encoding ("xx_YY." or "xx_YY.@mod") is treated like
             a missing one, so that this function never returns an empty
             string.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier != dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it so that it matches "%u".  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical name)
     string pairs ended by an empty string; the alias "*" matches any
     codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}