Installer: Fix association of .sh files due to a GUID typo
[msysgit.git] / mingw / share / gettext / intl / localcharset.c
blobcbcdaceb69aa55959041f2d8e5f41f020d9c4394
/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
   USA.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */

22 #include <config.h>
24 /* Specification. */
25 #include "localcharset.h"
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdlib.h>
33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35 #endif
37 #if defined _WIN32 || defined __WIN32__
38 # define WIN32_NATIVE
39 #endif
41 #if defined __EMX__
42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
43 # ifndef OS2
44 # define OS2
45 # endif
46 #endif
48 #if !defined WIN32_NATIVE
49 # include <unistd.h>
50 # if HAVE_LANGINFO_CODESET
51 # include <langinfo.h>
52 # else
53 # if 0 /* see comment below */
54 # include <locale.h>
55 # endif
56 # endif
57 # ifdef __CYGWIN__
58 # define WIN32_LEAN_AND_MEAN
59 # include <windows.h>
60 # endif
61 #elif defined WIN32_NATIVE
62 # define WIN32_LEAN_AND_MEAN
63 # include <windows.h>
64 #endif
65 #if defined OS2
66 # define INCL_DOS
67 # include <os2.h>
68 #endif
70 #if ENABLE_RELOCATABLE
71 # include "relocatable.h"
72 #else
73 # define relocate(pathname) (pathname)
74 #endif
76 /* Get LIBDIR. */
77 #ifndef LIBDIR
78 # include "configmake.h"
79 #endif
81 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
82 #ifndef O_NOFOLLOW
83 # define O_NOFOLLOW 0
84 #endif
86 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
87 /* Win32, Cygwin, OS/2, DOS */
88 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
89 #endif
91 #ifndef DIRECTORY_SEPARATOR
92 # define DIRECTORY_SEPARATOR '/'
93 #endif
95 #ifndef ISSLASH
96 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
97 #endif
99 #if HAVE_DECL_GETC_UNLOCKED
100 # undef getc
101 # define getc getc_unlocked
102 #endif
104 /* The following static variable is declared 'volatile' to avoid a
105 possible multithread problem in the function get_charset_aliases. If we
106 are running in a threaded environment, and if two threads initialize
107 'charset_aliases' simultaneously, both will produce the same value,
108 and everything will be ok if the two assignments to 'charset_aliases'
109 are atomic. But I don't know what will happen if the two assignments mix. */
110 #if __STDC__ != 1
111 # define volatile /* empty */
112 #endif
113 /* Pointer to the contents of the charset.alias file, if it has already been
114 read, else NULL. Its format is:
115 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
116 static const char * volatile charset_aliases;
118 /* Return a pointer to the contents of the charset.alias file. */
119 static const char *
120 get_charset_aliases (void)
122 const char *cp;
124 cp = charset_aliases;
125 if (cp == NULL)
127 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
128 const char *dir;
129 const char *base = "charset.alias";
130 char *file_name;
132 /* Make it possible to override the charset.alias location. This is
133 necessary for running the testsuite before "make install". */
134 dir = getenv ("CHARSETALIASDIR");
135 if (dir == NULL || dir[0] == '\0')
136 dir = relocate (LIBDIR);
138 /* Concatenate dir and base into freshly allocated file_name. */
140 size_t dir_len = strlen (dir);
141 size_t base_len = strlen (base);
142 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
143 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
144 if (file_name != NULL)
146 memcpy (file_name, dir, dir_len);
147 if (add_slash)
148 file_name[dir_len] = DIRECTORY_SEPARATOR;
149 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
153 if (file_name == NULL)
154 /* Out of memory. Treat the file as empty. */
155 cp = "";
156 else
158 int fd;
160 /* Open the file. Reject symbolic links on platforms that support
161 O_NOFOLLOW. This is a security feature. Without it, an attacker
162 could retrieve parts of the contents (namely, the tail of the
163 first line that starts with "* ") of an arbitrary file by placing
164 a symbolic link to that file under the name "charset.alias" in
165 some writable directory and defining the environment variable
166 CHARSETALIASDIR to point to that directory. */
167 fd = open (file_name,
168 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
169 if (fd < 0)
170 /* File not found. Treat it as empty. */
171 cp = "";
172 else
174 FILE *fp;
176 fp = fdopen (fd, "r");
177 if (fp == NULL)
179 /* Out of memory. Treat the file as empty. */
180 close (fd);
181 cp = "";
183 else
185 /* Parse the file's contents. */
186 char *res_ptr = NULL;
187 size_t res_size = 0;
189 for (;;)
191 int c;
192 char buf1[50+1];
193 char buf2[50+1];
194 size_t l1, l2;
195 char *old_res_ptr;
197 c = getc (fp);
198 if (c == EOF)
199 break;
200 if (c == '\n' || c == ' ' || c == '\t')
201 continue;
202 if (c == '#')
204 /* Skip comment, to end of line. */
206 c = getc (fp);
207 while (!(c == EOF || c == '\n'));
208 if (c == EOF)
209 break;
210 continue;
212 ungetc (c, fp);
213 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
214 break;
215 l1 = strlen (buf1);
216 l2 = strlen (buf2);
217 old_res_ptr = res_ptr;
218 if (res_size == 0)
220 res_size = l1 + 1 + l2 + 1;
221 res_ptr = (char *) malloc (res_size + 1);
223 else
225 res_size += l1 + 1 + l2 + 1;
226 res_ptr = (char *) realloc (res_ptr, res_size + 1);
228 if (res_ptr == NULL)
230 /* Out of memory. */
231 res_size = 0;
232 if (old_res_ptr != NULL)
233 free (old_res_ptr);
234 break;
236 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
237 strcpy (res_ptr + res_size - (l2 + 1), buf2);
239 fclose (fp);
240 if (res_size == 0)
241 cp = "";
242 else
244 *(res_ptr + res_size) = '\0';
245 cp = res_ptr;
250 free (file_name);
253 #else
255 # if defined DARWIN7
256 /* To avoid the trouble of installing a file that is shared by many
257 GNU packages -- many packaging systems have problems with this --,
258 simply inline the aliases here. */
259 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
260 "ISO8859-2" "\0" "ISO-8859-2" "\0"
261 "ISO8859-4" "\0" "ISO-8859-4" "\0"
262 "ISO8859-5" "\0" "ISO-8859-5" "\0"
263 "ISO8859-7" "\0" "ISO-8859-7" "\0"
264 "ISO8859-9" "\0" "ISO-8859-9" "\0"
265 "ISO8859-13" "\0" "ISO-8859-13" "\0"
266 "ISO8859-15" "\0" "ISO-8859-15" "\0"
267 "KOI8-R" "\0" "KOI8-R" "\0"
268 "KOI8-U" "\0" "KOI8-U" "\0"
269 "CP866" "\0" "CP866" "\0"
270 "CP949" "\0" "CP949" "\0"
271 "CP1131" "\0" "CP1131" "\0"
272 "CP1251" "\0" "CP1251" "\0"
273 "eucCN" "\0" "GB2312" "\0"
274 "GB2312" "\0" "GB2312" "\0"
275 "eucJP" "\0" "EUC-JP" "\0"
276 "eucKR" "\0" "EUC-KR" "\0"
277 "Big5" "\0" "BIG5" "\0"
278 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
279 "GBK" "\0" "GBK" "\0"
280 "GB18030" "\0" "GB18030" "\0"
281 "SJIS" "\0" "SHIFT_JIS" "\0"
282 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
283 "PT154" "\0" "PT154" "\0"
284 /*"ISCII-DEV" "\0" "?" "\0"*/
285 "*" "\0" "UTF-8" "\0";
286 # endif
288 # if defined VMS
289 /* To avoid the troubles of an extra file charset.alias_vms in the
290 sources of many GNU packages, simply inline the aliases here. */
291 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
292 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
293 section 10.7 "Handling Different Character Sets". */
294 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
295 "ISO8859-2" "\0" "ISO-8859-2" "\0"
296 "ISO8859-5" "\0" "ISO-8859-5" "\0"
297 "ISO8859-7" "\0" "ISO-8859-7" "\0"
298 "ISO8859-8" "\0" "ISO-8859-8" "\0"
299 "ISO8859-9" "\0" "ISO-8859-9" "\0"
300 /* Japanese */
301 "eucJP" "\0" "EUC-JP" "\0"
302 "SJIS" "\0" "SHIFT_JIS" "\0"
303 "DECKANJI" "\0" "DEC-KANJI" "\0"
304 "SDECKANJI" "\0" "EUC-JP" "\0"
305 /* Chinese */
306 "eucTW" "\0" "EUC-TW" "\0"
307 "DECHANYU" "\0" "DEC-HANYU" "\0"
308 "DECHANZI" "\0" "GB2312" "\0"
309 /* Korean */
310 "DECKOREAN" "\0" "EUC-KR" "\0";
311 # endif
313 # if defined WIN32_NATIVE || defined __CYGWIN__
314 /* To avoid the troubles of installing a separate file in the same
315 directory as the DLL and of retrieving the DLL's directory at
316 runtime, simply inline the aliases here. */
318 cp = "CP936" "\0" "GBK" "\0"
319 "CP1361" "\0" "JOHAB" "\0"
320 "CP20127" "\0" "ASCII" "\0"
321 "CP20866" "\0" "KOI8-R" "\0"
322 "CP20936" "\0" "GB2312" "\0"
323 "CP21866" "\0" "KOI8-RU" "\0"
324 "CP28591" "\0" "ISO-8859-1" "\0"
325 "CP28592" "\0" "ISO-8859-2" "\0"
326 "CP28593" "\0" "ISO-8859-3" "\0"
327 "CP28594" "\0" "ISO-8859-4" "\0"
328 "CP28595" "\0" "ISO-8859-5" "\0"
329 "CP28596" "\0" "ISO-8859-6" "\0"
330 "CP28597" "\0" "ISO-8859-7" "\0"
331 "CP28598" "\0" "ISO-8859-8" "\0"
332 "CP28599" "\0" "ISO-8859-9" "\0"
333 "CP28605" "\0" "ISO-8859-15" "\0"
334 "CP38598" "\0" "ISO-8859-8" "\0"
335 "CP51932" "\0" "EUC-JP" "\0"
336 "CP51936" "\0" "GB2312" "\0"
337 "CP51949" "\0" "EUC-KR" "\0"
338 "CP51950" "\0" "EUC-TW" "\0"
339 "CP54936" "\0" "GB18030" "\0"
340 "CP65001" "\0" "UTF-8" "\0";
341 # endif
342 #endif
344 charset_aliases = cp;
347 return cp;
350 /* Determine the current locale's character encoding, and canonicalize it
351 into one of the canonical names listed in config.charset.
352 The result must not be freed; it is statically allocated.
353 If the canonical name cannot be determined, the result is a non-canonical
354 name. */
356 #ifdef STATIC
357 STATIC
358 #endif
359 const char *
360 locale_charset (void)
362 const char *codeset;
363 const char *aliases;
365 #if !(defined WIN32_NATIVE || defined OS2)
367 # if HAVE_LANGINFO_CODESET
369 /* Most systems support nl_langinfo (CODESET) nowadays. */
370 codeset = nl_langinfo (CODESET);
372 # ifdef __CYGWIN__
373 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
374 returns "US-ASCII". Return the suffix of the locale name from the
375 environment variables (if present) or the codepage as a number. */
376 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
378 const char *locale;
379 static char buf[2 + 10 + 1];
381 locale = getenv ("LC_ALL");
382 if (locale == NULL || locale[0] == '\0')
384 locale = getenv ("LC_CTYPE");
385 if (locale == NULL || locale[0] == '\0')
386 locale = getenv ("LANG");
388 if (locale != NULL && locale[0] != '\0')
390 /* If the locale name contains an encoding after the dot, return
391 it. */
392 const char *dot = strchr (locale, '.');
394 if (dot != NULL)
396 const char *modifier;
398 dot++;
399 /* Look for the possible @... trailer and remove it, if any. */
400 modifier = strchr (dot, '@');
401 if (modifier == NULL)
402 return dot;
403 if (modifier - dot < sizeof (buf))
405 memcpy (buf, dot, modifier - dot);
406 buf [modifier - dot] = '\0';
407 return buf;
412 /* Woe32 has a function returning the locale's codepage as a number:
413 GetACP(). This encoding is used by Cygwin, unless the user has set
414 the environment variable CYGWIN=codepage:oem (which very few people
415 do).
416 Output directed to console windows needs to be converted (to
417 GetOEMCP() if the console is using a raster font, or to
418 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
419 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
420 converting to GetConsoleOutputCP(). This leads to correct results,
421 except when SetConsoleOutputCP has been called and a raster font is
422 in use. */
423 sprintf (buf, "CP%u", GetACP ());
424 codeset = buf;
426 # endif
428 # else
430 /* On old systems which lack it, use setlocale or getenv. */
431 const char *locale = NULL;
433 /* But most old systems don't have a complete set of locales. Some
434 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
435 use setlocale here; it would return "C" when it doesn't support the
436 locale name the user has set. */
437 # if 0
438 locale = setlocale (LC_CTYPE, NULL);
439 # endif
440 if (locale == NULL || locale[0] == '\0')
442 locale = getenv ("LC_ALL");
443 if (locale == NULL || locale[0] == '\0')
445 locale = getenv ("LC_CTYPE");
446 if (locale == NULL || locale[0] == '\0')
447 locale = getenv ("LANG");
451 /* On some old systems, one used to set locale = "iso8859_1". On others,
452 you set it to "language_COUNTRY.charset". In any case, we resolve it
453 through the charset.alias file. */
454 codeset = locale;
456 # endif
458 #elif defined WIN32_NATIVE
460 static char buf[2 + 10 + 1];
462 /* Woe32 has a function returning the locale's codepage as a number:
463 GetACP().
464 When the output goes to a console window, it needs to be provided in
465 GetOEMCP() encoding if the console is using a raster font, or in
466 GetConsoleOutputCP() encoding if it is using a TrueType font.
467 But in GUI programs and for output sent to files and pipes, GetACP()
468 encoding is the best bet. */
469 sprintf (buf, "CP%u", GetACP ());
470 codeset = buf;
472 #elif defined OS2
474 const char *locale;
475 static char buf[2 + 10 + 1];
476 ULONG cp[3];
477 ULONG cplen;
479 /* Allow user to override the codeset, as set in the operating system,
480 with standard language environment variables. */
481 locale = getenv ("LC_ALL");
482 if (locale == NULL || locale[0] == '\0')
484 locale = getenv ("LC_CTYPE");
485 if (locale == NULL || locale[0] == '\0')
486 locale = getenv ("LANG");
488 if (locale != NULL && locale[0] != '\0')
490 /* If the locale name contains an encoding after the dot, return it. */
491 const char *dot = strchr (locale, '.');
493 if (dot != NULL)
495 const char *modifier;
497 dot++;
498 /* Look for the possible @... trailer and remove it, if any. */
499 modifier = strchr (dot, '@');
500 if (modifier == NULL)
501 return dot;
502 if (modifier - dot < sizeof (buf))
504 memcpy (buf, dot, modifier - dot);
505 buf [modifier - dot] = '\0';
506 return buf;
510 /* Resolve through the charset.alias file. */
511 codeset = locale;
513 else
515 /* OS/2 has a function returning the locale's codepage as a number. */
516 if (DosQueryCp (sizeof (cp), cp, &cplen))
517 codeset = "";
518 else
520 sprintf (buf, "CP%u", cp[0]);
521 codeset = buf;
525 #endif
527 if (codeset == NULL)
528 /* The canonical name cannot be determined. */
529 codeset = "";
531 /* Resolve alias. */
532 for (aliases = get_charset_aliases ();
533 *aliases != '\0';
534 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
535 if (strcmp (codeset, aliases) == 0
536 || (aliases[0] == '*' && aliases[1] == '\0'))
538 codeset = aliases + strlen (aliases) + 1;
539 break;
542 /* Don't return an empty string. GNU libc and GNU libiconv interpret
543 the empty string as denoting "the locale's character encoding",
544 thus GNU libiconv would call this function a second time. */
545 if (codeset[0] == '\0')
546 codeset = "ASCII";
548 return codeset;