1 /*****************************************************************************
2 * charset.c: Determine a canonical name for the current locale's character encoding.
3 *****************************************************************************
4 * Copyright (C) 2003 VideoLAN
5 * $Id: charset.c,v 1.2 2003/08/23 22:19:07 fenrir Exp $
7 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
9 * vlc_current_charset() an adaption of mp_locale_charset():
11 * Copyright (C) 2001-2003 The Mape Project
12 * Written by Karel Zak <zakkr@zf.jcu.cz>.
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
27 *****************************************************************************/
34 # if HAVE_LANGINFO_CODESET
35 # include <langinfo.h>
/**
 * One row of a charset alias table.
 *
 * Both members only ever point at string literals in this file, so they are
 * declared const to keep the literals from being written through.
 */
typedef struct VLCCharsetAlias
{
    const char *psz_alias; /* name that is matched against the lookup key */
    const char *psz_name;  /* name the alias stands for */
} VLCCharsetAlias;
/*
 * libcharset loads all of its aliases from an external text file, which is
 * a strange and slow solution; we use array(s) compiled into the source
 * instead. With a "good" libc this is not necessary -- for example on Linux.
 *
 * Please add only exotic aliases to this function. The libc 'iconv' already
 * knows a lot of basic aliases (check first with `iconv -l`).
 */
/**
 * Guess a character encoding from the language (and perhaps country) code
 * contained in a locale name.
 *
 * Matching is a plain substring search (strstr) done in table order, so the
 * first code found anywhere in the string decides the result.
 *
 * @param l locale name such as "ru_RU"; NULL is accepted
 * @return a pointer to a constant encoding name, or NULL when l is NULL or
 *         contains none of the known codes
 */
static const char* vlc_encoding_from_language( const char *l )
{
    /*
     * Order matters: longer codes ("zh_TW") must be tried before the shorter
     * ones they contain ("zh"). "th" and "uk" each appear once only; the
     * later duplicate tests (ISO-8859-11 and ISO-8859-5 respectively) could
     * never be reached because the earlier entries already matched.
     */
    static const struct
    {
        const char *psz_code;
        const char *psz_encoding;
    } p_map[] =
    {
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" },   /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" },
    };
    size_t i;

    /* strstr() on a null pointer is undefined; treat it as "unknown". */
    if( l == NULL )
        return NULL;

    /* check for language (and perhaps country) codes */
    for( i = 0; i < sizeof( p_map ) / sizeof( p_map[0] ); i++ )
    {
        if( strstr( l, p_map[i].psz_code ) != NULL )
            return p_map[i].psz_encoding;
    }

    /* We don't know. This ain't working go to default. */
    return NULL;
}
/*
 * vlc_charset_aliases: look psz_name up in a table of charset aliases.
 *
 * The lookup walks `aliases` until it reaches an entry whose psz_alias is
 * NULL, comparing each psz_alias with psz_name case-insensitively
 * (strcasecmp). According to the trailing comment, the original name is
 * handed back when no alias is known, so that iconv() can try it as-is.
 *
 * NOTE(review): eight `aliases` tables are declared back to back under the
 * same name. Presumably each one is selected by a platform preprocessor
 * conditional that is not visible in this view -- confirm against the
 * complete file. The declaration of `a`, the braces, the table terminators
 * (other than the last table's) and the return statements are likewise not
 * visible here.
 */
95 static const char* vlc_charset_aliases( const char *psz_name
)
/* Table 1: "CP<number>" names mapped to JOHAB, ASCII, KOI8 and ISO-8859 names. */
100 VLCCharsetAlias aliases
[] =
103 { "CP1361", "JOHAB" },
104 { "CP20127", "ASCII" },
105 { "CP20866", "KOI8-R" },
106 { "CP21866", "KOI8-RU" },
107 { "CP28591", "ISO-8859-1" },
108 { "CP28592", "ISO-8859-2" },
109 { "CP28593", "ISO-8859-3" },
110 { "CP28594", "ISO-8859-4" },
111 { "CP28595", "ISO-8859-5" },
112 { "CP28596", "ISO-8859-6" },
113 { "CP28597", "ISO-8859-7" },
114 { "CP28598", "ISO-8859-8" },
115 { "CP28599", "ISO-8859-9" },
116 { "CP28605", "ISO-8859-15" },
/* Table 2: "IBM-..." names mapped to CP, ISO-8859, GB2312 and EUC names. */
120 VLCCharsetAlias aliases
[] =
122 { "IBM-850", "CP850" },
123 { "IBM-856", "CP856" },
124 { "IBM-921", "ISO-8859-13" },
125 { "IBM-922", "CP922" },
126 { "IBM-932", "CP932" },
127 { "IBM-943", "CP943" },
128 { "IBM-1046", "CP1046" },
129 { "IBM-1124", "CP1124" },
130 { "IBM-1129", "CP1129" },
131 { "IBM-1252", "CP1252" },
132 { "IBM-EUCCN", "GB2312" },
133 { "IBM-EUCJP", "EUC-JP" },
134 { "IBM-EUCKR", "EUC-KR" },
135 { "IBM-EUCTW", "EUC-TW" },
/* Table 3: short "...8" names mapped to their "HP-" prefixed form, plus HP15CN. */
139 VLCCharsetAlias aliases
[] =
141 { "ROMAN8", "HP-ROMAN8" },
142 { "ARABIC8", "HP-ARABIC8" },
143 { "GREEK8", "HP-GREEK8" },
144 { "HEBREW8", "HP-HEBREW8" },
145 { "TURKISH8", "HP-TURKISH8" },
146 { "KANA8", "HP-KANA8" },
147 { "HP15CN", "GB2312" },
/* Table 4: single entry, EUCCN -> GB2312. */
151 VLCCharsetAlias aliases
[] =
153 { "EUCCN", "GB2312" },
/* Table 5: KSC5601, SDECKANJI and TACTIS. */
157 VLCCharsetAlias aliases
[] =
159 { "KSC5601", "CP949" },
160 { "SDECKANJI", "EUC-JP" },
161 { "TACTIS", "TIS-620" },
/* Table 6: CNS11643, 5601, JOHAP92, PCK and 2533. */
165 VLCCharsetAlias aliases
[] =
168 { "CNS11643", "EUC-TW" },
169 { "5601", "EUC-KR" },
170 { "JOHAP92", "JOHAB" },
171 { "PCK", "SHIFT_JIS" },
172 { "2533", "TIS-620" },
/* Table 7: single entry, EUCCN -> GB2312 (same mapping as table 4). */
176 VLCCharsetAlias aliases
[] =
179 { "EUCCN", "GB2312" },
/* Table 8: no aliases at all, only the NULL terminator the loop stops on. */
183 VLCCharsetAlias aliases
[] = {{NULL
, NULL
}};
/*
 * Linear scan; stops at the first entry with a NULL psz_alias.
 * NOTE(review): the statement executed on a match is not visible here;
 * presumably it returns a->psz_name -- confirm. The comment that follows
 * also has no visible terminator.
 */
188 for (a
= aliases
; a
->psz_alias
; a
++)
189 if (strcasecmp (a
->psz_alias
, psz_name
) == 0)
193 /* we return original name beacuse iconv() probably will know
194 * something better about name if we don't know it :-)
/**
 * Returns charset from "language_COUNTRY.charset@modifier" string.
 *
 * - No '.' in the string: fall back to vlc_encoding_from_language().
 * - '.' but no '@': the text after the '.' is returned directly (it points
 *   into psz_locale, and may be an empty string).
 * - '.' and '@': the text between them is copied into a static buffer and
 *   that buffer is returned; if it does not fit, fall back to
 *   vlc_encoding_from_language().
 *
 * The static buffer is overwritten by the next call, so the result must be
 * consumed (or copied) before calling again.
 *
 * @param psz_locale locale name; NULL is accepted
 * @return the charset name, or NULL when psz_locale is NULL or nothing
 *         could be derived from it
 */
static char* vlc_encoding_from_locale( char *psz_locale )
{
    static char buf[2 + 10 + 1];
    char *psz_dot;

    /* Callers obtain the locale from getenv(), which may yield NULL;
     * strchr() on a null pointer is undefined. */
    if( psz_locale == NULL )
        return NULL;

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        const char *psz_modifier;
        size_t i_len;

        /* Step over the '.' so psz_dot addresses the charset itself. */
        psz_dot++;

        /* Look for the possible @... trailer and remove it, if any. */
        psz_modifier = strchr( psz_dot, '@' );
        if( psz_modifier == NULL )
            return psz_dot;

        /* psz_modifier was found at or after psz_dot, so the difference is
         * never negative and converts safely to size_t. */
        i_len = (size_t)( psz_modifier - psz_dot );
        if( i_len < sizeof( buf ) )
        {
            memcpy( buf, psz_dot, i_len );
            buf[i_len] = '\0';
            return buf;
        }
    }

    /* try language mapping */
    return (char *)vlc_encoding_from_language( psz_locale );
}
227 vlc_bool_t
vlc_current_charset( char **psz_charset
)
229 const char *psz_codeset
;
231 #if !(defined WIN32 || defined OS2)
233 # if HAVE_LANGINFO_CODESET
234 /* Most systems support nl_langinfo( CODESET ) nowadays. */
235 psz_codeset
= nl_langinfo( CODESET
);
237 /* On old systems which lack it, use setlocale or getenv. */
238 const char *psz_locale
= NULL
;
240 /* But most old systems don't have a complete set of locales. Some
241 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
242 * use setlocale here; it would return "C" when it doesn't support the
243 * locale name the user has set. Darwin's setlocale is broken.
245 # if HAVE_SETLOCALE && !SYS_DARWIN
246 psz_locale
= setlocale( LC_ALL
, NULL
);
248 if( psz_locale
== NULL
|| psz_locale
[0] == '\0' )
250 psz_locale
= getenv( "LC_ALL" );
251 if( psz_locale
== NULL
|| psz_locale
[0] == '\0' )
253 psz_locale
= getenv( "LC_CTYPE" );
254 if( psz_locale
== NULL
|| psz_locale
[0] == '\0')
255 psz_locale
= getenv( "LANG" );
259 /* On some old systems, one used to set locale = "iso8859_1". On others,
260 * you set it to "language_COUNTRY.charset". Darwin only has LANG :(
262 psz_codeset
= vlc_encoding_from_locale( psz_locale
);
263 # endif /* HAVE_LANGINFO_CODESET */
267 static char buf
[2 + 10 + 1];
269 /* Woe32 has a function returning the locale's codepage as a number. */
270 sprintf( buf
, "CP%u", GetACP() );
275 const char *psz_locale
;
276 static char buf
[2 + 10 + 1];
280 /* Allow user to override the codeset, as set in the operating system,
281 * with standard language environment variables.
283 psz_locale
= getenv( "LC_ALL" );
284 if( psz_locale
== NULL
|| psz_locale
[0] == '\0' )
286 psz
+locale
= getenv( "LC_CTYPE" );
287 if( psz_locale
== NULL
|| locale
[0] == '\0' )
288 locale
= getenv( "LANG" );
290 if( psz_locale
!= NULL
&& psz_locale
[0] != '\0' )
291 psz_codeset
= vlc_encoding_from_locale( psz_locale
);
294 /* OS/2 has a function returning the locale's codepage as a number. */
295 if( DosQueryCp( sizeof( cp
), cp
, &cplen
) )
299 sprintf( buf
, "CP%u", cp
[0] );
304 if( psz_codeset
== NULL
)
305 /* The canonical name cannot be determined. */
308 psz_codeset
= vlc_charset_aliases( psz_codeset
);
310 /* Don't return an empty string. GNU libc and GNU libiconv interpret
311 * the empty string as denoting "the locale's character encoding",
312 * thus GNU libiconv would call this function a second time.
314 if( psz_codeset
[0] == '\0' )
317 * Last possibility is 'CHARSET' enviroment variable
319 if( !( psz_codeset
= getenv( "CHARSET" ) ) )
320 psz_codeset
= "ISO-8859-1";
324 *psz_charset
= (char *)psz_codeset
;
326 if (strcasecmp(psz_codeset
, "UTF8")==0 || strcasecmp(psz_codeset
, "UTF-8")==0)