* Now the MacOS mozilla plugin is an independent bundle ( searches no more
[vlc.git] / src / misc / charset.c
blobcb4611878aacb8f1996c542cfc4c62b874d1983c
1 /*****************************************************************************
2 * charset.c: Determine a canonical name for the current locale's character encoding.
3 *****************************************************************************
4 * Copyright (C) 2003 VideoLAN
5 * $Id: charset.c,v 1.2 2003/08/23 22:19:07 fenrir Exp $
7 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
9 * vlc_current_charset() is an adaptation of mp_locale_charset():
11 * Copyright (C) 2001-2003 The Mape Project
12 * Written by Karel Zak <zakkr@zf.jcu.cz>.
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
27 *****************************************************************************/
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <vlc/vlc.h>
33 #if !defined WIN32
34 # if HAVE_LANGINFO_CODESET
35 # include <langinfo.h>
36 # else
37 # if HAVE_SETLOCALE
38 # include <locale.h>
39 # endif
40 # endif
41 #elif defined WIN32
42 # include <windows.h>
43 #endif
45 #include "charset.h"
/* One row of a charset alias table: pairs a charset name to look for with
 * the name that is returned in its place. */
typedef struct VLCCharsetAlias
{
    char *psz_alias;    /* name compared (case-insensitively) with the input */
    char *psz_name;     /* name handed back when psz_alias matches */
} VLCCharsetAlias;
53 * libcharset loads everything from an external text file, which is a strange
54 * and slow solution; we rather use array(s) compiled into the source. With a
55 * "good" libc this is not needed -- for example on Linux.
57 * Please put only exotic aliases into this function. The libc 'iconv' knows
58 * a lot of basic aliases (check it first with iconv -l).
/* Returns non-zero when psz_code occurs somewhere within the first i_len
 * bytes of psz_locale. */
static int vlc_locale_part_contains( const char *psz_locale, size_t i_len,
                                     const char *psz_code )
{
    const size_t i_code = strlen( psz_code );
    size_t i;

    for( i = 0; i + i_code <= i_len; i++ )
    {
        if( memcmp( psz_locale + i, psz_code, i_code ) == 0 )
            return 1;
    }
    return 0;
}

/* Guesses a character encoding from the language (and perhaps country) code
 * of a locale name such as "ru_RU" or "fr_FR@euro".
 *
 * Only the part of the name before the first '.' or '@' is examined, so
 * that a charset or modifier suffix ("@euro", "@latin", ...) cannot be
 * mistaken for a language code.
 *
 * l: locale name; NULL is treated as "unknown".
 * Returns a pointer to a constant string; "ISO-8859-1" when nothing matches.
 */
static const char* vlc_encoding_from_language( const char *l )
{
    /* Checked in order, first match wins: the country-specific Chinese
     * entries must therefore stay ahead of the generic "zh". */
    static const struct
    {
        const char *psz_code;
        const char *psz_charset;
    } p_map[] =
    {
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" },   /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" }
    };
    size_t i_len;
    size_t i;

    if( l != NULL )
    {
        /* Length of the "language_COUNTRY" part only. */
        i_len = strcspn( l, ".@" );

        for( i = 0; i < sizeof( p_map ) / sizeof( p_map[0] ); i++ )
        {
            if( vlc_locale_part_contains( l, i_len, p_map[i].psz_code ) )
                return p_map[i].psz_charset;
        }
    }

    /* We don't know. This ain't working go to default. */
    return "ISO-8859-1";
}
95 static const char* vlc_charset_aliases( const char *psz_name )
97 VLCCharsetAlias *a;
99 #if defined WIN32
100 VLCCharsetAlias aliases[] =
102 { "CP936", "GBK" },
103 { "CP1361", "JOHAB" },
104 { "CP20127", "ASCII" },
105 { "CP20866", "KOI8-R" },
106 { "CP21866", "KOI8-RU" },
107 { "CP28591", "ISO-8859-1" },
108 { "CP28592", "ISO-8859-2" },
109 { "CP28593", "ISO-8859-3" },
110 { "CP28594", "ISO-8859-4" },
111 { "CP28595", "ISO-8859-5" },
112 { "CP28596", "ISO-8859-6" },
113 { "CP28597", "ISO-8859-7" },
114 { "CP28598", "ISO-8859-8" },
115 { "CP28599", "ISO-8859-9" },
116 { "CP28605", "ISO-8859-15" },
117 { NULL, NULL }
119 #elif SYS_AIX
120 VLCCharsetAlias aliases[] =
122 { "IBM-850", "CP850" },
123 { "IBM-856", "CP856" },
124 { "IBM-921", "ISO-8859-13" },
125 { "IBM-922", "CP922" },
126 { "IBM-932", "CP932" },
127 { "IBM-943", "CP943" },
128 { "IBM-1046", "CP1046" },
129 { "IBM-1124", "CP1124" },
130 { "IBM-1129", "CP1129" },
131 { "IBM-1252", "CP1252" },
132 { "IBM-EUCCN", "GB2312" },
133 { "IBM-EUCJP", "EUC-JP" },
134 { "IBM-EUCKR", "EUC-KR" },
135 { "IBM-EUCTW", "EUC-TW" },
136 { NULL, NULL }
138 #elif SYS_HPUX
139 VLCCharsetAlias aliases[] =
141 { "ROMAN8", "HP-ROMAN8" },
142 { "ARABIC8", "HP-ARABIC8" },
143 { "GREEK8", "HP-GREEK8" },
144 { "HEBREW8", "HP-HEBREW8" },
145 { "TURKISH8", "HP-TURKISH8" },
146 { "KANA8", "HP-KANA8" },
147 { "HP15CN", "GB2312" },
148 { NULL, NULL }
150 #elif SYS_IRIX
151 VLCCharsetAlias aliases[] =
153 { "EUCCN", "GB2312" },
154 { NULL, NULL }
156 #elif SYS_OSF
157 VLCCharsetAlias aliases[] =
159 { "KSC5601", "CP949" },
160 { "SDECKANJI", "EUC-JP" },
161 { "TACTIS", "TIS-620" },
162 { NULL, NULL }
164 #elif SYS_SOLARIS
165 VLCCharsetAlias aliases[] =
167 { "646", "ASCII" },
168 { "CNS11643", "EUC-TW" },
169 { "5601", "EUC-KR" },
170 { "JOHAP92", "JOHAB" },
171 { "PCK", "SHIFT_JIS" },
172 { "2533", "TIS-620" },
173 { NULL, NULL }
175 #elif SYS_BSD
176 VLCCharsetAlias aliases[] =
178 { "646", " ASCII" },
179 { "EUCCN", "GB2312" },
180 { NULL, NULL }
182 #else
183 VLCCharsetAlias aliases[] = {{NULL, NULL}};
184 #endif
186 if( aliases )
188 for (a = aliases; a->psz_alias; a++)
189 if (strcasecmp (a->psz_alias, psz_name) == 0)
190 return a->psz_name;
193 /* we return original name beacuse iconv() probably will know
194 * something better about name if we don't know it :-)
196 return psz_name;
/* Returns charset from "language_COUNTRY.charset@modifier" string.
 *
 * psz_locale: locale name, or NULL when none could be determined.
 *
 * Returns:
 *  - NULL when psz_locale is NULL;
 *  - a pointer into psz_locale when it has a ".charset" part and no
 *    "@modifier";
 *  - a pointer to an internal static buffer holding the charset when a
 *    modifier had to be cut off (overwritten by the next such call, so the
 *    function is not reentrant);
 *  - otherwise the result of vlc_encoding_from_language().
 * The returned string must be treated as read-only by the caller.
 */
static char* vlc_encoding_from_locale( const char *psz_locale )
{
    char *psz_dot;

    if( psz_locale == NULL )
        return NULL;

    psz_dot = strchr( psz_locale, '.' );

    if( psz_dot != NULL )
    {
        const char *psz_modifier;
        static char buf[2 + 10 + 1];

        psz_dot++;

        /* Look for the possible @... trailer and remove it, if any.  */
        psz_modifier = strchr( psz_dot, '@' );

        if( psz_modifier == NULL )
            return psz_dot;

        /* psz_modifier was found at or after psz_dot, so the difference is
         * never negative. A charset too long for buf is not truncated; the
         * language mapping below is used instead. */
        if( (size_t)( psz_modifier - psz_dot ) < sizeof( buf ) )
        {
            memcpy( buf, psz_dot, (size_t)( psz_modifier - psz_dot ) );
            buf[ psz_modifier - psz_dot ] = '\0';
            return buf;
        }
    }

    /* try language mapping */
    return (char *)vlc_encoding_from_language( psz_locale );
}
227 vlc_bool_t vlc_current_charset( char **psz_charset )
229 const char *psz_codeset;
231 #if !(defined WIN32 || defined OS2)
233 # if HAVE_LANGINFO_CODESET
234 /* Most systems support nl_langinfo( CODESET ) nowadays. */
235 psz_codeset = nl_langinfo( CODESET );
236 # else
237 /* On old systems which lack it, use setlocale or getenv. */
238 const char *psz_locale = NULL;
240 /* But most old systems don't have a complete set of locales. Some
241 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
242 * use setlocale here; it would return "C" when it doesn't support the
243 * locale name the user has set. Darwin's setlocale is broken.
245 # if HAVE_SETLOCALE && !SYS_DARWIN
246 psz_locale = setlocale( LC_ALL, NULL );
247 # endif
248 if( psz_locale == NULL || psz_locale[0] == '\0' )
250 psz_locale = getenv( "LC_ALL" );
251 if( psz_locale == NULL || psz_locale[0] == '\0' )
253 psz_locale = getenv( "LC_CTYPE" );
254 if( psz_locale == NULL || psz_locale[0] == '\0')
255 psz_locale = getenv( "LANG" );
259 /* On some old systems, one used to set locale = "iso8859_1". On others,
260 * you set it to "language_COUNTRY.charset". Darwin only has LANG :(
262 psz_codeset = vlc_encoding_from_locale( psz_locale );
263 # endif /* HAVE_LANGINFO_CODESET */
265 #elif defined WIN32
267 static char buf[2 + 10 + 1];
269 /* Woe32 has a function returning the locale's codepage as a number. */
270 sprintf( buf, "CP%u", GetACP() );
271 psz_codeset = buf;
273 #elif defined OS2
275 const char *psz_locale;
276 static char buf[2 + 10 + 1];
277 ULONG cp[3];
278 ULONG cplen;
280 /* Allow user to override the codeset, as set in the operating system,
281 * with standard language environment variables.
283 psz_locale = getenv( "LC_ALL" );
284 if( psz_locale == NULL || psz_locale[0] == '\0' )
286 psz+locale = getenv( "LC_CTYPE" );
287 if( psz_locale == NULL || locale[0] == '\0' )
288 locale = getenv( "LANG" );
290 if( psz_locale != NULL && psz_locale[0] != '\0' )
291 psz_codeset = vlc_encoding_from_locale( psz_locale );
292 else
294 /* OS/2 has a function returning the locale's codepage as a number. */
295 if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
296 psz_codeset = "";
297 else
299 sprintf( buf, "CP%u", cp[0] );
300 psz_codeset = buf;
303 #endif
304 if( psz_codeset == NULL )
305 /* The canonical name cannot be determined. */
306 psz_codeset = "";
307 else
308 psz_codeset = vlc_charset_aliases( psz_codeset );
310 /* Don't return an empty string. GNU libc and GNU libiconv interpret
311 * the empty string as denoting "the locale's character encoding",
312 * thus GNU libiconv would call this function a second time.
314 if( psz_codeset[0] == '\0' )
317 * Last possibility is 'CHARSET' enviroment variable
319 if( !( psz_codeset = getenv( "CHARSET" ) ) )
320 psz_codeset = "ISO-8859-1";
323 if( psz_charset )
324 *psz_charset = (char *)psz_codeset;
326 if (strcasecmp(psz_codeset, "UTF8")==0 || strcasecmp(psz_codeset, "UTF-8")==0)
327 return VLC_TRUE;
329 return VLC_FALSE;