Merge branch 'doc-types' into 'master'
[glib.git] / glib / gcharset.c
blobbfcd12590ce581a4b1feca529b8751185a14dfba
1 /* gcharset.c - Charset information
3 * Copyright (C) 2011 Red Hat, Inc.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include "config.h"
21 #include "gcharset.h"
22 #include "gcharsetprivate.h"
24 #include "garray.h"
25 #include "genviron.h"
26 #include "ghash.h"
27 #include "gmessages.h"
28 #include "gstrfuncs.h"
29 #include "gthread.h"
30 #ifdef G_OS_WIN32
31 #include "gwin32.h"
32 #endif
34 #include "libcharset/libcharset.h"
36 #include <string.h>
37 #include <stdio.h>
39 G_LOCK_DEFINE_STATIC (aliases);
41 static GHashTable *
42 get_alias_hash (void)
44 static GHashTable *alias_hash = NULL;
45 const char *aliases;
47 G_LOCK (aliases);
49 if (!alias_hash)
51 alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
53 aliases = _g_locale_get_charset_aliases ();
54 while (*aliases != '\0')
56 const char *canonical;
57 const char *alias;
58 const char **alias_array;
59 int count = 0;
61 alias = aliases;
62 aliases += strlen (aliases) + 1;
63 canonical = aliases;
64 aliases += strlen (aliases) + 1;
66 alias_array = g_hash_table_lookup (alias_hash, canonical);
67 if (alias_array)
69 while (alias_array[count])
70 count++;
73 alias_array = g_renew (const char *, alias_array, count + 2);
74 alias_array[count] = alias;
75 alias_array[count + 1] = NULL;
77 g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
81 G_UNLOCK (aliases);
83 return alias_hash;
/* As an abuse of the alias table, the following routine gets
 * the charsets that are aliases for the canonical name.
 *
 * Returns whatever the alias table holds for @canonical_name: a
 * NULL-terminated array of alias names, or NULL when there is no entry.
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
97 static gboolean
98 g_utf8_get_charset_internal (const char *raw_data,
99 const char **a)
101 const char *charset = g_getenv ("CHARSET");
103 if (charset && *charset)
105 *a = charset;
107 if (charset && strstr (charset, "UTF-8"))
108 return TRUE;
109 else
110 return FALSE;
113 /* The libcharset code tries to be thread-safe without
114 * a lock, but has a memory leak and a missing memory
115 * barrier, so we lock for it
117 G_LOCK (aliases);
118 charset = _g_locale_charset_unalias (raw_data);
119 G_UNLOCK (aliases);
121 if (charset && *charset)
123 *a = charset;
125 if (charset && strstr (charset, "UTF-8"))
126 return TRUE;
127 else
128 return FALSE;
131 /* Assume this for compatibility at present. */
132 *a = "US-ASCII";
134 return FALSE;
137 typedef struct _GCharsetCache GCharsetCache;
139 struct _GCharsetCache {
140 gboolean is_utf8;
141 gchar *raw;
142 gchar *charset;
145 static void
146 charset_cache_free (gpointer data)
148 GCharsetCache *cache = data;
149 g_free (cache->raw);
150 g_free (cache->charset);
151 g_free (cache);
155 * g_get_charset:
156 * @charset: (out) (optional) (transfer none): return location for character set
157 * name, or %NULL.
159 * Obtains the character set for the [current locale][setlocale]; you
160 * might use this character set as an argument to g_convert(), to convert
161 * from the current locale's encoding to some other encoding. (Frequently
162 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
164 * On Windows the character set returned by this function is the
165 * so-called system default ANSI code-page. That is the character set
166 * used by the "narrow" versions of C library and Win32 functions that
167 * handle file names. It might be different from the character set
168 * used by the C library's current locale.
170 * On Linux, the character set is found by consulting nl_langinfo() if
171 * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
172 * and `CHARSET` are queried in order.
174 * The return value is %TRUE if the locale's encoding is UTF-8, in that
175 * case you can perhaps avoid calling g_convert().
177 * The string returned in @charset is not allocated, and should not be
178 * freed.
180 * Returns: %TRUE if the returned charset is UTF-8
182 gboolean
183 g_get_charset (const char **charset)
185 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
186 GCharsetCache *cache = g_private_get (&cache_private);
187 const gchar *raw;
189 if (!cache)
191 cache = g_new0 (GCharsetCache, 1);
192 g_private_set (&cache_private, cache);
195 G_LOCK (aliases);
196 raw = _g_locale_charset_raw ();
197 G_UNLOCK (aliases);
199 if (!(cache->raw && strcmp (cache->raw, raw) == 0))
201 const gchar *new_charset;
203 g_free (cache->raw);
204 g_free (cache->charset);
205 cache->raw = g_strdup (raw);
206 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
207 cache->charset = g_strdup (new_charset);
210 if (charset)
211 *charset = cache->charset;
213 return cache->is_utf8;
217 * g_get_codeset:
219 * Gets the character set for the current locale.
221 * Returns: a newly allocated string containing the name
222 * of the character set. This string must be freed with g_free().
224 gchar *
225 g_get_codeset (void)
227 const gchar *charset;
229 g_get_charset (&charset);
231 return g_strdup (charset);
234 #ifndef G_OS_WIN32
236 /* read an alias file for the locales */
237 static void
238 read_aliases (const gchar *file,
239 GHashTable *alias_table)
241 FILE *fp;
242 char buf[256];
244 fp = fopen (file,"r");
245 if (!fp)
246 return;
247 while (fgets (buf, 256, fp))
249 char *p, *q;
251 g_strstrip (buf);
253 /* Line is a comment */
254 if ((buf[0] == '#') || (buf[0] == '\0'))
255 continue;
257 /* Reads first column */
258 for (p = buf, q = NULL; *p; p++) {
259 if ((*p == '\t') || (*p == ' ') || (*p == ':')) {
260 *p = '\0';
261 q = p+1;
262 while ((*q == '\t') || (*q == ' ')) {
263 q++;
265 break;
268 /* The line only had one column */
269 if (!q || *q == '\0')
270 continue;
272 /* Read second column */
273 for (p = q; *p; p++) {
274 if ((*p == '\t') || (*p == ' ')) {
275 *p = '\0';
276 break;
280 /* Add to alias table if necessary */
281 if (!g_hash_table_lookup (alias_table, buf)) {
282 g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
285 fclose (fp);
288 #endif
290 static char *
291 unalias_lang (char *lang)
293 #ifndef G_OS_WIN32
294 static GHashTable *alias_table = NULL;
295 char *p;
296 int i;
298 if (g_once_init_enter (&alias_table))
300 GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
301 read_aliases ("/usr/share/locale/locale.alias", table);
302 g_once_init_leave (&alias_table, table);
305 i = 0;
306 while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
308 lang = p;
309 if (i++ == 30)
311 static gboolean said_before = FALSE;
312 if (!said_before)
313 g_warning ("Too many alias levels for a locale, "
314 "may indicate a loop");
315 said_before = TRUE;
316 return lang;
319 #endif
320 return lang;
/* Mask for components of locale spec. The ordering here is from
 * least significant to most significant
 */
enum
{
  COMPONENT_CODESET   = 0x1,  /* bit 0: ".codeset" is present */
  COMPONENT_TERRITORY = 0x2,  /* bit 1: "_territory" is present */
  COMPONENT_MODIFIER  = 0x4   /* bit 2: "@modifier" is present */
};
333 /* Break an X/Open style locale specification into components
335 static guint
336 explode_locale (const gchar *locale,
337 gchar **language,
338 gchar **territory,
339 gchar **codeset,
340 gchar **modifier)
342 const gchar *uscore_pos;
343 const gchar *at_pos;
344 const gchar *dot_pos;
346 guint mask = 0;
348 uscore_pos = strchr (locale, '_');
349 dot_pos = strchr (uscore_pos ? uscore_pos : locale, '.');
350 at_pos = strchr (dot_pos ? dot_pos : (uscore_pos ? uscore_pos : locale), '@');
352 if (at_pos)
354 mask |= COMPONENT_MODIFIER;
355 *modifier = g_strdup (at_pos);
357 else
358 at_pos = locale + strlen (locale);
360 if (dot_pos)
362 mask |= COMPONENT_CODESET;
363 *codeset = g_strndup (dot_pos, at_pos - dot_pos);
365 else
366 dot_pos = at_pos;
368 if (uscore_pos)
370 mask |= COMPONENT_TERRITORY;
371 *territory = g_strndup (uscore_pos, dot_pos - uscore_pos);
373 else
374 uscore_pos = dot_pos;
376 *language = g_strndup (locale, uscore_pos - locale);
378 return mask;
382 * Compute all interesting variants for a given locale name -
383 * by stripping off different components of the value.
385 * For simplicity, we assume that the locale is in
386 * X/Open format: language[_territory][.codeset][@modifier]
388 * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
389 * as well. We could just copy the code from glibc wholesale
390 * but it is big, ugly, and complicated, so I'm reluctant
391 * to do so when this should handle 99% of the time...
393 static void
394 append_locale_variants (GPtrArray *array,
395 const gchar *locale)
397 gchar *language = NULL;
398 gchar *territory = NULL;
399 gchar *codeset = NULL;
400 gchar *modifier = NULL;
402 guint mask;
403 guint i, j;
405 g_return_if_fail (locale != NULL);
407 mask = explode_locale (locale, &language, &territory, &codeset, &modifier);
409 /* Iterate through all possible combinations, from least attractive
410 * to most attractive.
412 for (j = 0; j <= mask; ++j)
414 i = mask - j;
416 if ((i & ~mask) == 0)
418 gchar *val = g_strconcat (language,
419 (i & COMPONENT_TERRITORY) ? territory : "",
420 (i & COMPONENT_CODESET) ? codeset : "",
421 (i & COMPONENT_MODIFIER) ? modifier : "",
422 NULL);
423 g_ptr_array_add (array, val);
427 g_free (language);
428 if (mask & COMPONENT_CODESET)
429 g_free (codeset);
430 if (mask & COMPONENT_TERRITORY)
431 g_free (territory);
432 if (mask & COMPONENT_MODIFIER)
433 g_free (modifier);
437 * g_get_locale_variants:
438 * @locale: a locale identifier
440 * Returns a list of derived variants of @locale, which can be used to
441 * e.g. construct locale-dependent filenames or search paths. The returned
442 * list is sorted from most desirable to least desirable.
443 * This function handles territory, charset and extra locale modifiers.
445 * For example, if @locale is "fr_BE", then the returned list
446 * is "fr_BE", "fr".
448 * If you need the list of variants for the current locale,
449 * use g_get_language_names().
451 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
452 * allocated array of newly allocated strings with the locale variants. Free with
453 * g_strfreev().
455 * Since: 2.28
457 gchar **
458 g_get_locale_variants (const gchar *locale)
460 GPtrArray *array;
462 g_return_val_if_fail (locale != NULL, NULL);
464 array = g_ptr_array_sized_new (8);
465 append_locale_variants (array, locale);
466 g_ptr_array_add (array, NULL);
468 return (gchar **) g_ptr_array_free (array, FALSE);
471 /* The following is (partly) taken from the gettext package.
472 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
474 static const gchar *
475 guess_category_value (const gchar *category_name)
477 const gchar *retval;
479 /* The highest priority value is the 'LANGUAGE' environment
480 variable. This is a GNU extension. */
481 retval = g_getenv ("LANGUAGE");
482 if ((retval != NULL) && (retval[0] != '\0'))
483 return retval;
485 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
486 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
487 systems this can be done by the 'setlocale' function itself. */
489 /* Setting of LC_ALL overwrites all other. */
490 retval = g_getenv ("LC_ALL");
491 if ((retval != NULL) && (retval[0] != '\0'))
492 return retval;
494 /* Next comes the name of the desired category. */
495 retval = g_getenv (category_name);
496 if ((retval != NULL) && (retval[0] != '\0'))
497 return retval;
499 /* Last possibility is the LANG environment variable. */
500 retval = g_getenv ("LANG");
501 if ((retval != NULL) && (retval[0] != '\0'))
502 return retval;
504 #ifdef G_PLATFORM_WIN32
505 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
506 * LANG, which we already did above. Oh well. The main point of
507 * calling g_win32_getlocale() is to get the thread's locale as used
508 * by Windows and the Microsoft C runtime (in the "English_United
509 * States" format) translated into the Unixish format.
512 char *locale = g_win32_getlocale ();
513 retval = g_intern_string (locale);
514 g_free (locale);
515 return retval;
517 #endif
519 return NULL;
522 typedef struct _GLanguageNamesCache GLanguageNamesCache;
524 struct _GLanguageNamesCache {
525 gchar *languages;
526 gchar **language_names;
529 static void
530 language_names_cache_free (gpointer data)
532 GLanguageNamesCache *cache = data;
533 g_free (cache->languages);
534 g_strfreev (cache->language_names);
535 g_free (cache);
539 * g_get_language_names:
541 * Computes a list of applicable locale names, which can be used to
542 * e.g. construct locale-dependent filenames or search paths. The returned
543 * list is sorted from most desirable to least desirable and always contains
544 * the default locale "C".
546 * For example, if LANGUAGE=de:en_US, then the returned list is
547 * "de", "en_US", "en", "C".
549 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
550 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
551 * user.
553 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
554 * that must not be modified or freed.
556 * Since: 2.6
558 const gchar * const *
559 g_get_language_names (void)
561 return g_get_language_names_with_category ("LC_MESSAGES");
565 * g_get_language_names_with_category:
566 * @category_name: a locale category name
568 * Computes a list of applicable locale names with a locale category name,
569 * which can be used to construct the fallback locale-dependent filenames
570 * or search paths. The returned list is sorted from most desirable to
571 * least desirable and always contains the default locale "C".
573 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
574 * @category_name, and `LANG` to find the list of locales specified by the
575 * user.
577 * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
579 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
580 * that must not be modified or freed.
582 * Since: 2.58
584 const gchar * const *
585 g_get_language_names_with_category (const gchar *category_name)
587 static GPrivate cache_private = G_PRIVATE_INIT ((void (*)(gpointer)) g_hash_table_remove_all);
588 GHashTable *cache = g_private_get (&cache_private);
589 const gchar *languages;
590 GLanguageNamesCache *name_cache;
592 g_return_val_if_fail (category_name != NULL, NULL);
594 if (!cache)
596 cache = g_hash_table_new_full (g_str_hash, g_str_equal,
597 g_free, language_names_cache_free);
598 g_private_set (&cache_private, cache);
601 languages = guess_category_value (category_name);
602 if (!languages)
603 languages = "C";
605 name_cache = (GLanguageNamesCache *) g_hash_table_lookup (cache, category_name);
606 if (!(name_cache && name_cache->languages &&
607 strcmp (name_cache->languages, languages) == 0))
609 GPtrArray *array;
610 gchar **alist, **a;
612 g_hash_table_remove (cache, category_name);
614 array = g_ptr_array_sized_new (8);
616 alist = g_strsplit (languages, ":", 0);
617 for (a = alist; *a; a++)
618 append_locale_variants (array, unalias_lang (*a));
619 g_strfreev (alist);
620 g_ptr_array_add (array, g_strdup ("C"));
621 g_ptr_array_add (array, NULL);
623 name_cache = g_new0 (GLanguageNamesCache, 1);
624 name_cache->languages = g_strdup (languages);
625 name_cache->language_names = (gchar **) g_ptr_array_free (array, FALSE);
626 g_hash_table_insert (cache, g_strdup (category_name), name_cache);
629 return (const gchar * const *) name_cache->language_names;