1 /* gcharset.c - Charset information
3 * Copyright (C) 2011 Red Hat, Inc.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
22 #include "gcharsetprivate.h"
27 #include "gmessages.h"
28 #include "gstrfuncs.h"
34 #include "libcharset/libcharset.h"
/* Static lock named "aliases". */
39 G_LOCK_DEFINE_STATIC (aliases
);
/* Table mapping a canonical charset name to a NULL-terminated array of
 * its alias strings. It starts out NULL and is created below.
 * NOTE(review): the enclosing function header is not visible in this
 * view; presumably this is get_alias_hash(), which is called further
 * down in this file -- confirm.
 */
44 static GHashTable
*alias_hash
= NULL
;
/* Keys are hashed and compared as strings. */
51 alias_hash
= g_hash_table_new (g_str_hash
, g_str_equal
);
/* The source data is a run of NUL-terminated strings; the loop below
 * stops when it reaches an empty string.
 */
53 aliases
= _g_locale_get_charset_aliases ();
54 while (*aliases
!= '\0')
56 const char *canonical
;
58 const char **alias_array
;
/* Two consecutive strings are stepped over per iteration.
 * NOTE(review): the assignments to alias and canonical are not visible
 * here; presumably each is taken from one of these two strings -- confirm.
 */
62 aliases
+= strlen (aliases
) + 1;
64 aliases
+= strlen (aliases
) + 1;
/* Fetch the array already stored for this canonical name, if any. */
66 alias_array
= g_hash_table_lookup (alias_hash
, canonical
);
/* Count the existing entries up to the NULL terminator. */
69 while (alias_array
[count
])
/* Resize to hold one more alias plus the terminator. */
73 alias_array
= g_renew (const char *, alias_array
, count
+ 2);
74 alias_array
[count
] = alias
;
75 alias_array
[count
+ 1] = NULL
;
/* Store the (possibly relocated) array back under the canonical name. */
77 g_hash_table_insert (alias_hash
, (char *)canonical
, alias_array
);
86 /* As an abuse of the alias table, the following routine gets
87 * the charsets that are aliases for the canonical name.
/* Looks up @canonical_name in the alias table obtained from
 * get_alias_hash() and returns the g_hash_table_lookup() result
 * unchanged. The values stored in that table are NULL-terminated arrays
 * of alias strings.
 */
90 _g_charset_get_aliases (const char *canonical_name
)
/* Obtain the shared alias table. */
92 GHashTable
*alias_hash
= get_alias_hash ();
94 return g_hash_table_lookup (alias_hash
, canonical_name
);
/* Works out which charset name to report and whether it is UTF-8.
 *
 * The CHARSET environment variable is consulted first; after that
 * @raw_data is passed to _g_locale_charset_unalias(). In both cases the
 * candidate is only considered when it is non-NULL and non-empty, and it
 * is then tested for the substring "UTF-8".
 *
 * NOTE(review): the remaining parameter(s), the statements inside the
 * conditionals and the final fallback are not visible in this view. The
 * caller in this file passes the address of a string pointer as second
 * argument and stores the return value in an is_utf8 field -- confirm
 * the details against the full source.
 */
98 g_utf8_get_charset_internal (const char *raw_data
,
101 const char *charset
= g_getenv ("CHARSET");
103 if (charset
&& *charset
)
107 if (charset
&& strstr (charset
, "UTF-8"))
113 /* The libcharset code tries to be thread-safe without
114 * a lock, but has a memory leak and a missing memory
115 * barrier, so we lock for it
118 charset
= _g_locale_charset_unalias (raw_data
);
121 if (charset
&& *charset
)
125 if (charset
&& strstr (charset
, "UTF-8"))
131 /* Assume this for compatibility at present. */
/* Cache record kept in the GPrivate slot used by g_get_charset().
 * NOTE(review): the member declarations are not visible in this view;
 * other code in this file accesses members named raw, charset and
 * is_utf8.
 */
137 typedef struct _GCharsetCache GCharsetCache
;
139 struct _GCharsetCache
{
/* Destructor for a GCharsetCache; it is the function handed to
 * G_PRIVATE_INIT() for the cache slot in g_get_charset().
 * @data is the GCharsetCache pointer stored in that slot.
 * NOTE(review): only the release of the charset member is visible in
 * this view -- confirm what else is freed in the full source.
 */
146 charset_cache_free (gpointer data
)
148 GCharsetCache
*cache
= data
;
150 g_free (cache
->charset
);
156 * @charset: (out) (optional) (transfer none): return location for character set
159 * Obtains the character set for the [current locale][setlocale]; you
160 * might use this character set as an argument to g_convert(), to convert
161 * from the current locale's encoding to some other encoding. (Frequently
162 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
164 * On Windows the character set returned by this function is the
165 * so-called system default ANSI code-page. That is the character set
166 * used by the "narrow" versions of C library and Win32 functions that
167 * handle file names. It might be different from the character set
168 * used by the C library's current locale.
170 * On Linux, the character set is found by consulting nl_langinfo() if
171 * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
172 * and `CHARSET` are queried in order.
174 * The return value is %TRUE if the locale's encoding is UTF-8, in that
175 * case you can perhaps avoid calling g_convert().
177 * The string returned in @charset is not allocated, and should not be freed.
180 * Returns: %TRUE if the returned charset is UTF-8
/* Implementation notes: the result is cached in a GPrivate slot and only
 * recomputed when the raw locale charset string reported by
 * _g_locale_charset_raw() differs from the cached one.
 * NOTE(review): the conditionals guarding the cache allocation and the
 * write through @charset are not visible in this view.
 */
183 g_get_charset (const char **charset
)
/* Slot whose contents are released with charset_cache_free(). */
185 static GPrivate cache_private
= G_PRIVATE_INIT (charset_cache_free
);
186 GCharsetCache
*cache
= g_private_get (&cache_private
);
/* Allocate a zero-filled cache record and store it in the slot. */
191 cache
= g_new0 (GCharsetCache
, 1);
192 g_private_set (&cache_private
, cache
);
196 raw
= _g_locale_charset_raw ();
/* Refresh when nothing is cached yet or the raw string has changed. */
199 if (!(cache
->raw
&& strcmp (cache
->raw
, raw
) == 0))
201 const gchar
*new_charset
;
204 g_free (cache
->charset
);
/* The cache keeps its own copies of both strings. */
205 cache
->raw
= g_strdup (raw
);
206 cache
->is_utf8
= g_utf8_get_charset_internal (raw
, &new_charset
);
207 cache
->charset
= g_strdup (new_charset
);
/* The caller receives a pointer into the cache, not a copy. */
211 *charset
= cache
->charset
;
213 return cache
->is_utf8
;
219 * Gets the character set for the current locale.
221 * Returns: a newly allocated string containing the name
222 * of the character set. This string must be freed with g_free().
/* NOTE(review): the signature line of this function is not visible in
 * this view; presumably it is the public getter described by the
 * preceding documentation comment -- confirm.
 *
 * Obtains the charset name through g_get_charset() and returns a copy
 * of it made with g_strdup().
 */
227 const gchar
*charset
;
229 g_get_charset (&charset
);
231 return g_strdup (charset
);
/* Parses the alias file @file line by line and records entries in
 * @alias_table: the first column of a line is the key and the second
 * column the value. Both are duplicated with g_strdup() before
 * insertion, and a key that is already present is left untouched.
 * NOTE(review): the handling of a failed fopen(), the statements that
 * terminate each column and the closing of the stream are not visible
 * in this view.
 */
236 /* read an alias file for the locales */
238 read_aliases (const gchar
*file
,
239 GHashTable
*alias_table
)
244 fp
= fopen (file
,"r");
/* At most 256 bytes (including the terminator) are read per fgets() call. */
247 while (fgets (buf
, 256, fp
))
253 /* Line is a comment */
254 if ((buf
[0] == '#') || (buf
[0] == '\0'))
257 /* Reads first column */
/* The first column ends at a tab, a space or a colon. */
258 for (p
= buf
, q
= NULL
; *p
; p
++) {
259 if ((*p
== '\t') || (*p
== ' ') || (*p
== ':')) {
/* Skip the blanks separating the two columns. */
262 while ((*q
== '\t') || (*q
== ' ')) {
268 /* The line only had one column */
269 if (!q
|| *q
== '\0')
272 /* Read second column */
/* The second column ends at a tab or a space. */
273 for (p
= q
; *p
; p
++) {
274 if ((*p
== '\t') || (*p
== ' ')) {
280 /* Add to alias table if necessary */
281 if (!g_hash_table_lookup (alias_table
, buf
)) {
282 g_hash_table_insert (alias_table
, g_strdup (buf
), g_strdup (q
));
/* Resolves @lang through the locale alias table, following chains of
 * aliases for as long as a lookup yields a string different from the
 * current name.
 * NOTE(review): the loop body, the bound that triggers the warning and
 * the return statement are not visible in this view.
 */
291 unalias_lang (char *lang
)
294 static GHashTable
*alias_table
= NULL
;
/* One-time initialisation: the table is filled from the alias file and
 * then published through g_once_init_leave().
 */
298 if (g_once_init_enter (&alias_table
))
300 GHashTable
*table
= g_hash_table_new (g_str_hash
, g_str_equal
);
301 read_aliases ("/usr/share/locale/locale.alias", table
);
302 g_once_init_leave (&alias_table
, table
);
/* Keep resolving while the table maps the name to a different string. */
306 while ((p
= g_hash_table_lookup (alias_table
, lang
)) && (strcmp (p
, lang
) != 0))
/* Static flag; presumably limits the warning below to a single
 * occurrence -- confirm.
 */
311 static gboolean said_before
= FALSE
;
313 g_warning ("Too many alias levels for a locale, "
314 "may indicate a loop");
323 /* Mask for components of locale spec. The ordering here is from
324 * least significant to most significant
/* Bit flags, one per optional locale component. explode_locale() ORs
 * them into the mask it builds and append_locale_variants() tests them.
 */
328 COMPONENT_CODESET
= 1 << 0,
329 COMPONENT_TERRITORY
= 1 << 1,
330 COMPONENT_MODIFIER
= 1 << 2
333 /* Break an X/Open style locale specification into components
/* Splits @locale into language, territory, codeset and modifier strings
 * and builds a mask of COMPONENT_* bits for the parts it stores.
 *
 * Each stored part is a fresh copy (g_strdup() / g_strndup()). The
 * territory, codeset and modifier copies begin at their delimiter
 * character ('_', '.' and '@' respectively), so they keep it as their
 * first character.
 *
 * NOTE(review): the output parameters, the conditionals around each
 * assignment and the return statement are not visible in this view.
 */
336 explode_locale (const gchar
*locale
,
342 const gchar
*uscore_pos
;
344 const gchar
*dot_pos
;
/* Each delimiter is searched for starting from the previous delimiter
 * that was found, or from the start of the string if none was.
 */
348 uscore_pos
= strchr (locale
, '_');
349 dot_pos
= strchr (uscore_pos
? uscore_pos
: locale
, '.');
350 at_pos
= strchr (dot_pos
? dot_pos
: (uscore_pos
? uscore_pos
: locale
), '@');
/* The modifier is everything from '@' to the end of the string. */
354 mask
|= COMPONENT_MODIFIER
;
355 *modifier
= g_strdup (at_pos
);
/* Point at_pos at the terminating NUL so it can serve as an end marker. */
358 at_pos
= locale
+ strlen (locale
);
/* The codeset runs from '.' up to at_pos. */
362 mask
|= COMPONENT_CODESET
;
363 *codeset
= g_strndup (dot_pos
, at_pos
- dot_pos
);
/* The territory runs from '_' up to dot_pos. */
370 mask
|= COMPONENT_TERRITORY
;
371 *territory
= g_strndup (uscore_pos
, dot_pos
- uscore_pos
);
374 uscore_pos
= dot_pos
;
/* The language is the prefix of @locale that ends at uscore_pos. */
376 *language
= g_strndup (locale
, uscore_pos
- locale
);
382 * Compute all interesting variants for a given locale name -
383 * by stripping off different components of the value.
385 * For simplicity, we assume that the locale is in
386 * X/Open format: language[_territory][.codeset][@modifier]
388 * TODO: Extend this to handle the CEN format (see the GNU libc docs)
389 * as well. We could just copy the code from glibc wholesale
390 * but it is big, ugly, and complicated, so I'm reluctant
391 * to do so when this should handle 99% of the time...
/* Appends to @array one newly allocated string per combination of the
 * components found in the locale: the language, followed by the
 * territory, codeset and modifier parts selected by the bits of i.
 * Combinations that would need a component absent from the mask
 * returned by explode_locale() are skipped.
 *
 * NOTE(review): the second parameter's declaration, the relation between
 * the loop counter j and the selector i, the final g_strconcat()
 * argument and the clean-up statements under the trailing mask tests are
 * not visible in this view.
 */
394 append_locale_variants (GPtrArray
*array
,
397 gchar
*language
= NULL
;
398 gchar
*territory
= NULL
;
399 gchar
*codeset
= NULL
;
400 gchar
*modifier
= NULL
;
405 g_return_if_fail (locale
!= NULL
);
407 mask
= explode_locale (locale
, &language
, &territory
, &codeset
, &modifier
);
409 /* Iterate through all possible combinations, from least attractive
410 * to most attractive.
412 for (j
= 0; j
<= mask
; ++j
)
416 if ((i
& ~mask
) == 0)
418 gchar
*val
= g_strconcat (language
,
419 (i
& COMPONENT_TERRITORY
) ? territory
: "",
420 (i
& COMPONENT_CODESET
) ? codeset
: "",
421 (i
& COMPONENT_MODIFIER
) ? modifier
: "",
423 g_ptr_array_add (array
, val
);
428 if (mask
& COMPONENT_CODESET
)
430 if (mask
& COMPONENT_TERRITORY
)
432 if (mask
& COMPONENT_MODIFIER
)
437 * g_get_locale_variants:
438 * @locale: a locale identifier
440 * Returns a list of derived variants of @locale, which can be used to
441 * e.g. construct locale-dependent filenames or search paths. The returned
442 * list is sorted from most desirable to least desirable.
443 * This function handles territory, charset and extra locale modifiers.
445 * For example, if @locale is "fr_BE", then the returned list is "fr_BE", "fr".
448 * If you need the list of variants for the current locale,
449 * use g_get_language_names().
451 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
452 * allocated array of newly allocated strings with the locale variants. Free with g_strfreev().
/* Implementation notes: the variants are collected in a GPtrArray, a NULL
 * terminator is appended, and the array's underlying pointer block is
 * handed to the caller.
 */
458 g_get_locale_variants (const gchar
*locale
)
462 g_return_val_if_fail (locale
!= NULL
, NULL
);
/* Reserve room for 8 pointers up front. */
464 array
= g_ptr_array_sized_new (8);
465 append_locale_variants (array
, locale
);
/* Terminate the list. */
466 g_ptr_array_add (array
, NULL
);
/* FALSE: release only the GPtrArray wrapper and keep the element data,
 * which becomes the return value.
 */
468 return (gchar
**) g_ptr_array_free (array
, FALSE
);
471 /* The following is (partly) taken from the gettext package.
472 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
/* Looks for a non-empty locale specification in the environment,
 * checking `LANGUAGE`, `LC_ALL`, the variable named by @category_name
 * and `LANG` in that order. When G_PLATFORM_WIN32 is defined, the value
 * from g_win32_getlocale() is additionally converted to an interned
 * string.
 *
 * NOTE(review): the statements executed when a check succeeds and the
 * final fallback are not visible in this view; presumably the first
 * non-empty value is returned -- confirm.
 */
475 guess_category_value (const gchar
*category_name
)
479 /* The highest priority value is the 'LANGUAGE' environment
480 variable. This is a GNU extension. */
481 retval
= g_getenv ("LANGUAGE");
482 if ((retval
!= NULL
) && (retval
[0] != '\0'))
485 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
486 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
487 systems this can be done by the 'setlocale' function itself. */
489 /* Setting of LC_ALL overwrites all other. */
490 retval
= g_getenv ("LC_ALL");
491 if ((retval
!= NULL
) && (retval
[0] != '\0'))
494 /* Next comes the name of the desired category. */
495 retval
= g_getenv (category_name
);
496 if ((retval
!= NULL
) && (retval
[0] != '\0'))
499 /* Last possibility is the LANG environment variable. */
500 retval
= g_getenv ("LANG");
501 if ((retval
!= NULL
) && (retval
[0] != '\0'))
504 #ifdef G_PLATFORM_WIN32
505 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
506 * LANG, which we already did above. Oh well. The main point of
507 * calling g_win32_getlocale() is to get the thread's locale as used
508 * by Windows and the Microsoft C runtime (in the "English_United
509 * States" format) translated into the Unixish format.
512 char *locale
= g_win32_getlocale ();
513 retval
= g_intern_string (locale
);
/* Cached language list; instances are stored as values in the hash table
 * that g_get_language_names_with_category() keys by category name.
 * NOTE(review): only the language_names member is visible in this view;
 * other code in this file also accesses a member named languages.
 */
522 typedef struct _GLanguageNamesCache GLanguageNamesCache
;
524 struct _GLanguageNamesCache
{
/* NULL-terminated list of locale names built for the category. */
526 gchar
**language_names
;
/* Destructor for a GLanguageNamesCache; it is the value-destroy function
 * given to g_hash_table_new_full() in
 * g_get_language_names_with_category().
 * @data is the GLanguageNamesCache pointer being released.
 * NOTE(review): whether the record itself is freed is not visible in
 * this view.
 */
530 language_names_cache_free (gpointer data
)
532 GLanguageNamesCache
*cache
= data
;
/* Release the stored language specification string. */
533 g_free (cache
->languages
);
/* Release the name list together with every string in it. */
534 g_strfreev (cache
->language_names
);
539 * g_get_language_names:
541 * Computes a list of applicable locale names, which can be used to
542 * e.g. construct locale-dependent filenames or search paths. The returned
543 * list is sorted from most desirable to least desirable and always contains
544 * the default locale "C".
546 * For example, if LANGUAGE=de:en_US, then the returned list is
547 * "de", "en_US", "en", "C".
549 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
550 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the user.
553 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
554 * that must not be modified or freed.
/* Convenience wrapper: delegates to g_get_language_names_with_category()
 * with the fixed category name "LC_MESSAGES" and returns its result
 * unchanged.
 */
558 const gchar
* const *
559 g_get_language_names (void)
561 return g_get_language_names_with_category ("LC_MESSAGES");
565 * g_get_language_names_with_category:
566 * @category_name: a locale category name
568 * Computes a list of applicable locale names with a locale category name,
569 * which can be used to construct the fallback locale-dependent filenames
570 * or search paths. The returned list is sorted from most desirable to
571 * least desirable and always contains the default locale "C".
573 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
574 * @category_name, and `LANG` to find the list of locales specified by the user.
577 * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
579 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
580 * that must not be modified or freed.
/* Implementation notes: results are cached in a hash table kept in a
 * GPrivate slot, keyed by category name. An entry is rebuilt whenever
 * the language specification for its category differs from the one it
 * was built from.
 *
 * The slot's destructor is g_hash_table_unref(), so the table itself is
 * released together with its entries. The previous destructor,
 * g_hash_table_remove_all(), only emptied the table and leaked it.
 *
 * NOTE(review): the conditional guarding the table creation, the
 * handling of an absent language specification, the local declarations
 * inside the rebuild branch and the release of the split list are not
 * visible in this view.
 */
584 const gchar
* const *
585 g_get_language_names_with_category (const gchar
*category_name
)
587 static GPrivate cache_private
= G_PRIVATE_INIT ((void (*)(gpointer
)) g_hash_table_unref
);
588 GHashTable
*cache
= g_private_get (&cache_private
);
589 const gchar
*languages
;
590 GLanguageNamesCache
*name_cache
;
592 g_return_val_if_fail (category_name
!= NULL
, NULL
);
/* The table owns its keys (released with g_free) and its values
 * (released with language_names_cache_free).
 */
596 cache
= g_hash_table_new_full (g_str_hash
, g_str_equal
,
597 g_free
, language_names_cache_free
);
598 g_private_set (&cache_private
, cache
);
601 languages
= guess_category_value (category_name
);
/* Reuse the cached list only if it was built from the same specification. */
605 name_cache
= (GLanguageNamesCache
*) g_hash_table_lookup (cache
, category_name
);
606 if (!(name_cache
&& name_cache
->languages
&&
607 strcmp (name_cache
->languages
, languages
) == 0))
/* Drop the stale entry before building its replacement. */
612 g_hash_table_remove (cache
, category_name
);
614 array
= g_ptr_array_sized_new (8);
/* The specification is a colon-separated list; each element is
 * un-aliased and expanded into its variants.
 */
616 alist
= g_strsplit (languages
, ":", 0);
617 for (a
= alist
; *a
; a
++)
618 append_locale_variants (array
, unalias_lang (*a
));
/* "C" is always appended as the last name, followed by the terminator. */
620 g_ptr_array_add (array
, g_strdup ("C"));
621 g_ptr_array_add (array
, NULL
);
623 name_cache
= g_new0 (GLanguageNamesCache
, 1);
624 name_cache
->languages
= g_strdup (languages
);
/* FALSE: keep the element data, which becomes the cached name list. */
625 name_cache
->language_names
= (gchar
**) g_ptr_array_free (array
, FALSE
);
626 g_hash_table_insert (cache
, g_strdup (category_name
), name_cache
);
629 return (const gchar
* const *) name_cache
->language_names
;