1 /*****************************************************************************
2 * unicode.c: Unicode <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005-2006 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
31 #include <vlc_common.h>
34 #include <vlc_charset.h>
41 #include <sys/types.h>
51 * Releases (if needed) a localized or uniformized string.
52 * @param str non-NULL return value from FromLocale() or ToLocale().
54 void LocaleFree (const char *str
)
65 * Converts a string from the system locale character encoding to UTF-8.
67 * @param locale nul-terminated string to convert
69 * @return a nul-terminated UTF-8 string, or NULL in case of error.
70 * To avoid memory leak, you have to pass the result to LocaleFree()
71 * when it is no longer needed.
73 char *FromLocale (const char *locale
)
76 return (char *)locale
;
78 return locale
? FromCharset ("", locale
, strlen(locale
)) : NULL
;
83 * converts a string from the system locale character encoding to utf-8,
84 * the result is always allocated on the heap.
86 * @param locale nul-terminated string to convert
88 * @return a nul-terminated utf-8 string, or null in case of error.
89 * The result must be freed using free() - as with the strdup() function.
91 char *FromLocaleDup (const char *locale
)
94 return strdup (locale
);
96 return FromCharset ("", locale
, strlen(locale
));
102 * ToLocale: converts an UTF-8 string to local system encoding.
104 * @param utf8 nul-terminated string to be converted
106 * @return a nul-terminated string, or NULL in case of error.
107 * To avoid memory leak, you have to pass the result to LocaleFree()
108 * when it is no longer needed.
110 char *ToLocale (const char *utf8
)
116 return utf8
? ToCharset ("", utf8
, &outsize
) : NULL
;
122 * converts a string from UTF-8 to the system locale character encoding,
123 * the result is always allocated on the heap.
125 * @param utf8 nul-terminated string to convert
127 * @return a nul-terminated string, or null in case of error.
128 * The result must be freed using free() - as with the strdup() function.
130 char *ToLocaleDup (const char *utf8
)
133 return strdup (utf8
);
136 return ToCharset ("", utf8
, &outsize
);
141 * Formats an UTF-8 string as vfprintf(), then print it, with
142 * appropriate conversion to local encoding.
144 int utf8_vfprintf( FILE *stream
, const char *fmt
, va_list ap
)
147 return vfprintf (stream
, fmt
, ap
);
152 # if defined( WIN32 ) && !defined( UNDER_CE )
153 /* Writing to the console is a lot of fun on Microsoft Windows.
154 * If you use the standard I/O functions, you must use the OEM code page,
155 * which is different from the usual ANSI code page. Or maybe not, if the
156 * user called "chcp". Anyway, we prefer Unicode. */
157 int fd
= _fileno (stream
);
158 if (likely(fd
!= -1) && _isatty (fd
))
160 res
= vasprintf (&str
, fmt
, ap
);
161 if (unlikely(res
== -1))
164 size_t wlen
= 2 * (res
+ 1);
165 wchar_t *wide
= malloc (wlen
);
166 if (likely(wide
!= NULL
))
168 wlen
= MultiByteToWideChar (CP_UTF8
, 0, str
, res
+ 1, wide
, wlen
);
171 HANDLE h
= (HANDLE
)(intptr_t)_get_osfhandle (fd
);
174 WriteConsoleW (h
, wide
, wlen
- 1, &out
, NULL
);
187 res
= vasprintf (&str
, fmt
, ap
);
188 if (unlikely(res
== -1))
191 char *ansi
= ToLocaleDup (str
);
196 fputs (ansi
, stream
);
203 * Formats an UTF-8 string as fprintf(), then print it, with
204 * appropriate conversion to local encoding.
206 int utf8_fprintf( FILE *stream
, const char *fmt
, ... )
212 res
= utf8_vfprintf( stream
, fmt
, ap
);
/**
 * Converts the first character from a UTF-8 sequence into a code point.
 *
 * Only well-formed UTF-8 is accepted: overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF), code points beyond U+10FFFF, stray continuation bytes
 * and truncated sequences are all rejected.
 *
 * @param str a nul-terminated UTF-8 byte sequence (must not be NULL)
 * @param pwc where the decoded code point is stored on success
 *            (left untouched when the sequence is invalid)
 * @return 0 if str points to an empty string, i.e. the first character is NUL;
 * number of bytes that the first character occupies (from 1 to 4) otherwise;
 * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
 */
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    const uint8_t *ptr = (const uint8_t *)str;
    uint8_t c;
    uint32_t cp;
    unsigned charlen;

    assert (str != NULL);

    c = *ptr;
    if (c < 0x80) // 7-bit ASCII character -> short cut
    {
        *pwc = c;
        return c != '\0';
    }
    if (c < 0xC2) // continuation byte (0x80-0xBF) or ASCII overlong (0xC0-0xC1)
        return (size_t)-1;
    if (c > 0xF4) // lead byte that can only encode values above U+10FFFF
        return (size_t)-1;

    /* Sequence length and payload bits of the lead byte */
    if (c < 0xE0)
    {
        charlen = 2;
        cp = (uint32_t)(c & 0x1F) << 6;
    }
    else if (c < 0xF0)
    {
        charlen = 3;
        cp = (uint32_t)(c & 0x0F) << 12;
    }
    else
    {
        charlen = 4;
        /* 3 payload bits followed by 3 x 6 continuation bits: they must
         * land at bits 18..20 so as not to overlap the second byte. */
        cp = (uint32_t)(c & 0x07) << 18;
    }

    /* Unrolled continuation bytes decoding. A NUL terminator is not a
     * continuation byte, so truncated sequences never read past the end. */
    switch (charlen)
    {
        case 4:
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F) << 12;

            if (cp >= 0x110000) // beyond Unicode range
                return (size_t)-1;
            /* fall through */
        case 3:
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F) << 6;

            if (cp >= 0xD800 && cp < 0xE000) // UTF-16 surrogate
                return (size_t)-1;
            /* Smallest value needing 3 bytes is 0x800 (1 << 11), and
             * 4 bytes is 0x10000 (1 << 16). */
            if (cp < (1u << (5 * charlen - 4))) // non-ASCII overlong
                return (size_t)-1;
            /* fall through */
        default: /* charlen == 2 */
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F);
            break;
    }

    *pwc = cp;
    return charlen;
}
301 * Look for an UTF-8 string within another one in a case-insensitive fashion.
302 * Beware that this is quite slow. Contrary to strcasestr(), this function
303 * works regardless of the system character encoding, and handles multibyte
304 * code points correctly.
306 * @param haystack string to look into
307 * @param needle string to look for
308 * @return a pointer to the first occurrence of the needle within the haystack,
309 * or NULL if no occurrence was found.
311 char *vlc_strcasestr (const char *haystack
, const char *needle
)
317 const char *h
= haystack
, *n
= needle
;
323 s
= vlc_towc (n
, &cpn
);
325 return (char *)haystack
;
330 s
= vlc_towc (h
, &cph
);
331 if (s
<= 0 || towlower (cph
) != towlower (cpn
))
336 s
= vlc_towc (haystack
, &(uint32_t) { 0 });
345 * Replaces invalid/overlong UTF-8 sequences with question marks.
346 * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
347 * so we don't try that, even though it would be less disruptive.
349 * @return str if it was valid UTF-8, NULL if not.
351 char *EnsureUTF8( char *str
)
357 while ((n
= vlc_towc (str
, &cp
)) != 0)
358 if (likely(n
!= (size_t)-1))
370 * Checks whether a string is a valid UTF-8 byte sequence.
372 * @param str nul-terminated string to be checked
374 * @return str if it was valid UTF-8, NULL if not.
376 const char *IsUTF8( const char *str
)
381 while ((n
= vlc_towc (str
, &cp
)) != 0)
382 if (likely(n
!= (size_t)-1))
390 * Converts a string from the given character encoding to utf-8.
392 * @return a nul-terminated utf-8 string, or null in case of error.
393 * The result must be freed using free().
395 char *FromCharset(const char *charset
, const void *data
, size_t data_size
)
397 vlc_iconv_t handle
= vlc_iconv_open ("UTF-8", charset
);
398 if (handle
== (vlc_iconv_t
)(-1))
402 for(unsigned mul
= 4; mul
< 8; mul
++ )
404 size_t in_size
= data_size
;
405 const char *in
= data
;
406 size_t out_max
= mul
* data_size
;
407 char *tmp
= out
= malloc (1 + out_max
);
411 if (vlc_iconv (handle
, &in
, &in_size
, &tmp
, &out_max
) != (size_t)(-1)) {
421 vlc_iconv_close(handle
);
426 * Converts a nul-terminated UTF-8 string to a given character encoding.
427 * @param charset iconv name of the character set
428 * @param in nul-terminated UTF-8 string
429 * @param outsize pointer to hold the byte size of result
431 * @return A pointer to the result, which must be released using free().
432 * The UTF-8 nul terminator is included in the conversion if the target
433 * character encoding supports it. However it is not included in the returned byte size.
435 * In case of error, NULL is returned and the byte size is undefined.
437 void *ToCharset(const char *charset
, const char *in
, size_t *outsize
)
439 vlc_iconv_t hd
= vlc_iconv_open (charset
, "UTF-8");
440 if (hd
== (vlc_iconv_t
)(-1))
443 const size_t inlen
= strlen (in
);
446 for (unsigned mul
= 4; mul
< 16; mul
++)
448 size_t outlen
= mul
* (inlen
+ 1);
449 res
= malloc (outlen
);
450 if (unlikely(res
== NULL
))
453 const char *inp
= in
;
456 size_t outb
= outlen
- mul
;
458 if (vlc_iconv (hd
, &inp
, &inb
, &outp
, &outb
) != (size_t)(-1))
460 *outsize
= outlen
- mul
- outb
;
462 inb
= 1; /* append nul terminator if possible */
463 if (vlc_iconv (hd
, &inp
, &inb
, &outp
, &outb
) != (size_t)(-1))
465 if (errno
== EILSEQ
) /* cannot translate nul terminator!? */
471 if (errno
!= E2BIG
) /* conversion failure */
474 vlc_iconv_close (hd
);