1 /*****************************************************************************
2 * unicode.c: Unicode <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005-2006 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
31 #include <vlc_common.h>
34 #include <vlc_charset.h>
41 #include <sys/types.h>
49 * Formats a UTF-8 string as vfprintf() does, then prints it, with
50 * appropriate conversion to the local encoding.
52 int utf8_vfprintf( FILE *stream
, const char *fmt
, va_list ap
)
55 return vfprintf (stream
, fmt
, ap
);
58 int res
= vasprintf (&str
, fmt
, ap
);
59 if (unlikely(res
== -1))
63 /* Writing to the console is a lot of fun on Microsoft Windows.
64 * If you use the standard I/O functions, you must use the OEM code page,
65 * which is different from the usual ANSI code page. Or maybe not, if the
66 * user called "chcp". Anyway, we prefer Unicode. */
67 int fd
= _fileno (stream
);
68 if (likely(fd
!= -1) && _isatty (fd
))
70 wchar_t *wide
= ToWide (str
);
71 if (likely(wide
!= NULL
))
73 HANDLE h
= (HANDLE
)((uintptr_t)_get_osfhandle (fd
));
75 /* XXX: It is not clear whether WriteConsole() wants the number of
76 * Unicode characters or the size of the wchar_t array. */
77 BOOL ok
= WriteConsoleW (h
, wide
, wcslen (wide
), &out
, NULL
);
84 wchar_t *wide
= ToWide(str
);
85 if (likely(wide
!= NULL
))
87 res
= fputws(wide
, stream
);
99 * Formats a UTF-8 string as fprintf() does, then prints it, with
100 * appropriate conversion to the local encoding.
102 int utf8_fprintf( FILE *stream
, const char *fmt
, ... )
108 res
= utf8_vfprintf( stream
, fmt
, ap
);
113 size_t vlc_towc (const char *str
, uint32_t *restrict pwc
)
115 uint8_t *ptr
= (uint8_t *)str
, c
;
118 assert (str
!= NULL
);
121 if (unlikely(c
> 0xF4))
124 int charlen
= clz8 (c
^ 0xFF);
127 case 0: // 7-bit ASCII character -> short cut
131 case 1: // continuation byte -> error
135 if (unlikely(c
< 0xC2)) // ASCII overlong
137 cp
= (c
& 0x1F) << 6;
141 cp
= (c
& 0x0F) << 12;
145 cp
= (c
& 0x07) << 18;
149 vlc_assert_unreachable ();
152 /* Unrolled continuation bytes decoding */
157 if (unlikely((c
& 0xC0) != 0x80)) // not a continuation byte
159 cp
|= (c
& 0x3F) << 12;
161 if (unlikely(cp
>= 0x110000)) // beyond Unicode range
166 if (unlikely((c
& 0xC0) != 0x80)) // not a continuation byte
168 cp
|= (c
& 0x3F) << 6;
170 if (unlikely(cp
>= 0xD800 && cp
< 0xE000)) // UTF-16 surrogate
172 if (unlikely(cp
< (1u << (5 * charlen
- 4)))) // non-ASCII overlong
177 if (unlikely((c
& 0xC0) != 0x80)) // not a continuation byte
188 * Looks for a UTF-8 string within another one in a case-insensitive fashion.
189 * Beware that this is quite slow. Contrary to strcasestr(), this function
190 * works regardless of the system character encoding, and handles multibyte
191 * code points correctly.
193 * @param haystack string to look into
194 * @param needle string to look for
195 * @return a pointer to the first occurrence of the needle within the haystack,
196 * or NULL if no occurrence was found.
198 char *vlc_strcasestr (const char *haystack
, const char *needle
)
204 const char *h
= haystack
, *n
= needle
;
210 s
= vlc_towc (n
, &cpn
);
212 return (char *)haystack
;
217 s
= vlc_towc (h
, &cph
);
218 if (s
<= 0 || towlower (cph
) != towlower (cpn
))
223 s
= vlc_towc (haystack
, &(uint32_t) { 0 });
232 * Converts a string from the given character encoding to UTF-8.
234 * @return a nul-terminated UTF-8 string, or NULL in case of error.
235 * The result must be freed using free().
237 char *FromCharset(const char *charset
, const void *data
, size_t data_size
)
239 vlc_iconv_t handle
= vlc_iconv_open ("UTF-8", charset
);
240 if (handle
== (vlc_iconv_t
)(-1))
244 for(unsigned mul
= 4; mul
< 8; mul
++ )
246 size_t in_size
= data_size
;
247 const char *in
= data
;
248 size_t out_max
= mul
* data_size
;
249 char *tmp
= out
= malloc (1 + out_max
);
253 if (vlc_iconv (handle
, &in
, &in_size
, &tmp
, &out_max
) != (size_t)(-1)) {
263 vlc_iconv_close(handle
);
268 * Converts a nul-terminated UTF-8 string to a given character encoding.
269 * @param charset iconv name of the character set
270 * @param in nul-terminated UTF-8 string
271 * @param outsize pointer to hold the byte size of result
273 * @return A pointer to the result, which must be released using free().
274 * The UTF-8 nul terminator is included in the conversion if the target
275 * character encoding supports it. However, it is not included in the returned byte size.
277 * In case of error, NULL is returned and the byte size is undefined.
279 void *ToCharset(const char *charset
, const char *in
, size_t *outsize
)
281 vlc_iconv_t hd
= vlc_iconv_open (charset
, "UTF-8");
282 if (hd
== (vlc_iconv_t
)(-1))
285 const size_t inlen
= strlen (in
);
288 for (unsigned mul
= 4; mul
< 16; mul
++)
290 size_t outlen
= mul
* (inlen
+ 1);
291 res
= malloc (outlen
);
292 if (unlikely(res
== NULL
))
295 const char *inp
= in
;
298 size_t outb
= outlen
- mul
;
300 if (vlc_iconv (hd
, &inp
, &inb
, &outp
, &outb
) != (size_t)(-1))
302 *outsize
= outlen
- mul
- outb
;
304 inb
= 1; /* append nul terminator if possible */
305 if (vlc_iconv (hd
, &inp
, &inb
, &outp
, &outb
) != (size_t)(-1))
307 if (errno
== EILSEQ
) /* cannot translate nul terminator!? */
313 if (errno
!= E2BIG
) /* conversion failure */
316 vlc_iconv_close (hd
);