Fixed a crash caused by yadif deinterlacer on Windows XP
[vlc/solaris.git] / src / text / unicode.c
blobc71a534b6499a989c5f31c9d403a80d4d6bfe3c1
1 /*****************************************************************************
2 * unicode.c: Unicode <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005-2006 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
25 * Preamble
26 *****************************************************************************/
27 #ifdef HAVE_CONFIG_H
28 # include "config.h"
29 #endif
31 #include <vlc_common.h>
33 #include "libvlc.h"
34 #include <vlc_charset.h>
36 #include <assert.h>
38 #include <stdio.h>
39 #include <stdarg.h>
40 #include <stdlib.h>
41 #include <sys/types.h>
42 #ifdef UNDER_CE
43 # include <tchar.h>
44 #elif defined(WIN32)
45 # include <io.h>
46 #endif
47 #include <errno.h>
48 #include <wctype.h>
/**
 * Disposes of a string previously obtained from FromLocale() or ToLocale().
 *
 * When ASSUME_UTF8 is defined the argument is ignored; otherwise it is
 * passed to free().
 *
 * @param str non-NULL return value from FromLocale() or ToLocale().
 */
void LocaleFree (const char *str)
{
#ifndef ASSUME_UTF8
    /* The const qualifier is only part of the public prototype. */
    char *owned = (char *)str;
    free (owned);
#else
    (void) str;
#endif
}
/**
 * Converts a string from the system locale character encoding to UTF-8.
 *
 * When ASSUME_UTF8 is defined, the input pointer itself is returned and
 * no conversion takes place.
 *
 * @param locale nul-terminated string to convert (may be NULL)
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error or if
 * locale was NULL.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *FromLocale (const char *locale)
{
#ifndef ASSUME_UTF8
    if (locale == NULL)
        return NULL;

    /* An empty charset name is what this file uses for the locale encoding. */
    return FromCharset ("", locale, strlen (locale));
#else
    return (char *)locale;
#endif
}
/**
 * Converts a string from the system locale character encoding to UTF-8;
 * the result is always allocated on the heap.
 *
 * @param locale nul-terminated string to convert
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *FromLocaleDup (const char *locale)
{
#ifndef ASSUME_UTF8
    const size_t length = strlen (locale);

    return FromCharset ("", locale, length);
#else
    /* Already UTF-8: a plain copy is all that is needed. */
    return strdup (locale);
#endif
}
/**
 * Converts an UTF-8 string to the local system encoding.
 *
 * When ASSUME_UTF8 is defined, the input pointer itself is returned and
 * no conversion takes place.
 *
 * @param utf8 nul-terminated string to be converted (may be NULL)
 *
 * @return a nul-terminated string, or NULL in case of error or if utf8
 * was NULL.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *ToLocale (const char *utf8)
{
#ifndef ASSUME_UTF8
    if (utf8 == NULL)
        return NULL;

    /* ToCharset() needs somewhere to store the size; it is not used here. */
    size_t ignored_size;
    return ToCharset ("", utf8, &ignored_size);
#else
    return (char *)utf8;
#endif
}
/**
 * Converts a string from UTF-8 to the system locale character encoding;
 * the result is always allocated on the heap.
 *
 * @param utf8 nul-terminated string to convert
 *
 * @return a nul-terminated string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *ToLocaleDup (const char *utf8)
{
#ifndef ASSUME_UTF8
    /* ToCharset() needs somewhere to store the size; it is not used here. */
    size_t ignored_size;

    return ToCharset ("", utf8, &ignored_size);
#else
    /* Already in the right encoding: a plain copy is all that is needed. */
    return strdup (utf8);
#endif
}
/**
 * Formats an UTF-8 string as vfprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return with ASSUME_UTF8, the result of vfprintf(); otherwise the byte
 * length of the formatted UTF-8 string as reported by vasprintf(), or -1
 * if formatting, conversion or writing failed.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
#ifdef ASSUME_UTF8
    return vfprintf (stream, fmt, ap);
#else
    char *str;
    int res;

# if defined( WIN32 ) && !defined( UNDER_CE )
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        res = vasprintf (&str, fmt, ap);
        if (unlikely(res == -1))
            return -1;

        /* UTF-16 never needs more code units than UTF-8 needs bytes, so
         * res + 1 wide characters (terminator included) always suffice. */
        const int wmax = res + 1;
        wchar_t *wide = malloc ((size_t)wmax * sizeof (*wide));
        if (likely(wide != NULL))
        {
            /* The output capacity is expressed in wide characters,
             * not in bytes. */
            int wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1,
                                            wide, wmax);
            if (wlen > 0)
            {
                HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
                DWORD out;

                /* wlen counts the terminator, which must not be printed. */
                if (!WriteConsoleW (h, wide, wlen - 1, &out, NULL))
                    res = -1;
            }
            else
                res = -1;
            free (wide);
        }
        else
            res = -1;
        free (str);
        return res;
    }
# endif

    res = vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;

    char *ansi = ToLocaleDup (str);
    free (str);

    if (ansi == NULL)
        return -1;
    /* Report output errors instead of pretending the text was written. */
    if (fputs (ansi, stream) == EOF)
        res = -1;
    free (ansi);
    return res;
#endif
}
/**
 * Formats an UTF-8 string as fprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * This is the variadic front-end of utf8_vfprintf(), whose result is
 * returned unchanged.
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list args;

    va_start( args, fmt );
    const int written = utf8_vfprintf( stream, fmt, args );
    va_end( args );

    return written;
}
/**
 * Converts the first character from a UTF-8 sequence into a code point.
 *
 * Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values beyond
 * U+10FFFF are rejected.
 *
 * @param str an UTF-8 bytes sequence (must not be NULL)
 * @param pwc storage for the decoded code point; it is only written when
 *            the function does not return -1
 * @return 0 if str points to an empty string, i.e. the first character is NUL;
 * number of bytes that the first character occupies (from 1 to 4) otherwise;
 * -1 if the byte sequence was not a valid UTF-8 sequence.
 */
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    const uint8_t *ptr = (const uint8_t *)str;
    uint8_t c;
    uint32_t cp;
    int charlen;

    assert (str != NULL);

    c = *ptr;
    if (c < 0x80)
    {   // 7-bit ASCII character -> short cut
        *pwc = c;
        return c != '\0';
    }

    if (c < 0xC0) // continuation byte -> error
        return (size_t)-1;
    if (c < 0xC2) // ASCII overlong
        return (size_t)-1;
    if (c > 0xF4) // lead byte that can only encode values above U+10FFFF
        return (size_t)-1;

    /* The lead byte carries the most significant bits of the code point:
     * 5 bits above one continuation byte, 4 bits above two, 3 above three. */
    if (c < 0xE0)
    {
        charlen = 2;
        cp = (uint32_t)(c & 0x1F) << 6;
    }
    else if (c < 0xF0)
    {
        charlen = 3;
        cp = (uint32_t)(c & 0x0F) << 12;
    }
    else
    {
        charlen = 4;
        cp = (uint32_t)(c & 0x07) << 18;
    }

    /* Unrolled continuation bytes decoding */
    switch (charlen)
    {
        case 4:
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F) << 12;

            if (cp >= 0x110000) // beyond Unicode range
                return (size_t)-1;
            /* fall through */
        case 3:
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F) << 6;

            /* The low 6 bits are still zero here, which is enough to
             * classify the value for both range checks below. */
            if (cp >= 0xD800 && cp < 0xE000) // UTF-16 surrogate
                return (size_t)-1;
            if (cp < (1u << (5 * charlen - 4))) // non-ASCII overlong
                return (size_t)-1;
            /* fall through */
        case 2:
            c = *++ptr;
            if ((c >> 6) != 2) // not a continuation byte
                return (size_t)-1;
            cp |= (uint32_t)(c & 0x3F);
            break;
    }

    *pwc = cp;
    return (size_t)charlen;
}
/**
 * Look for an UTF-8 string within another one in a case-insensitive fashion.
 * Beware that this is quite slow. Contrary to strcasestr(), this function
 * works regardless of the system character encoding, and handles multibyte
 * code points correctly.
 *
 * The search stops at the first invalid UTF-8 sequence found in either
 * string.
 *
 * @param haystack string to look into
 * @param needle string to look for
 * @return a pointer to the first occurrence of the needle within the haystack,
 * or NULL if no occurrence was found.
 */
char *vlc_strcasestr (const char *haystack, const char *needle)
{
    ssize_t s;

    do
    {
        const char *h = haystack, *n = needle;

        for (;;)
        {
            uint32_t cph, cpn;

            s = vlc_towc (n, &cpn);
            if (s == 0) /* whole needle matched at this position */
                return (char *)haystack;
            if (unlikely(s < 0))
                return NULL;
            n += s;

            s = vlc_towc (h, &cph);
            if (s <= 0 || towlower (cph) != towlower (cpn))
                break;
            h += s;
        }

        /* Move the candidate position forward by one code point. */
        s = vlc_towc (haystack, &(uint32_t) { 0 });
        if (unlikely(s < 0))
            /* Invalid sequence: its length is unknown, and adding -1 would
             * step backwards and never terminate. */
            return NULL;
        haystack += s;
    }
    while (s != 0);

    return NULL;
}
/**
 * Replaces invalid/overlong UTF-8 sequences with question marks.
 * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
 * so we don't try that, even though it would be less disruptive.
 *
 * The string is modified in place: each byte at which decoding fails is
 * overwritten with '?' and decoding resumes at the following byte.
 *
 * @param str nul-terminated string to sanitize
 * @return str if it was valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    char *result = str;
    uint32_t cp;

    for (;;)
    {
        const size_t len = vlc_towc (str, &cp);
        if (len == 0)
            break; /* reached the terminator */

        if (unlikely(len == (size_t)-1))
        {
            *str++ = '?';
            result = NULL;
        }
        else
            str += len;
    }
    return result;
}
/**
 * Checks whether a string is a valid UTF-8 byte sequence.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    /* Keep the start of the string: it is the documented return value,
     * whereas the cursor ends up on the nul terminator. */
    const char *cursor = str;
    size_t n;
    uint32_t cp;

    while ((n = vlc_towc (cursor, &cp)) != 0)
        if (likely(n != (size_t)-1))
            cursor += n;
        else
            return NULL;
    return str;
}
390 * Converts a string from the given character encoding to utf-8.
392 * @return a nul-terminated utf-8 string, or null in case of error.
393 * The result must be freed using free().
395 char *FromCharset(const char *charset, const void *data, size_t data_size)
397 vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
398 if (handle == (vlc_iconv_t)(-1))
399 return NULL;
401 char *out = NULL;
402 for(unsigned mul = 4; mul < 8; mul++ )
404 size_t in_size = data_size;
405 const char *in = data;
406 size_t out_max = mul * data_size;
407 char *tmp = out = malloc (1 + out_max);
408 if (!out)
409 break;
411 if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
412 *tmp = '\0';
413 break;
415 free(out);
416 out = NULL;
418 if (errno != E2BIG)
419 break;
421 vlc_iconv_close(handle);
422 return out;
426 * Converts a nul-terminated UTF-8 string to a given character encoding.
427 * @param charset iconv name of the character set
428 * @param in nul-terminated UTF-8 string
429 * @param outsize pointer to hold the byte size of result
431 * @return A pointer to the result, which must be released using free().
432 * The UTF-8 nul terminator is included in the conversion if the target
433 * character encoding supports it. However it is not included in the returned
434 * byte size.
435 * In case of error, NULL is returned and the byte size is undefined.
437 void *ToCharset(const char *charset, const char *in, size_t *outsize)
439 vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
440 if (hd == (vlc_iconv_t)(-1))
441 return NULL;
443 const size_t inlen = strlen (in);
444 void *res;
446 for (unsigned mul = 4; mul < 16; mul++)
448 size_t outlen = mul * (inlen + 1);
449 res = malloc (outlen);
450 if (unlikely(res == NULL))
451 break;
453 const char *inp = in;
454 char *outp = res;
455 size_t inb = inlen;
456 size_t outb = outlen - mul;
458 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
460 *outsize = outlen - mul - outb;
461 outb += mul;
462 inb = 1; /* append nul terminator if possible */
463 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
464 break;
465 if (errno == EILSEQ) /* cannot translate nul terminator!? */
466 break;
469 free (res);
470 res = NULL;
471 if (errno != E2BIG) /* conversion failure */
472 break;
474 vlc_iconv_close (hd);
475 return res;