Revert "contrib: Allow a minimum version to be required"
[vlc.git] / src / text / unicode.c
blob54f05480dd5ae3e567f7fe538f2ce815fa4505f9
1 /*****************************************************************************
2 * unicode.c: Unicode <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005-2006 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
25 * Preamble
26 *****************************************************************************/
27 #ifdef HAVE_CONFIG_H
28 # include "config.h"
29 #endif
31 #include <vlc_common.h>
33 #include "libvlc.h"
34 #include <vlc_charset.h>
36 #include <assert.h>
38 #include <stdio.h>
39 #include <stdarg.h>
40 #include <stdlib.h>
41 #include <sys/types.h>
42 #if defined(_WIN32)
43 # include <io.h>
44 #endif
45 #include <errno.h>
46 #include <wctype.h>
/**
 * Formats an UTF-8 string as vfprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf()-style format string, UTF-8 encoded
 * @param ap arguments consumed by the format string
 * @return on non-Windows platforms, whatever vfprintf() returns;
 * on Windows, the value returned by vasprintf() for the formatted string
 * on success, or -1 on error.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
#ifndef _WIN32
    return vfprintf (stream, fmt, ap);
#else
    char *str;
    int res = vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;

#if !VLC_WINSTORE_APP
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        wchar_t *wide = ToWide (str);
        if (likely(wide != NULL))
        {
            HANDLE h = (HANDLE)((uintptr_t)_get_osfhandle (fd));
            DWORD out;
            /* WriteConsoleW() expects a length counted in characters
             * (wchar_t units), not in bytes, hence wcslen(). */
            BOOL ok = WriteConsoleW (h, wide, wcslen (wide), &out, NULL);
            free (wide);
            if (ok)
                goto out;
        }
    }
#endif
    /* Not a console (or the console write failed): fall back to wide
     * character standard I/O. */
    wchar_t *wide = ToWide(str);
    if (likely(wide != NULL))
    {
        /* fputws() only reports success (non-negative) or failure (EOF).
         * Keep the length computed by vasprintf() as the result so that
         * both Windows code paths return the same thing on success. */
        if (fputws(wide, stream) < 0)
            res = -1;
        free(wide);
    }
    else
        res = -1;
out:
    free (str);
    return res;
#endif
}
/**
 * Formats an UTF-8 string as fprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * This is the variadic front-end of utf8_vfprintf().
 *
 * @param stream output stream
 * @param fmt printf()-style format string, UTF-8 encoded
 * @return the value returned by utf8_vfprintf()
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list args;

    va_start( args, fmt );
    const int ret = utf8_vfprintf( stream, fmt, args );
    va_end( args );

    return ret;
}
/**
 * Decodes the first UTF-8 sequence of a nul-terminated string.
 *
 * @param str nul-terminated string to decode (must not be NULL)
 * @param pwc storage for the decoded code point; not written on error
 * @return 0 if the string is empty (in which case *pwc is set to 0),
 * the number of bytes of the decoded sequence (1 to 4) on success,
 * or (size_t)-1 if the string does not start with a valid UTF-8 sequence.
 */
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    assert (str != NULL);

    const uint8_t *bytes = (const uint8_t *)str;
    const uint8_t lead = bytes[0];

    if (lead < 0x80)
    {   /* 7-bit ASCII character (nul terminator included) -> short cut */
        *pwc = lead;
        return lead != '\0';
    }
    if (lead < 0xC0) /* continuation byte in leading position */
        return (size_t)-1;
    if (lead < 0xC2) /* 0xC0 and 0xC1 could only encode ASCII (overlong) */
        return (size_t)-1;
    if (lead > 0xF4) /* no valid sequence starts with such a byte */
        return (size_t)-1;

    /* The leading byte gives the sequence length and the top bits. */
    int length;
    uint32_t code;

    if (lead < 0xE0)
    {
        length = 2;
        code = (lead & 0x1F) << 6;
    }
    else if (lead < 0xF0)
    {
        length = 3;
        code = (lead & 0x0F) << 12;
    }
    else
    {
        length = 4;
        code = (lead & 0x07) << 18;
    }

    /* Continuation bytes are consumed one at a time, bailing out as soon as
     * one is invalid, so that nothing is read past a nul terminator. */
    const uint8_t *p = bytes;
    uint8_t cont;

    if (length == 4)
    {
        cont = *++p;
        if ((cont & 0xC0) != 0x80) /* not a continuation byte */
            return (size_t)-1;
        code |= (cont & 0x3F) << 12;

        if (code >= 0x110000) /* beyond Unicode range */
            return (size_t)-1;
    }

    if (length >= 3)
    {
        cont = *++p;
        if ((cont & 0xC0) != 0x80) /* not a continuation byte */
            return (size_t)-1;
        code |= (cont & 0x3F) << 6;

        if (code >= 0xD800 && code < 0xE000) /* UTF-16 surrogate */
            return (size_t)-1;
        /* Smallest code point needing `length` bytes:
         * 0x800 for 3 bytes, 0x10000 for 4 bytes. */
        if (code < (1u << (5 * length - 4))) /* non-ASCII overlong */
            return (size_t)-1;
    }

    cont = *++p;
    if ((cont & 0xC0) != 0x80) /* not a continuation byte */
        return (size_t)-1;
    code |= cont & 0x3F;

    *pwc = code;
    return length;
}
188 * Look for an UTF-8 string within another one in a case-insensitive fashion.
189 * Beware that this is quite slow. Contrary to strcasestr(), this function
190 * works regardless of the system character encoding, and handles multibyte
191 * code points correctly.
193 * @param haystack string to look into
194 * @param needle string to look for
195 * @return a pointer to the first occurrence of the needle within the haystack,
196 * or NULL if no occurrence were found.
198 char *vlc_strcasestr (const char *haystack, const char *needle)
200 ssize_t s;
204 const char *h = haystack, *n = needle;
206 for (;;)
208 uint32_t cph, cpn;
210 s = vlc_towc (n, &cpn);
211 if (s == 0)
212 return (char *)haystack;
213 if (unlikely(s < 0))
214 return NULL;
215 n += s;
217 s = vlc_towc (h, &cph);
218 if (s <= 0 || towlower (cph) != towlower (cpn))
219 break;
220 h += s;
223 s = vlc_towc (haystack, &(uint32_t) { 0 });
224 haystack += s;
226 while (s > 0);
228 return NULL;
232 * Converts a string from the given character encoding to utf-8.
234 * @return a nul-terminated utf-8 string, or null in case of error.
235 * The result must be freed using free().
237 char *FromCharset(const char *charset, const void *data, size_t data_size)
239 vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
240 if (handle == (vlc_iconv_t)(-1))
241 return NULL;
243 char *out = NULL;
244 for(unsigned mul = 4; mul < 8; mul++ )
246 size_t in_size = data_size;
247 const char *in = data;
248 size_t out_max = mul * data_size;
249 char *tmp = out = malloc (1 + out_max);
250 if (!out)
251 break;
253 if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
254 *tmp = '\0';
255 break;
257 free(out);
258 out = NULL;
260 if (errno != E2BIG)
261 break;
263 vlc_iconv_close(handle);
264 return out;
268 * Converts a nul-terminated UTF-8 string to a given character encoding.
269 * @param charset iconv name of the character set
270 * @param in nul-terminated UTF-8 string
271 * @param outsize pointer to hold the byte size of result
273 * @return A pointer to the result, which must be released using free().
274 * The UTF-8 nul terminator is included in the conversion if the target
275 * character encoding supports it. However it is not included in the returned
276 * byte size.
277 * In case of error, NULL is returned and the byte size is undefined.
279 void *ToCharset(const char *charset, const char *in, size_t *outsize)
281 vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
282 if (hd == (vlc_iconv_t)(-1))
283 return NULL;
285 const size_t inlen = strlen (in);
286 void *res;
288 for (unsigned mul = 4; mul < 16; mul++)
290 size_t outlen = mul * (inlen + 1);
291 res = malloc (outlen);
292 if (unlikely(res == NULL))
293 break;
295 const char *inp = in;
296 char *outp = res;
297 size_t inb = inlen;
298 size_t outb = outlen - mul;
300 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
302 *outsize = outlen - mul - outb;
303 outb += mul;
304 inb = 1; /* append nul terminator if possible */
305 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
306 break;
307 if (errno == EILSEQ) /* cannot translate nul terminator!? */
308 break;
311 free (res);
312 res = NULL;
313 if (errno != E2BIG) /* conversion failure */
314 break;
316 vlc_iconv_close (hd);
317 return res;