1 /*****************************************************************************
2 * vlc_charset.h: Unicode UTF-8 wrapper functions
3 *****************************************************************************
4 * Copyright (C) 2003-2005 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
8 * Author: Rémi Denis-Courmont <rem # videolan,org>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
26 #define VLC_CHARSET_H 1
30 * Character sets handling
37 * Decodes a code point from UTF-8.
39 * Converts the first character in a UTF-8 sequence into a Unicode code point.
41 * \param str a UTF-8 byte sequence [IN]
42 * \param pwc address of a location to store the code point [OUT]
44 * \return the number of bytes occupied by the decoded code point
46 * \retval (size_t)-1 not a valid UTF-8 sequence
47 * \retval 0 null character (i.e. str points to an empty string)
48 * \retval 1 (non-null) ASCII character
49 * \retval 2-4 non-ASCII character
51 VLC_API
size_t vlc_towc(const char *str
, uint32_t *restrict pwc
);
54 * Checks UTF-8 validity.
56 * Checks whether a null-terminated string is a valid UTF-8 byte sequence.
58 * \param str string to check
60 * \retval str the string is a valid null-terminated UTF-8 sequence
61 * \retval NULL the string is not a valid UTF-8 sequence
/* UTF-8 validity check built on vlc_towc(): the loop keeps decoding from str
 * until vlc_towc() returns 0, and each result is compared with (size_t)-1.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, the declarations of n and cp, the statements selected by the test
 * on n, and the final return are not present here. Restore them from the
 * upstream header before relying on this block. */
63 VLC_USED
static inline const char *IsUTF8(const char *str
)
/* Decode one code point per iteration; 0 ends the loop. */
68 while ((n
= vlc_towc(str
, &cp
)) != 0)
/* likely(): the successful-decode branch is hinted as the common one. */
69 if (likely(n
!= (size_t)-1))
77 * Checks ASCII validity.
79 * Checks whether a null-terminated string is a valid ASCII byte sequence
80 * (non-printable ASCII characters 1-31 are permitted).
82 * \param str string to check
84 * \retval str the string is a valid null-terminated ASCII sequence
85 * \retval NULL the string is not an ASCII sequence
/* ASCII check: walks the string one byte at a time, loading each byte into c
 * and stopping at the terminating '\0'.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, the declaration of c, the loop body (the per-byte test) and the
 * return statements are not present here. */
87 VLC_USED
static inline const char *IsASCII(const char *str
)
/* p is a read-only cursor over str; the caller's pointer is not advanced. */
91 for (const char *p
= str
; (c
= *p
) != '\0'; p
++)
98 * Removes non-UTF-8 sequences.
100 * Replaces invalid or <i>over-long</i> UTF-8 byte sequences within a
101 * null-terminated string with question marks. This is so that the string can
102 * be printed at least partially.
104 * \warning Do not use this where correctness is critical. Use IsUTF8() and
105 * handle the error case instead. This function is mainly for display or debug.
107 * \note Converting from Latin-1 to UTF-8 in place is not possible (the string
108 * size would be increased). So it is not attempted even if it would otherwise
109 * be less disruptive.
111 * \retval str the string is a valid null-terminated UTF-8 sequence
112 * (i.e. no changes were made)
113 * \retval NULL the string is not a valid UTF-8 sequence
/* In-place variant of the UTF-8 check: takes a writable string and runs the
 * same vlc_towc() decoding loop, comparing each result with (size_t)-1.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, the declarations of n and cp, both branches of the test on n, and
 * the return statement are not present here. */
115 static inline char *EnsureUTF8(char *str
)
/* Decode one code point per iteration; 0 ends the loop. */
121 while ((n
= vlc_towc(str
, &cp
)) != 0)
/* likely(): the successful-decode branch is hinted as the common one. */
122 if (likely(n
!= (size_t)-1))
132 /* iconv wrappers (defined in src/extras/libc.c) */
133 #define VLC_ICONV_ERR ((size_t) -1)
134 typedef void *vlc_iconv_t
;
135 VLC_API vlc_iconv_t
vlc_iconv_open( const char *, const char * ) VLC_USED
;
136 VLC_API
size_t vlc_iconv( vlc_iconv_t
, const char **, size_t *, char **, size_t * ) VLC_USED
;
137 VLC_API
int vlc_iconv_close( vlc_iconv_t
);
141 VLC_API
int utf8_vfprintf( FILE *stream
, const char *fmt
, va_list ap
);
142 VLC_API
int utf8_fprintf( FILE *, const char *, ... ) VLC_FORMAT( 2, 3 );
143 VLC_API
char * vlc_strcasestr(const char *, const char *) VLC_USED
;
145 VLC_API
char * FromCharset( const char *charset
, const void *data
, size_t data_size
) VLC_USED
;
146 VLC_API
void * ToCharset( const char *charset
, const char *in
, size_t *outsize
) VLC_USED
;
149 # include <CoreFoundation/CoreFoundation.h>
151 /* Obtains a copy of the contents of a CFString in specified encoding.
152 * Returns char* (must be freed by caller) or NULL on failure.
/* Two-stage copy of a CFString into a C string in the requested encoding:
 * first the direct-pointer route (duplicated with strdup), then a sized
 * buffer filled by CFStringGetCString().
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, the declaration that receives the result of
 * CFStringGetMaximumSizeForEncoding() (used below as maxSize), the bodies of
 * the two failure checks, the null-terminator adjustment announced by the
 * comment, and everything after the CFStringGetCString() call (use of
 * `success`, final return) are not present here. */
154 VLC_USED
static inline char *FromCFString(const CFStringRef cfString
,
155 const CFStringEncoding cfStringEncoding
)
157 // Try the quick way to obtain the buffer
158 const char *tmpBuffer
= CFStringGetCStringPtr(cfString
, cfStringEncoding
);
/* The direct pointer is not returned itself; the caller gets a strdup()ed
 * copy. */
160 if (tmpBuffer
!= NULL
) {
161 return strdup(tmpBuffer
);
164 // The quick way did not work, try the long way
165 CFIndex length
= CFStringGetLength(cfString
);
167 CFStringGetMaximumSizeForEncoding(length
, cfStringEncoding
);
169 // If result would exceed LONG_MAX, kCFNotFound is returned
170 if (unlikely(maxSize
== kCFNotFound
)) {
174 // Account for the null terminator
177 char *buffer
= (char *)malloc(maxSize
);
/* Allocation failure is hinted as the rare path. */
179 if (unlikely(buffer
== NULL
)) {
183 // Copy CFString in requested encoding to buffer
184 Boolean success
= CFStringGetCString(cfString
, buffer
, maxSize
, cfStringEncoding
);
/* Wide string to UTF-8 using the usual two-call WideCharToMultiByte()
 * pattern: the first call passes no output buffer (NULL, size 0) and its
 * result is kept in len; the second call writes into a malloc(len) buffer.
 * Both calls pass -1 as the input length.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, any check on len, the check on the malloc() result before the
 * second call, and the return statement are not present here. */
194 static inline char *FromWide (const wchar_t *wide
)
/* Sizing call: no output buffer. */
196 size_t len
= WideCharToMultiByte (CP_UTF8
, 0, wide
, -1, NULL
, 0, NULL
, NULL
);
200 char *out
= (char *)malloc (len
);
/* Converting call: same arguments, now with the buffer and its size. */
203 WideCharToMultiByte (CP_UTF8
, 0, wide
, -1, out
, len
, NULL
, NULL
);
/* UTF-8 to wide string using the two-call MultiByteToWideChar() pattern: the
 * first call passes no output buffer (NULL, size 0) and its result is kept in
 * len; the buffer is then allocated as len * sizeof (wchar_t) bytes and
 * filled by the second call. Both calls pass -1 as the input length.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, any check on len, the check on the malloc() result, and the return
 * statement are not present here. */
208 static inline wchar_t *ToWide (const char *utf8
)
/* Sizing call: no output buffer. */
210 int len
= MultiByteToWideChar (CP_UTF8
, 0, utf8
, -1, NULL
, 0);
/* len is passed again below as the buffer size, so the allocation scales it
 * by sizeof (wchar_t). */
214 wchar_t *out
= (wchar_t *)malloc (len
* sizeof (wchar_t));
217 MultiByteToWideChar (CP_UTF8
, 0, utf8
, -1, out
, len
);
/* UTF-8 to an arbitrary code page, done in two hops: ToWide() first, then the
 * two-call WideCharToMultiByte() pattern with the caller's code page cp
 * (sizing call, malloc(len), converting call guarded by the allocation
 * check).
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, any check on the ToWide() result or on len, the release of the
 * intermediate wide buffer, and the return statement are not present here. */
222 static inline char *ToCodePage (unsigned cp
, const char *utf8
)
/* Intermediate wide copy of the input. */
224 wchar_t *wide
= ToWide (utf8
);
/* Sizing call: no output buffer. */
228 size_t len
= WideCharToMultiByte (cp
, 0, wide
, -1, NULL
, 0, NULL
, NULL
);
234 char *out
= (char *)malloc (len
);
/* Only convert when the allocation succeeded. */
235 if (likely(out
!= NULL
))
236 WideCharToMultiByte (cp
, 0, wide
, -1, out
, len
, NULL
, NULL
);
/* Arbitrary code page to UTF-8, done in two hops: the two-call
 * MultiByteToWideChar() pattern with the caller's code page cp produces a
 * wide buffer, which is then handed to FromWide().
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, any check on len, the statement controlled by the allocation
 * check, the release of the intermediate wide buffer, and the return
 * statement are not present here. */
242 static inline char *FromCodePage (unsigned cp
, const char *mb
)
/* Sizing call: no output buffer. */
244 int len
= MultiByteToWideChar (cp
, 0, mb
, -1, NULL
, 0);
248 wchar_t *wide
= (wchar_t *)malloc (len
* sizeof (wchar_t));
/* Allocation failure is hinted as the rare path. */
249 if (unlikely(wide
== NULL
))
251 MultiByteToWideChar (cp
, 0, mb
, -1, wide
, len
);
/* Second hop: wide buffer to UTF-8. */
253 char *utf8
= FromWide (wide
);
/**
 * Converts a string from the ANSI code page to UTF-8.
 *
 * Thin wrapper: forwards to FromCodePage() with the code page reported by
 * GetACP().
 *
 * \param ansi string encoded in the ANSI code page
 * \return the result of FromCodePage() for that code page
 */
static inline char *FromANSI (const char *ansi)
{
    return FromCodePage (GetACP (), ansi);
}
/**
 * Converts a UTF-8 string to the ANSI code page.
 *
 * Thin wrapper: forwards to ToCodePage() with the code page reported by
 * GetACP().
 *
 * \param utf8 UTF-8 string to convert
 * \return the result of ToCodePage() for that code page
 */
static inline char *ToANSI (const char *utf8)
{
    return ToCodePage (GetACP (), utf8);
}
/* NOTE(review): FromT is defined twice in a row below (FromWide, then
 * FromANSI). The preprocessor conditional that selects one of the two is not
 * present in this copy and must be restored for this to preprocess cleanly. */
271 # define FromT FromWide
274 # define FromT FromANSI
/* Locale helpers for this platform branch map onto the ANSI code page
 * converters; LocaleFree releases the string with free(). The Dup variants
 * are the same converters. */
277 # define FromLocale FromANSI
278 # define ToLocale ToANSI
279 # define LocaleFree(s) free((char *)(s))
280 # define FromLocaleDup FromANSI
281 # define ToLocaleDup ToANSI
283 #elif defined(__OS2__)
285 VLC_USED
static inline char *FromLocale (const char *locale
)
287 return locale
? FromCharset ((char *)"", locale
, strlen(locale
)) : NULL
;
290 VLC_USED
static inline char *ToLocale (const char *utf8
)
293 return utf8
? (char *)ToCharset ("", utf8
, &outsize
) : NULL
;
/* Release helper paired with FromLocale()/ToLocale() in this platform branch.
 *
 * NOTE(review): only the signature is present in this copy; the function
 * body (braces and statements) is missing and must be restored from the
 * upstream header. */
296 VLC_USED
static inline void LocaleFree (const char *str
)
301 VLC_USED
static inline char *FromLocaleDup (const char *locale
)
303 return FromCharset ("", locale
, strlen(locale
));
306 VLC_USED
static inline char *ToLocaleDup (const char *utf8
)
309 return (char *)ToCharset ("", utf8
, &outsize
);
/* Pass-through variants: FromLocale/ToLocale expand to their argument
 * unchanged, LocaleFree only evaluates its argument (nothing is released),
 * and the Dup variants are plain strdup.
 * NOTE(review): the preprocessor conditional selecting this branch is not
 * present in this copy. */
314 # define FromLocale(l) (l)
315 # define ToLocale(u) (u)
316 # define LocaleFree(s) ((void)(s))
317 # define FromLocaleDup strdup
318 # define ToLocaleDup strdup
322 * Converts a nul-terminated string from ISO-8859-1 to UTF-8.
/* Builds the UTF-8 form of `latin` in a freshly allocated buffer, one input
 * byte at a time, then shrinks the buffer to the size actually written.
 *
 * NOTE(review): this copy of the definition looks truncated. The function
 * braces, the declaration of c, the check on the malloc() result, the
 * condition choosing between the two-byte form and a plain copy, the plain
 * copy itself, and the terminator write are not present here. */
324 static inline char *FromLatin1 (const char *latin
)
/* Worst case is two output bytes per input byte (see the pair of writes
 * below), plus one byte for the terminator. utf8 is the write cursor. */
326 char *str
= (char *)malloc (2 * strlen (latin
) + 1), *utf8
= str
;
332 while ((c
= *(latin
++)) != '\0')
/* Two-byte UTF-8 form: a lead byte carrying the bits above the low six,
 * then a continuation byte carrying the low six bits. */
336 *(utf8
++) = 0xC0 | (c
>> 6);
337 *(utf8
++) = 0x80 | (c
& 0x3F);
/* Trim to the bytes written; utf8 is reused for the realloc() result. */
344 utf8
= (char *)realloc (str
, utf8
- str
);
/* If the shrink failed, the original (larger) buffer is returned instead. */
345 return utf8
? utf8
: str
;
350 VLC_API
double us_strtod( const char *, char ** ) VLC_USED
;
351 VLC_API
float us_strtof( const char *, char ** ) VLC_USED
;
352 VLC_API
double us_atof( const char * ) VLC_USED
;
353 VLC_API
int us_vasprintf( char **, const char *, va_list );
354 VLC_API
int us_asprintf( char **, const char *, ... ) VLC_USED
;