From 9ce1a13fb8fa77a38e0dfdfe3387829e1df3f085 Mon Sep 17 00:00:00 2001
From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?=
Date: Fri, 8 Oct 2010 20:30:11 +0300
Subject: [PATCH] Refactor EnsureUTF8 and IsUTF8

Split the per-character decoding out of CheckUTF8() into vlc_towc() and
rebuild EnsureUTF8() and IsUTF8() on top of it.

While at it, fix two validation bugs carried over from CheckUTF8():
 - the UTF-16 surrogate test compared against 0xC000 instead of 0xE000
   and therefore never matched;
 - the overlong threshold was one bit too high for 3- and 4-byte
   sequences, rejecting valid U+0800..U+0FFF and U+10000..U+1FFFF.

IsUTF8() keeps returning the caller's pointer on success, as before.
---
 src/text/unicode.c | 129 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 75 insertions(+), 54 deletions(-)

diff --git a/src/text/unicode.c b/src/text/unicode.c
index 3030d47085..bed5f1f96d 100644
--- a/src/text/unicode.c
+++ b/src/text/unicode.c
@@ -2,7 +2,7 @@
  * unicode.c: Unicode <-> locale functions
  *****************************************************************************
  * Copyright (C) 2005-2006 the VideoLAN team
- * Copyright © 2005-2008 Rémi Denis-Courmont
+ * Copyright © 2005-2010 Rémi Denis-Courmont
  *
  * Authors: Rémi Denis-Courmont
  *
@@ -273,73 +273,74 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
 }
 
 
-static char *CheckUTF8( char *str, char rep )
+/**
+ * Converts the first character from a UTF-8 sequence into a code point.
+ * @param str a nul-terminated UTF-8 byte sequence
+ * @param pwc storage for the code point (left unmodified on error)
+ * @return 0 if str points to an empty string, i.e. the first character is NUL;
+ * number of bytes that the first character occupies (from 1 to 4) otherwise;
+ * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
+ */
+static size_t vlc_towc (const char *str, uint32_t *restrict pwc)
 {
     uint8_t *ptr = (uint8_t *)str;
     assert (str != NULL);
 
-    for (;;)
-    {
-        uint8_t c = ptr[0];
+    uint8_t c = ptr[0];
 
-        if (c == '\0')
-            break;
-
-        if (c > 0xF4)
-            goto error;
-
-        int charlen = clz8 (c ^ 0xFF);
-        switch (charlen)
-        {
-            case 0: // 7-bit ASCII character -> OK
-                ptr++;
-                continue;
+    if (unlikely(c == '\0'))
+    {
+        *pwc = 0;
+        return 0;
+    }
 
-            case 1: // continuation byte -> error
-                goto error;
-        }
+    if (unlikely(c > 0xF4))
+        return -1;
 
-        assert (charlen >= 2 && charlen <= 4);
+    int charlen = clz8 (c ^ 0xFF);
+    switch (charlen)
+    {
+        case 0: // 7-bit ASCII character -> OK
+            *pwc = c;
+            return 1;
 
-        uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
-        for (int i = 1; i < charlen; i++)
-        {
-            assert (cp < (1 << 26));
-            c = ptr[i];
+        case 1: // continuation byte -> error
+            return -1;
+    }
 
-            if ((c >> 6) != 2) // not a continuation byte
-                goto error;
+    assert (charlen >= 2 && charlen <= 4);
 
-            cp = (cp << 6) | (ptr[i] & 0x3f);
-        }
+    uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
+    for (int i = 1; i < charlen; i++)
+    {
+        assert (cp < (1 << 26));
+        c = ptr[i];
 
-        switch (charlen)
-        {
-            case 4:
-                if (cp > 0x10FFFF) // beyond Unicode
-                    goto error;
-            case 3:
-                if (cp >= 0xD800 && cp < 0xC000) // UTF-16 surrogate
-                    goto error;
-            case 2:
-                if (cp < 128) // ASCII overlong
-                    goto error;
-                if (cp < (1u << (5 * charlen - 3))) // overlong
-                    goto error;
-        }
-        ptr += charlen;
-        continue;
+        if (unlikely((c >> 6) != 2)) // not a continuation byte
+            return -1;
 
-    error:
-        if (rep == 0)
-            return NULL;
-        *ptr++ = rep;
-        str = NULL;
+        cp = (cp << 6) | (ptr[i] & 0x3f);
     }
 
-    return str;
+    switch (charlen)
+    {
+        case 4:
+            if (unlikely(cp > 0x10FFFF)) // beyond Unicode
+                return -1;
+        case 3:
+            if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
+                return -1;
+        case 2:
+            if (unlikely(cp < 128)) // ASCII overlong
+                return -1;
+            if (unlikely(cp < (1u << (5 * charlen - 4)))) // overlong
+                return -1;
+    }
+    *pwc = cp;
+    return charlen;
 }
 
+
 /**
  * Replaces invalid/overlong UTF-8 sequences with question marks.
  * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
@@ -349,7 +350,19 @@ static char *CheckUTF8( char *str, char rep )
  */
 char *EnsureUTF8( char *str )
 {
-    return CheckUTF8( str, '?' );
+    char *ret = str;
+    size_t n;
+    uint32_t cp;
+
+    while ((n = vlc_towc (str, &cp)) != 0)
+        if (likely(n != (size_t)-1))
+            str += n;
+        else
+        {
+            *str++ = '?';
+            ret = NULL;
+        }
+    return ret;
 }
 
 
@@ -362,7 +375,15 @@ char *EnsureUTF8( char *str )
  */
 const char *IsUTF8( const char *str )
 {
-    return CheckUTF8( (char *)str, 0 );
+    size_t n;
+    uint32_t cp;
+
+    /* Advance a separate cursor so that, like the CheckUTF8()-based code
+     * being replaced, the caller's own pointer is returned on success. */
+    for (const char *p = str; (n = vlc_towc (p, &cp)) != 0; p += n)
+        if (unlikely(n == (size_t)-1))
+            return NULL;
+    return str;
 }
 
 /**
-- 
2.11.4.GIT