From 0bacd766e2e7174724bf76da44b52becb6881154 Mon Sep 17 00:00:00 2001 From: Pavol Babincak Date: Tue, 31 Jan 2006 01:09:49 +0100 Subject: [PATCH] Added UTF-8 char length lookup table Added a lookup table to quickly get the number of bytes of a UTF-8 character from its first byte. --- src/intl/charsets.c | 44 ++++++++++++++++++++++++++------------------ src/intl/charsets.h | 1 + 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/intl/charsets.c b/src/intl/charsets.c index ca564d6a..cae7b274 100644 --- a/src/intl/charsets.c +++ b/src/intl/charsets.c @@ -168,6 +168,21 @@ u2cp_(unicode_val_T u, int to, int no_nbsp_hack) return no_str; } + +/* Number of bytes of a UTF-8 character, indexed by its first byte. Illegal + * first bytes map to one and are handled differently. */ +static char utf8char_len_tab[256] = +{ + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1, +}; + static unsigned char utf_buffer[7]; inline unsigned char * @@ -205,6 +220,15 @@ encode_utf_8(unicode_val_T u) return utf_buffer; } +inline int utf8charlen(const unsigned char *p) +{ + int len; + if (p==NULL) + return 0; + len = utf8char_len_tab[*p]; + return len; +} + inline int strlen_utf8(unsigned char **str) { @@ -214,12 +238,7 @@ strlen_utf8(unsigned char **str) int len; for (x = 0;; x++, s += len) { - if (*s < 0x80) len = 1; - else if (*s < 0xe0) len = 2; - else if (*s < 0xf0) len = 3; - else if (*s < 0xf8) len = 4; - else if (*s < 0xfc) len = 5; - else len = 6; + len = utf8charlen(s); if (s + len > end) break; } 
*str = s; @@ -233,18 +252,7 @@ utf_8_to_unicode(unsigned char **string, unsigned char *end) unicode_val_T u; int length; - if (str[0] < 0x80) - length = 1; - else if (str[0] < 0xe0) - length = 2; - else if (str[0] < 0xf0) - length = 3; - else if (str[0] < 0xf8) - length = 4; - else if (str[0] < 0xfc) - length = 5; - else - length = 6; + length = utf8char_len_tab[str[0]]; if (str + length > end) { return UCS_NO_CHAR; diff --git a/src/intl/charsets.h b/src/intl/charsets.h index 364c484d..d57913a5 100644 --- a/src/intl/charsets.h +++ b/src/intl/charsets.h @@ -54,6 +54,7 @@ unsigned char *get_cp_mime_name(int); int is_cp_special(int); void free_conv_table(void); inline unsigned char *encode_utf_8(unicode_val_T); +inline int utf8charlen(const unsigned char *); inline int strlen_utf8(unsigned char **); inline unicode_val_T utf_8_to_unicode(unsigned char **, unsigned char *); -- 2.11.4.GIT