1 ////////////////////////////////////////////////////////////////////////////////
// Byte classification table, indexed by the byte value (the functions below
// read it as utf8Length[*data]). Classes, as used by the visible code:
//   0     - 0x00..0x7f, the single-byte (7-bit) range
//   2,3,4 - 0xc0..0xdf, 0xe0..0xef, 0xf0..0xf4: lead bytes; the value is compared
//           against the available buffer length in k8t_UTF8IsFull
//   9     - 0x80..0xbf; the decoder requires each byte after the lead byte to
//           have this class
//   8     - 0xf5..0xff; handled together with 9 as "invalid"
// NOTE(review): 0xc0 and 0xc1 are described as overlong below but still carry
// class 2, so the table by itself does not reject them.
4 static const unsigned char utf8Length
[256] = {
5 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f
6 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f
7 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f
8 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f
9 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f
10 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f
11 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f
12 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f
13 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f
14 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f
15 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf
16 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf
17 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127
18 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf
19 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef
20 4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8 //0xf0-0xff
24 // decode one utf-8 char from *buf to *u, return char length; return '?' on error
// The return value is the number of bytes consumed from buf (see the final
// return statement). The visible code substitutes '?' for rejected code points.
// NOTE(review): the store into *u is not visible in this excerpt; presumably
// the '?' is delivered through *u rather than as the return value -- confirm.
25 static int k8t_UTF8Decode (uint32_t *u
, const void *buf
) {
// view the input as raw bytes
26 const unsigned char *data
= (const unsigned char *)buf
;
// class of the first byte, taken from the utf8Length table
27 unsigned char len
= utf8Length
[*data
];
// classes 8 and 9 cannot start a sequence
34 case 8: case 9: // invalid
// keep only the payload bits of the lead byte and step past it:
// 0x7c>>len is 0x1f, 0x0f, 0x07 for len 2, 3, 4
39 uc
= (*data
++)&(0x7c>>len
);
// a following byte that is not class 9 (i.e. not in 0x80..0xbf) aborts the
// sequence; 0xffff is used as a marker that the range check below turns into '?'
41 if (utf8Length
[*data
] != 9) { uc
= 0xffff; break; }
// append the low six bits of the byte and advance
42 uc
= (uc
<<6)|((*data
++)&0x3f);
// NOTE(review): this masks to 21 bits instead of rejecting the value. A value
// assembled from a 3-bit lead payload plus three 6-bit groups already fits in
// 21 bits, so code points above 0x10ffff appear to pass through unchanged --
// confirm whether that is intended.
44 if (uc
> 0x10ffff) uc
&= 0x1fffff;
// replace surrogates, the 0xfdd0..0xfdef range and 0xfffe..0xffff with '?'
45 if ((uc
>= 0xd800 && uc
<= 0xdfff) || // utf16/utf32 surrogates
46 (uc
>= 0xfdd0 && uc
<= 0xfdef) || // just for fun
47 (uc
>= 0xfffe && uc
<= 0xffff)) uc
= '?'; // bad unicode
// number of bytes consumed: distance from the start of buf to the read position
49 return data
-((const unsigned char *)buf
);
53 // encode one utf-8 char from uc to *buf, return char length
// buf receives the encoded bytes; the one-byte case visibly returns 1.
// NOTE(review): the required capacity of buf and the return statement for the
// multi-byte cases are not visible in this excerpt.
54 static int k8t_UTF8Encode (void *buf
, uint32_t uc
) {
// write cursor into the output buffer
55 uint8_t *sp
= (uint8_t *)buf
;
// values below 0x80 are stored as a single byte
58 if (uc
< 0x80) { *sp
= uc
; return 1; } /* 0xxxxxxx */
// below 0x10000: lead byte is 0xe0 combined with the bits from position 12 up
64 } else if (uc
< 0x10000) {
66 *sp
++ = (uc
>>12)|0xe0;
// up to 0x10FFFF: lead byte is 0xf0 combined with the bits from position 18 up
68 } else if (uc
<= 0x10FFFF) {
70 *sp
++ = (uc
>>18)|0xf0;
// EF BF BD is the UTF-8 encoding of U+FFFD; presumably this is the fallback
// for uc above 0x10FFFF (the enclosing branch is not visible) -- confirm
74 memcpy(sp
, "\xEF\xBF\xBD", 3);
// emit n trailing bytes, most significant 6-bit group first, each tagged
// with 0x80 (n is set on lines not visible in this excerpt)
77 for (int f
= n
; f
> 0; --f
) *sp
++ = ((uc
>>(6*(f
-1)))&0x3f)|0x80; /* 10xxxxxx */
82 /* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode
83 UTF-8 otherwise return 0 */
// buf/buflen describe the bytes currently available.
// NOTE(review): the first byte is read before buflen is consulted, so callers
// presumably pass buflen >= 1 -- confirm. The final return is not visible in
// this excerpt; presumably it yields res.
84 static int k8t_UTF8IsFull (const void *buf
, int buflen
) {
// view the input as raw bytes
86 const unsigned char *data
= (const unsigned char *)buf
;
// class of the first byte; data is advanced past it
87 unsigned char len
= utf8Length
[*data
++];
// class 0 (single byte) and the invalid classes 8/9 need no further bytes
91 case 0: case 8: case 9: return 1;
// res records whether the buffer holds at least len bytes; when it does,
// only the first len bytes are inspected
93 if ((res
= (buflen
>= len
))) buflen
= len
;
// check the bytes after the first; one that is not of the form 10xxxxxx
// makes the function return 1 immediately
94 for (int f
= buflen
-1; f
> 0; --f
) if (((*data
++)&0xc0) != 0x80) return 1;
// Classifies the first byte at buf via the utf8Length table and returns 0 when
// that byte has class 8 or 9 (the classes the decoder treats as invalid).
// NOTE(review): the results for the remaining classes are on lines not visible
// in this excerpt.
101 static int k8t_UTF8Size (const void *buf
) {
// view the input as raw bytes
102 const unsigned char *data
= (const unsigned char *)buf
;
// class of the first byte (not advanced here)
103 unsigned char len
= utf8Length
[*data
];
// a byte that cannot start a sequence yields 0
107 case 8: case 9: return 0;
// Counts bytes of s that begin a character rather than continue one.
// NOTE(review): the loop over s, the declaration of len and the return
// statement are not visible in this excerpt.
113 static int k8t_UTF8strlen (const char *s
) {
// count s[0] when it is 11xxxxxx (top two bits set) or 0xxxxxxx (top bit
// clear); bytes of the form 10xxxxxx are not counted
117 if (((unsigned char)(s
[0])&0xc0) == 0xc0 || ((unsigned char)(s
[0])&0x80) == 0) ++len
;
124 static void k8t_UTF8ChopLast (char *s
) {
127 for (char *t
= s
; *t
; ++t
) {
128 if (((unsigned char)(t
[0])&0xc0) == 0xc0 || ((unsigned char)(t
[0])&0x80) == 0) lastpos
= (int)(t
-s
);