a lot of renaming...
[k8sterm.git] / src / utf8.c
blob 5b14c4841c1f280cb288a77ff10f922f4be4cf22
1 ////////////////////////////////////////////////////////////////////////////////
2 // UTF-8
// Classification of a byte when it is seen as the FIRST byte of a sequence:
//   0    - ascii, a complete one-byte character
//   2..4 - lead byte of a 2-, 3- or 4-byte sequence (value is the total length)
//   8    - byte that can never start well-formed UTF-8 (0xc0, 0xc1, 0xf5-0xff)
//   9    - continuation byte (10xxxxxx); valid only after a lead byte
static const unsigned char utf8Length[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x00-0x0f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x10-0x1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x20-0x2f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x30-0x3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x40-0x4f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x50-0x5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x60-0x6f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //0x70-0x7f
  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x80-0x8f
  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0x90-0x9f
  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xa0-0xaf
  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, //0xb0-0xbf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xc0-0xcf c0-c1: invalid, they could only start an overlong 2-byte encoding of a code point <= 127
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, //0xd0-0xdf
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, //0xe0-0xef
  4,4,4,4,4,8,8,8,8,8,8,8,8,8,8,8  //0xf0-0xff f5-ff: invalid, they would encode code points above U+10FFFF
};
24 // decode one utf-8 char from *buf to *u, return char length; return '?' on error
25 static int k8t_UTF8Decode (uint32_t *u, const void *buf) {
26 const unsigned char *data = (const unsigned char *)buf;
27 unsigned char len = utf8Length[*data];
28 uint32_t uc;
30 switch (len) {
31 case 0: // ascii
32 *u = *data;
33 return 1;
34 case 8: case 9: // invalid
35 *u = '?';
36 return 1;
38 // utf-8
39 uc = (*data++)&(0x7c>>len);
40 while (--len) {
41 if (utf8Length[*data] != 9) { uc = 0xffff; break; }
42 uc = (uc<<6)|((*data++)&0x3f);
44 if (uc > 0x10ffff) uc &= 0x1fffff;
45 if ((uc >= 0xd800 && uc <= 0xdfff) || // utf16/utf32 surrogates
46 (uc >= 0xfdd0 && uc <= 0xfdef) || // just for fun
47 (uc >= 0xfffe && uc <= 0xffff)) uc = '?'; // bad unicode
48 *u = uc;
49 return data-((const unsigned char *)buf);
// encode one utf-8 char from uc to *buf, return char length (1..4)
// buf must have room for at least 4 bytes; no terminating zero is written
// values that have no well-formed UTF-8 form (surrogates U+D800..U+DFFF and
// anything above U+10FFFF) are encoded as U+FFFD (EF BF BD, length 3)
static int k8t_UTF8Encode (void *buf, uint32_t uc) {
  uint8_t *sp = (uint8_t *)buf;
  int ncont; // number of continuation bytes that follow the lead byte

  if (uc < 0x80) { *sp = (uint8_t)uc; return 1; } /* 0xxxxxxx */
  // do not mask high bits away: that would fold an invalid value onto an unrelated
  // code point (and could even produce an overlong encoding of an ascii char)
  if (uc > 0x10ffff || (uc >= 0xd800 && uc <= 0xdfff)) uc = 0xfffd;
  if (uc < 0x800) {
    /* 110xxxxx */
    *sp++ = (uint8_t)((uc>>6)|0xc0);
    ncont = 1;
  } else if (uc < 0x10000) {
    /* 1110xxxx */
    *sp++ = (uint8_t)((uc>>12)|0xe0);
    ncont = 2;
  } else {
    /* 11110xxx */
    *sp++ = (uint8_t)((uc>>18)|0xf0);
    ncont = 3;
  }
  // remaining payload, six bits per byte, most significant group first
  for (int f = ncont; f > 0; --f) *sp++ = (uint8_t)(((uc>>(6*(f-1)))&0x3f)|0x80); /* 10xxxxxx */
  return ncont+1;
}
82 /* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode
83 UTF-8 otherwise return 0 */
84 static int k8t_UTF8IsFull (const void *buf, int buflen) {
85 if (buflen > 0) {
86 const unsigned char *data = (const unsigned char *)buf;
87 unsigned char len = utf8Length[*data++];
88 int res;
90 switch (len) {
91 case 0: case 8: case 9: return 1;
93 if ((res = (buflen >= len))) buflen = len;
94 for (int f = buflen-1; f > 0; --f) if (((*data++)&0xc0) != 0x80) return 1;
95 return res;
97 return 0;
101 static int k8t_UTF8Size (const void *buf) {
102 const unsigned char *data = (const unsigned char *)buf;
103 unsigned char len = utf8Length[*data];
105 switch (len) {
106 case 0: return 1;
107 case 8: case 9: return 0;
109 return len;
// count the chars in zero-terminated string s: every byte that is not
// a continuation byte (10xxxxxx) starts a new char
// the string is not validated
static int k8t_UTF8strlen (const char *s) {
  int count = 0;

  for (const unsigned char *p = (const unsigned char *)s; *p != 0; ++p) {
    if ((*p&0xc0) != 0x80) ++count;
  }
  return count;
}
// remove the last char from zero-terminated string s (in place):
// the string is cut at the last byte that is not a continuation byte (10xxxxxx)
// if there is no such byte, the string becomes empty
static void k8t_UTF8ChopLast (char *s) {
  char *cut = s;

  for (char *p = s; *p != '\0'; ++p) {
    if (((unsigned char)*p&0xc0) != 0x80) cut = p;
  }
  *cut = '\0';
}