updated README
[k8sterm.git] / src / utf8dec.c
blob02c2cd9fd58357c2a92d06cefbaf2bc321f35032
// ////////////////////////////////////////////////////////////////////////// //
// see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
// for some recovery strategies it may be useful to determine the number of bytes expected.
//
// Table layout (0x16c == 256+108 bytes):
//   [0x000..0x0ff] maps each input byte to one of 12 character classes (0..11);
//   [0x100..0x16b] is the transition table, indexed as `256+state+class`.
// States are stored premultiplied by 12 (the number of classes), so the state
// value is directly the row offset: 0 is "accept", 12 is "reject", and
// 24, 36, ..., 96 are the intermediate "sequence in progress" states.
static const uint8_t sxed_utf8dfa[0x16c] = {
  // maps bytes to character classes
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 00-0f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 10-1f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 20-2f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 30-3f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 40-4f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 50-5f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 60-6f
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // 70-7f
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, // 80-8f
  0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09, // 90-9f
  0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // a0-af
  0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07,0x07, // b0-bf
  0x08,0x08,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // c0-cf
  0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, // d0-df
  0x0a,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x04,0x03,0x03, // e0-ef
  0x0b,0x06,0x06,0x06,0x05,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, // f0-ff
  // maps a combination of a state of the automaton and a character class to a state
  0x00,0x0c,0x18,0x24,0x3c,0x60,0x54,0x0c,0x0c,0x0c,0x30,0x48,0x0c,0x0c,0x0c,0x0c, // 100-10f
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x00, // 110-11f
  0x0c,0x00,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x18,0x0c,0x0c, // 120-12f
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c, // 130-13f
  0x0c,0x0c,0x0c,0x0c,0x0c,0x18,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x24, // 140-14f
  0x0c,0x24,0x0c,0x0c,0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x24,0x0c,0x24,0x0c,0x0c, // 150-15f
  0x0c,0x24,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c                      // 160-16b
};
// replacement char for invalid unicode (U+FFFD)
#define SXED_UTF8_REPLACEMENT_CP  (0xFFFDU)
// ////////////////////////////////////////////////////////////////////////// //
// is the given codepoint valid?
// returns non-zero for anything in [0 .. 0x10FFFF] that is not
// a surrogate (0xD800 .. 0xDFFF), and zero otherwise
static inline /*__attribute__((always_inline))*/ __attribute__((unused))
int sxed_utf8_valid_cp (uint32_t cp) {
  return (cp < 0xD800U || (cp > 0xDFFFU && cp <= 0x10FFFFU));
}
// is the given codepoint considered printable?
// i restrict it to some useful subset
// unifuck is unifucked, but i hope that i sorted out all idiotic diacritics and control chars
//
// returns non-zero if `cp` falls into one of the whitelisted ranges below, zero otherwise.
// NOTE(review): the first range starts at 0, so it also covers the C0/C1 control
// codes; presumably callers filter those separately -- confirm.
static inline /*__attribute__((always_inline))*/ __attribute__((unused))
int sxed_utf8_printable_cp (uint32_t cp) {
  return
    // basic latin (and everything else up to the end of latin extended-b)
    cp <= 0x024FU ||
    // some greek, and cyrillic w/o combiners
    (cp >= 0x0390U && cp <= 0x0482U) ||
    (cp >= 0x048AU && cp <= 0x052FU) ||
    // runic (just for fun)
    // the upper bound used to read 0x168F, which is below the lower bound and
    // made this range empty; 0x16F8 is the last assigned runic codepoint
    (cp >= 0x16A0U && cp <= 0x16F8U) ||
    // latin extended additional
    (cp >= 0x1E00U && cp <= 0x1EFFU) ||
    // some general punctuation, extensions, etc.
    (cp >= 0x2000U && cp <= 0x2C7FU) ||
    // supplemental punctuation
    (cp >= 0x2E00U && cp <= 0x2E42U) ||
    // more latin extended
    (cp >= 0xAB30U && cp <= 0xAB65U);
}
// ////////////////////////////////////////////////////////////////////////// //
// DFA-based utf-8 decoder with only 32 bits of state
// "overlong" variants accept "zero overlong" (0xC0 0x80), but are slower
// normal variants reject overlongs
//
// use like this:
//
//   uint32_t cp = 0; // 0 is important! (also, assigning zero can be used to reset the decoder)
//   for (;;) {
//     cp = sxed_utf8d_consume(cp, next_byte());
//     if (sxed_utf8_valid_cp(cp)) {
//       ...process unicode codepoint from `cp`...
//     }
//   }
//
// or:
//
//   uint32_t cp = 0; // 0 is important! (also, assigning zero can be used to reset the decoder)
//   for (;;) {
//     cp = sxed_utf8d_consume_ex(cp, next_byte());
//     if (sxed_utf8_valid_cp(cp)) {
//       ...process unicode codepoint from `cp`...
//     } else if (sxed_utf8d_fuckedup(cp)) {
//       ...we found an invalid utf-8 sequence...
//     }
//   }
//
93 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
94 // never reaches `invalid` state, returns `replacement` for invalid chars
95 // returns invalid codepoint while it is "in progress" (i.e. result > 0x10FFFFU)
96 uint32_t sxed_utf8d_consume (uint32_t cp, const char ch) {
97 uint8_t state = cp>>24;
98 if (state == 12/*State.Reject*/) state = 0/*State.Accept*/; // invalid utf-8 sequence was hit, restart (just in case)
99 const uint8_t type = sxed_utf8dfa[((uint8_t)ch)];
100 cp = (state /*!= State.Accept*/ ? (((uint8_t)ch)&0x3fU)|((cp&~0xff000000U)<<6) : (0xff>>type)&((uint8_t)ch));
101 if ((state = sxed_utf8dfa[256+state+type]) == 12/*State.Reject*/) return SXED_UTF8_REPLACEMENT_CP; // invalid utf-8 sequence
102 return
103 state ? cp|((uint32_t)state<<24) :
104 sxed_utf8_valid_cp(cp) ? cp : //k8: i don't remember if this is required, but it's better be safe than sorry
105 SXED_UTF8_REPLACEMENT_CP;
108 static inline /*__attribute__((always_inline))*/ __attribute__((unused))
109 // can reach `invalid` state
110 // returns invalid codepoint while it is "in progress" (i.e. result > 0x10FFFFU)
111 // on invalid utf-8 sequence, returns special value that can be tested with `sxed_utf8d_fuckedup()`
112 // codepoint is undefined in this case (but it won't pass `sxed_utf8_valid_cp()`)
113 // next call will restart decoding
114 uint32_t sxed_utf8d_consume_ex (uint32_t cp, const char ch) {
115 uint8_t state = cp>>24;
116 if (state == 12/*State.Reject*/) state = 0/*State.Accept*/; // invalid utf-8 sequence was hit, restart (just in case)
117 const uint8_t type = sxed_utf8dfa[((uint8_t)ch)];
118 cp = (state /*!= State.Accept*/ ? (((uint8_t)ch)&0x3fU)|((cp&~0xff000000U)<<6) : (0xff>>type)&((uint8_t)ch));
119 if ((state = sxed_utf8dfa[256+state+type]) == 12/*State.Reject*/) return 12U<<24; // invalid utf-8 sequence
120 return
121 state ? cp|((uint32_t)state<<24) :
122 sxed_utf8_valid_cp(cp) ? cp : //k8: i don't remember if this is required, but it's better be safe than sorry
123 12U<<24; // invalid utf-8 sequence
// check value returned from `sxed_utf8d_consume_ex()` with this
// returns non-zero if last decoding operation hit invalid utf-8 sequence
// (i.e. the state stored in the top 8 bits of `cp` is 12, the reject state)
// codepoint is undefined in this case (but it won't pass `sxed_utf8_valid_cp()`)
static inline /*__attribute__((always_inline))*/ __attribute__((unused))
int sxed_utf8d_fuckedup (const uint32_t cp) {
  return ((cp>>24) == 12);
}