Upload UI
[lsnes.git] / src / library / utf8.cpp
blob03a19a566765fec96d299d6018a35143aa3e6218
1 #include <sstream>
2 #include "utf8.hpp"
namespace
{
	//Incremental UTF-8 decoder state machine.
	//
	//The 16-bit state word holds the state number in its top nibble (bits 12-15) and
	//data memorized so far in the remaining bits (12 bits, or 15 bits in S_4_4).
	//
	//Each table entry is one byte: the first (high) nibble is the state to enter, the
	//second (low) nibble is the action to perform.
	//
	//First nibble values:
	//0 => INITIAL
	//1 => S_2_2
	//2 => S_2_3
	//3 => S_2_4
	//4 => S_3_3
	//5 => S_3_4
	//6 => INIT_RE
	//7 => (unused)
	//8 => S_4_4 (stored in table row 7, see the clamp in utf8_parse_byte)
	//Second nibble values:
	//0 => Return NO CHARACTER and transition to another state with substate 0.
	//1 => Return the character and transition to another state with substate 0.
	//2 => Return invalid character and transition to another state with substate 0.
	//3 => Memorize character minus 192, return NO CHARACTER and transition to another state.
	//4 => Memorize character minus 224, return NO CHARACTER and transition to another state.
	//5 => Memorize character minus 240, return NO CHARACTER and transition to another state.
	//6 => Memorize byte, return invalid character and transition to another state.
	//7 => Return 2-byte value and transition to another state.
	//8 => Combine memorized, return NO CHARACTER and transition to another state.
	//9 => Return 3-byte value and transition to another state.
	//A => Return 4-byte value and transition to another state.
	//B => Handle memorized character and EOF.
	//C => Handle memorized character and continuation.
	//
	//Columns are the input classes computed in utf8_parse_byte:
	//E = end of input (ch < 0), 1 = 0..127, C = 128..191, 2 = 192..223,
	//3 = 224..239, 4 = 240..247, I = 248 and above.
	//
	//NOTE(review): in the S_* rows the E column selects action 1, which returns ch
	//itself (-1), so a sequence cut short by end of input yields NO CHARACTER rather
	//than U+FFFD - confirm this is intended.
	const unsigned char transitions[] = {
		//E	//1	//C	//2	//3	//4	//I
		0x00,	0x01,	0x02,	0x13,	0x24,	0x35,	0x02,	//INITIAL
		0x01,	0x66,	0x07,	0x66,	0x66,	0x66,	0x66,	//S_2_2
		0x01,	0x66,	0x48,	0x66,	0x66,	0x66,	0x66,	//S_2_3
		0x01,	0x66,	0x58,	0x66,	0x66,	0x66,	0x66,	//S_2_4
		0x01,	0x66,	0x09,	0x66,	0x66,	0x66,	0x66,	//S_3_3
		0x01,	0x66,	0x88,	0x66,	0x66,	0x66,	0x66,	//S_3_4
		0x0B,	0x6C,	0x6C,	0x6C,	0x6C,	0x6C,	0x6C,	//INIT_RE
		0x01,	0x66,	0x0A,	0x66,	0x66,	0x66,	0x66	//S_4_4
	};
}

//State word to start decoding with (INITIAL, nothing memorized).
extern const uint16_t utf8_initial_state = 0;

//Feed one byte to the UTF-8 decoder.
//
//Parameter ch: The byte (0..255), or a negative value to signal end of input.
//Parameter state: Decoder state word, updated in place. Start from utf8_initial_state.
//Returns: A decoded code point, 0xFFFD for invalid input, or -1 if no character is
//	produced by this byte.
//
//Overlong encodings, surrogates (0xD800..0xDFFF), values above 0x10FFFD and code
//points whose low 16 bits are 0xFFFE/0xFFFF are reported as 0xFFFD.
int32_t utf8_parse_byte(int ch, uint16_t& state) throw()
{
	//Byte to memorize while resynchronizing; everything from 248 up is folded to 248.
	unsigned char mch = (ch < 248) ? ch : 248;
	uint32_t astate = state >> 12;
	uint32_t iclass;
	uint32_t tmp;
	//State nibble 8 (S_4_4) occupies table row 7.
	if(astate > 7)	astate = 7;
	if(ch < 0)		iclass = 0;
	else if(ch < 128)	iclass = 1;
	else if(ch < 192)	iclass = 2;
	else if(ch < 224)	iclass = 3;
	else if(ch < 240)	iclass = 4;
	else if(ch < 248)	iclass = 5;
	else			iclass = 6;
	unsigned char ctrl = transitions[astate * 7 + iclass];
	//State word of the target state with nothing memorized.
	const int next = (ctrl & 0xF0) * 256;

	switch(ctrl & 0xF) {
	case 0x0:
		state = next;
		return -1;
	case 0x1:
		state = next;
		return ch;
	case 0x2:
		state = next;
		return 0xFFFD;
	case 0x3:
		state = next + ch - 192;
		return -1;
	case 0x4:
		state = next + ch - 224;
		return -1;
	case 0x5:
		state = next + ch - 240;
		return -1;
	case 0x6:
		//Sequence broken by a non-continuation byte: report the broken sequence now
		//and hold the offending byte for re-examination in INIT_RE.
		state = next + mch;
		return 0xFFFD;
	case 0x7:
		tmp = (state & 0xFFF) * 64 + ch - 128;
		if(tmp < 0x80)
			tmp = 0xFFFD;	//Overlong.
		state = next;
		return tmp;
	case 0x8:
		state = next + (state & 0xFFF) * 64 + ch - 128;
		return -1;
	case 0x9:
		tmp = (state & 0xFFF) * 64 + ch - 128;
		if(tmp < 0x800 || (tmp & 0xF800) == 0xD800 || (tmp & 0xFFFE) == 0xFFFE)
			tmp = 0xFFFD;
		state = next;
		return tmp;
	case 0xA:
		//S_4_4 keeps 15 bits of data, hence the wider mask.
		tmp = (state & 0x7FFF) * 64 + ch - 128;
		if(tmp < 0x10000 || tmp > 0x10FFFD || (tmp & 0xFFFE) == 0xFFFE)
			tmp = 0xFFFD;
		state = next;
		return tmp;
	case 0xB:
		//End of input with a held byte: ASCII is emitted, anything else is invalid.
		if(state & 0x80)
			tmp = 0xFFFD;
		else
			tmp = state & 0x7F;
		state = next;
		return tmp;
	case 0xC: {
		//This is nasty: classify the held byte now that its successor is known.
		const uint32_t held = state & 0xFF;
		if((held & 0x80) == 0) {
			//Held byte is ASCII: emit it and hold the current byte instead.
			state = 0x6000 + mch;
			return held;
		}
		if((held & 0xF8) == 0xF8 || (held & 0xC0) == 0x80) {
			//Continuation or invalid. The continuation test must cover all of
			//0x80..0xBF; testing only 0x80..0x87 let a second stray continuation
			//byte fall through every branch and vanish without a U+FFFD.
			state = 0x6000 + mch;
			return 0xFFFD;
		}
		if(iclass == 0) {
			//Incomplete.
			state = 0;
			return 0xFFFD;
		}
		if(iclass != 2) {
			//Bad sequence.
			state = 0x6000 + mch;
			return 0xFFFD;
		}
		if((held & 0xE0) == 0xC0) {
			//Complete 2-byte sequence.
			tmp = (held & 0x1F) * 64 + (ch & 0x3F);
			state = 0;
			if(tmp < 0x80)
				tmp = 0xFFFD;
			return tmp;
		}
		if((held & 0xF0) == 0xE0) {
			//First 2 bytes of 3-byte sequence.
			state = 0x4000 + (held & 0x0F) * 64 + (ch & 0x3F);
			return -1;
		}
		if((held & 0xF8) == 0xF0) {
			//First 2 bytes of 4-byte sequence.
			state = 0x5000 + (held & 0x07) * 64 + (ch & 0x3F);
			return -1;
		}
		break;
	}
	};
	//Not reached for any table entry above; kept as a defensive default.
	return -1;
}
149 size_t utf8_strlen(const std::string& str) throw()
151 uint16_t s = utf8_initial_state;
152 size_t r = 0;
153 for(size_t i = 0; i < str.length(); i++)
154 if(utf8_parse_byte(static_cast<uint8_t>(str[i]), s) >= 0)
155 r++;
156 if(utf8_parse_byte(-1, s) >= 0)
157 r++;
158 return r;
161 std::u32string to_u32string(const std::string& utf8)
163 std::u32string x;
164 x.resize(utf8_strlen(utf8));
165 copy_from_utf8(utf8.begin(), utf8.end(), x.begin());
166 return x;
//Encode a UTF-32 string as UTF-8.
//
//Parameter utf32: The code points to encode.
//Returns: The UTF-8 bytes. Values above 0x10FFFF are skipped without any output.
//
//The upper bound is inclusive: U+10FFFF is the last code point and needs four bytes
//(the previous strict '<' dropped it). Values in 0xD800..0xDFFF are still written as
//ordinary 3-byte sequences, exactly as before.
std::string to_u8string(const std::u32string& utf32)
{
	std::string out;
	//At least one byte per code point; avoids most reallocations.
	out.reserve(utf32.size());
	const auto put = [&out](char32_t byte) {
		out.push_back(static_cast<char>(static_cast<unsigned char>(byte)));
	};
	for(char32_t cp : utf32) {
		if(cp < 0x80) {
			put(cp);
		} else if(cp < 0x800) {
			put(0xC0 + (cp >> 6));
			put(0x80 + (cp & 0x3F));
		} else if(cp < 0x10000) {
			put(0xE0 + (cp >> 12));
			put(0x80 + ((cp >> 6) & 0x3F));
			put(0x80 + (cp & 0x3F));
		} else if(cp <= 0x10FFFF) {
			put(0xF0 + (cp >> 18));
			put(0x80 + ((cp >> 12) & 0x3F));
			put(0x80 + ((cp >> 6) & 0x3F));
			put(0x80 + (cp & 0x3F));
		}
	}
	return out;
}
188 #ifdef TEST_UTF8
189 #include <iostream>
//Render a 16-bit status word as four uppercase hexadecimal digits.
//
//Returns: Pointer to a static buffer that is overwritten by the next call.
char* format_dword(uint16_t s)
{
	static char text[32];
	snprintf(text, sizeof(text), "%04X", static_cast<unsigned>(s));
	return text;
}
197 int main()
199 uint16_t s = utf8_initial_state;
200 while(true) {
201 int c;
202 int32_t d;
203 std::cin >> c;
204 d = utf8_parse_byte(c, s);
205 std::cout << "> " << d << " (status word=" << format_dword(s) << ")" << std::endl;
206 if(c == -1 && d == -1)
207 return 0;
209 return 0;
211 #endif