// lsnes rr2-β24
// [lsnes.git] / src / library / utf8.cpp
// blob e56f80a42f4bf7776350449588433f074d5e53be
#include <cstdint>
#include <sstream>
#include <string>
#include "utf8.hpp"
namespace utf8
{
namespace
{
	//Transition table of the incremental decoder, indexed as transitions[state * 7 + input class].
	//Each entry packs the next state into its high nibble and the action to perform into its low nibble.
	//
	//High nibble (next state; the same number is kept in the top 4 bits of the 16-bit state word):
	//0 => INITIAL
	//1 => S_2_2
	//2 => S_2_3
	//3 => S_2_4
	//4 => S_3_3
	//5 => S_3_4
	//6 => INIT_RE (a byte that broke the previous sequence is memorized and still has to be handled)
	//7 => (unused)
	//8 => S_4_4 (parse_byte() clamps state numbers above 7 to 7, so this state is served by the last row)
	//S_a_b reads as "waiting for byte a of a b-byte sequence".
	//
	//Low nibble (action):
	//0 => Return NO CHARACTER and transition to another state with substate 0.
	//1 => Return the character and transition to another state with substate 0.
	//2 => Return invalid character and transition to another state with substate 0.
	//3 => Memorize character minus 192, return NO CHARACTER and transition to another state.
	//4 => Memorize character minus 224, return NO CHARACTER and transition to another state.
	//5 => Memorize character minus 240, return NO CHARACTER and transition to another state.
	//6 => Memorize byte, return invalid character and transition to another state.
	//7 => Return 2-byte value and transition to another state.
	//8 => Combine memorized, return NO CHARACTER and transition to another state.
	//9 => Return 3-byte value and transition to another state.
	//A => Return 4-byte value and transition to another state.
	//B => Handle memorized character and EOF.
	//C => Handle memorized character and continuation.
	//
	//Columns (input class): E = end of input (ch < 0), 1 = 0x00-0x7F, C = 0x80-0xBF, 2 = 0xC0-0xDF,
	//3 = 0xE0-0xEF, 4 = 0xF0-0xF7, I = everything else (>= 0xF8).
	//
	//Note: column E of the S_* rows uses action 1, which returns ch itself (-1). A sequence that is cut
	//short by end of input therefore produces no character at all.
	const unsigned char transitions[] = {
		//E   //1   //C   //2   //3   //4   //I
		0x00, 0x01, 0x02, 0x13, 0x24, 0x35, 0x02,	//INITIAL
		0x01, 0x66, 0x07, 0x66, 0x66, 0x66, 0x66,	//S_2_2
		0x01, 0x66, 0x48, 0x66, 0x66, 0x66, 0x66,	//S_2_3
		0x01, 0x66, 0x58, 0x66, 0x66, 0x66, 0x66,	//S_2_4
		0x01, 0x66, 0x09, 0x66, 0x66, 0x66, 0x66,	//S_3_3
		0x01, 0x66, 0x88, 0x66, 0x66, 0x66, 0x66,	//S_3_4
		0x0B, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C,	//INIT_RE
		0x01, 0x66, 0x0A, 0x66, 0x66, 0x66, 0x66	//S_4_4
	};
}

//State word to start decoding with (state INITIAL, nothing memorized).
extern const uint16_t initial_state = 0;

//Feed one byte to the incremental UTF-8 decoder.
//
//Parameters:
//  ch: The next input byte (0-255), or a negative value to signal end of input.
//  state: Decoder state word. Start with initial_state and pass the same variable to every call.
//
//Returns: The decoded code point, 0xFFFD if the input was not valid UTF-8 (including overlong forms,
//  surrogates, U+xxFFFE/U+xxFFFF and values above 0x10FFFD), or -1 if this call produced no character.
int32_t parse_byte(int ch, uint16_t& state) throw()
{
	//Byte as it gets memorized: all invalid bytes (>= 0xF8) are folded into 0xF8.
	const unsigned char mch = (ch < 248) ? ch : 248;

	//The state number lives in the top nibble; 8 (S_4_4) shares the last table row.
	uint32_t astate = state >> 12;
	if(astate > 7)
		astate = 7;

	uint32_t iclass;
	if(ch < 0) iclass = 0;
	else if(ch < 128) iclass = 1;
	else if(ch < 192) iclass = 2;
	else if(ch < 224) iclass = 3;
	else if(ch < 240) iclass = 4;
	else if(ch < 248) iclass = 5;
	else iclass = 6;

	const unsigned char ctrl = transitions[astate * 7 + iclass];
	//Next state number, already shifted into the top nibble of the state word.
	const int next = (ctrl & 0xF0) * 256;
	uint32_t tmp;

	switch(ctrl & 0xF) {
	case 0x0:
		state = next;
		return -1;
	case 0x1:
		state = next;
		return ch;
	case 0x2:
		state = next;
		return 0xFFFD;
	case 0x3:
		//Lead byte of a 2-byte sequence: keep its 5 payload bits.
		state = next + ch - 192;
		return -1;
	case 0x4:
		//Lead byte of a 3-byte sequence: keep its 4 payload bits.
		state = next + ch - 224;
		return -1;
	case 0x5:
		//Lead byte of a 4-byte sequence: keep its 3 payload bits.
		state = next + ch - 240;
		return -1;
	case 0x6:
		//The sequence in progress is broken by this byte. Report the breakage now and keep
		//the byte so that the next call can interpret it.
		state = next + mch;
		return 0xFFFD;
	case 0x7:
		tmp = (state & 0xFFF) * 64 + ch - 128;
		if(tmp < 0x80)
			tmp = 0xFFFD;		//Overlong encoding.
		state = next;
		return tmp;
	case 0x8:
		state = next + (state & 0xFFF) * 64 + ch - 128;
		return -1;
	case 0x9:
		tmp = (state & 0xFFF) * 64 + ch - 128;
		//Overlong, surrogate, or U+FFFE/U+FFFF.
		if(tmp < 0x800 || (tmp & 0xF800) == 0xD800 || (tmp & 0xFFFE) == 0xFFFE)
			tmp = 0xFFFD;
		state = next;
		return tmp;
	case 0xA:
		//In S_4_4 the payload occupies 15 bits, the top bit is the state number.
		tmp = (state & 0x7FFF) * 64 + ch - 128;
		if(tmp < 0x10000 || tmp > 0x10FFFD || (tmp & 0xFFFE) == 0xFFFE)
			tmp = 0xFFFD;
		state = next;
		return tmp;
	case 0xB:
		//End of input with a memorized byte: ASCII is delivered, anything else is invalid.
		if(state & 0x80)
			tmp = 0xFFFD;
		else
			tmp = state & 0x7F;
		state = next;
		return tmp;
	case 0xC: {
		//This is nasty: the memorized byte has to be interpreted together with the new one.
		const uint32_t held = state & 0xFF;
		if((held & 0x80) == 0) {
			//Memorized ASCII: deliver it and memorize the new byte instead.
			state = 0x6000 + mch;
			return held;
		} else if(held == 0xF8 || (held & 0xC0) == 0x80) {
			//Memorized byte is invalid (folded to 0xF8) or a stray continuation byte.
			//The whole 0x80-0xBF range must be tested here; a narrower test would leave
			//a following continuation byte unmatched by every branch and swallow it.
			state = 0x6000 + mch;
			return 0xFFFD;
		} else if(iclass == 0) {
			//Incomplete. (The table routes end of input to action B, so this is only a
			//safeguard.)
			state = 0;
			return 0xFFFD;
		} else if(iclass != 2) {
			//Bad sequence: a lead byte not followed by a continuation byte.
			state = 0x6000 + mch;
			return 0xFFFD;
		} else if((held & 0xE0) == 0xC0) {
			//Complete 2-byte sequence.
			tmp = (held & 0x1F) * 64 + (ch & 0x3F);
			state = 0;
			if(tmp < 0x80)
				tmp = 0xFFFD;
			return tmp;
		} else if((held & 0xF0) == 0xE0) {
			//First 2 bytes of 3-byte sequence.
			state = 0x4000 + (held & 0x0F) * 64 + (ch & 0x3F);
			return -1;
		} else if((held & 0xF8) == 0xF0) {
			//First 2 bytes of 4-byte sequence.
			state = 0x5000 + (held & 0x07) * 64 + (ch & 0x3F);
			return -1;
		}
		break;
	}
	}
	//Not reached with the table above; kept as a safe default.
	return -1;
}
}
151 size_t strlen(const std::string& str) throw()
153 uint16_t s = initial_state;
154 size_t r = 0;
155 for(size_t i = 0; i < str.length(); i++)
156 if(parse_byte(static_cast<uint8_t>(str[i]), s) >= 0)
157 r++;
158 if(parse_byte(-1, s) >= 0)
159 r++;
160 return r;
163 std::u32string to32(const std::string& utf8)
165 std::u32string x;
166 x.resize(strlen(utf8));
167 to32i(utf8.begin(), utf8.end(), x.begin());
168 return x;
namespace utf8
{
//Convert a UTF-32 string to UTF-8.
//
//Parameters:
//  utf32: The code points to encode.
//
//Returns: The UTF-8 encoding of the input. Code points above 0x10FFFF cannot be encoded and are
//  skipped. Values are not otherwise validated (surrogates are encoded like any 3-byte value).
std::string to8(const std::u32string& utf32)
{
	std::string out;
	out.reserve(utf32.size());
	for(const char32_t cp : utf32) {
		if(cp < 0x80) {
			out.push_back(static_cast<char>(cp));
		} else if(cp < 0x800) {
			out.push_back(static_cast<char>(0xC0 + (cp >> 6)));
			out.push_back(static_cast<char>(0x80 + (cp & 0x3F)));
		} else if(cp < 0x10000) {
			out.push_back(static_cast<char>(0xE0 + (cp >> 12)));
			out.push_back(static_cast<char>(0x80 + ((cp >> 6) & 0x3F)));
			out.push_back(static_cast<char>(0x80 + (cp & 0x3F)));
		} else if(cp <= 0x10FFFF) {
			//The bound is inclusive: U+10FFFF is the last code point.
			out.push_back(static_cast<char>(0xF0 + (cp >> 18)));
			out.push_back(static_cast<char>(0x80 + ((cp >> 12) & 0x3F)));
			out.push_back(static_cast<char>(0x80 + ((cp >> 6) & 0x3F)));
			out.push_back(static_cast<char>(0x80 + (cp & 0x3F)));
		}
	}
	return out;
}
}
#ifdef TEST_UTF8
#include <cstdio>
#include <iostream>

//Format a state word as four upper-case hex digits.
//Returns a pointer to a static buffer that is overwritten by the next call.
char* format_dword(uint16_t s)
{
	static char buf[32];
	snprintf(buf, sizeof(buf), "%04X", static_cast<unsigned>(s));
	return buf;
}

//Interactive test driver: reads byte values as decimal integers from standard input (-1 meaning end
//of input), feeds each to utf8::parse_byte() and prints the result and the state word.
//Stops once -1 has been fed and the decoder had nothing more to deliver.
int main()
{
	uint16_t s = utf8::initial_state;
	while(true) {
		int c;
		int32_t d;
		//Exhausted or unparseable input counts as end of input. Without this check the
		//loop would never terminate when standard input ends without an explicit -1.
		if(!(std::cin >> c))
			c = -1;
		d = utf8::parse_byte(c, s);
		std::cout << "> " << d << " (status word=" << format_dword(s) << ")" << std::endl;
		if(c == -1 && d == -1)
			return 0;
	}
	return 0;
}
#endif