18 //Second nibble values:
19 //0 => Return NO CHARACTER and transition to another state with substate 0.
20 //1 => Return the character and transition to another state with substate 0.
21 //2 => Return invalid character and transition to another state with substate 0.
22 //3 => Memorize character minus 192, return NO CHARACTER and transition to another state.
23 //4 => Memorize character minus 224, return NO CHARACTER and transition to another state.
24 //5 => Memorize character minus 240, return NO CHARACTER and transition to another state.
25 //6 => Memorize byte, return invalid character and transition to another state.
26 //7 => Return 2-byte value and transition to another state.
27 //8 => Combine memorized, return NO CHARACTER and transition to another state.
28 //9 => Return 3-byte value and transition to another state.
29 //A => Return 4-byte value and transition to another state.
30 //B => Handle memorized character and EOF.
31 //C => Handle memorized character and continuation.
//Flat lookup table driving parse_byte(): 8 rows (decoder states) of 7 columns (input classes),
//indexed as transitions[astate * 7 + iclass].
//Columns follow the iclass values computed in parse_byte():
//  E = 0 (ch < 0), 1 = 1 (ch < 128), C = 2 (ch < 192), 2 = 3 (ch < 224), 3 = 4 (ch < 240),
//  4 = 5 (ch < 248); I is presumably class 6 for the remaining bytes (that assignment is not visible).
//Each entry packs two nibbles: the high nibble becomes the top nibble of the new state word
//(parse_byte() stores (ctrl & 0xF0) * 256), the low nibble is the action code from the list above.
//Entry 0x88 in row S_3_4 yields top nibble 8; parse_byte() clamps any row index above 7 to 7, so it
//lands on the last row (S_4_4).
32 const unsigned char transitions
[] = {
33 //E //1 //C //2 //3 //4 //I
34 0x00, 0x01, 0x02, 0x13, 0x24, 0x35, 0x02, //INITIAL
35 0x01, 0x66, 0x07, 0x66, 0x66, 0x66, 0x66, //S_2_2
36 0x01, 0x66, 0x48, 0x66, 0x66, 0x66, 0x66, //S_2_3
37 0x01, 0x66, 0x58, 0x66, 0x66, 0x66, 0x66, //S_2_4
38 0x01, 0x66, 0x09, 0x66, 0x66, 0x66, 0x66, //S_3_3
39 0x01, 0x66, 0x88, 0x66, 0x66, 0x66, 0x66, //S_3_4
40 0x0B, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C, //INIT_RE
41 0x01, 0x66, 0x0A, 0x66, 0x66, 0x66, 0x66 //S_4_4
//State word a caller starts decoding with. Zero has top nibble 0, which parse_byte() uses as the
//row index into transitions, i.e. the row labelled INITIAL, with no memorized bits.
45 extern const uint16_t initial_state
= 0;
//Feed one input unit to the table-driven UTF-8 decoder.
//  ch:    the next byte as a non-negative value, or a negative value for end of input
//         (callers in this file pass -1 after the last byte).
//  state: 16-bit status word, updated in place. The top nibble selects the row of transitions;
//         the remaining bits carry memorized payload.
//Callers in this file treat a result >= 0 as a decoded character and a negative result as
//"no character yet".
//NOTE(review): the dispatch on the action code (low nibble of ctrl), its case labels, the return
//statements and the declarations of iclass and tmp are not among the visible lines; the comments
//below describe only what the visible statements compute.
47 int32_t parse_byte(int ch
, uint16_t& state
) throw()
//Byte as memorized by the error-recovery paths: values >= 248 collapse to 248 (0xF8).
49 unsigned char mch
= (ch
< 248) ? ch
: 248;
//Row index = top nibble of the state word.
50 uint32_t astate
= state
>> 12;
//Top nibbles above 7 (the table's 0x88 entry yields 8) all map to the last row.
53 if(astate
> 7) astate
= 7;
//Classify the input into a column index: 0 end of input, 1 ASCII, 2 continuation byte,
//3/4/5 lead byte of a 2-/3-/4-byte sequence.
//NOTE(review): the fallback for ch >= 248 is not visible; presumably class 6.
54 if(ch
< 0) iclass
= 0;
55 else if(ch
< 128) iclass
= 1;
56 else if(ch
< 192) iclass
= 2;
57 else if(ch
< 224) iclass
= 3;
58 else if(ch
< 240) iclass
= 4;
59 else if(ch
< 248) iclass
= 5;
//Control byte: high nibble = next state, low nibble = action code.
61 unsigned char ctrl
= transitions
[astate
* 7 + iclass
];
//(ctrl & 0xF0) * 256 moves the next-state nibble into bits 12..15 and clears the payload bits.
//The three identical assignments presumably belong to actions 0, 1 and 2.
65 state
= (ctrl
& 0xF0) * 256;
68 state
= (ctrl
& 0xF0) * 256;
71 state
= (ctrl
& 0xF0) * 256;
//Lead bytes: store the payload bits (byte minus its 192/224/240 base) in the low bits.
74 state
= (ctrl
& 0xF0) * 256 + ch
- 192;
77 state
= (ctrl
& 0xF0) * 256 + ch
- 224;
80 state
= (ctrl
& 0xF0) * 256 + ch
- 240;
//Memorize the (clamped) byte itself so it can be reprocessed by the INIT_RE row.
83 state
= (ctrl
& 0xF0) * 256 + mch
;
//Append the 6 payload bits of a continuation byte to the memorized lead-byte bits.
86 tmp
= (state
& 0xFFF) * 64 + ch
- 128;
89 state
= (ctrl
& 0xF0) * 256;
//Intermediate continuation byte: keep accumulating in the state word.
92 state
= (ctrl
& 0xF0) * 256 + (state
& 0xFFF) * 64 + ch
- 128;
95 tmp
= (state
& 0xFFF) * 64 + ch
- 128;
//Tests for an overlong form (< 0x800), a surrogate (0xD800..0xDFFF) or a value whose low 16 bits
//are 0xFFFE/0xFFFF. NOTE(review): the statement guarded by this test is not visible.
96 if(tmp
< 0x800 || (tmp
& 0xF800) == 0xD800 || (tmp
& 0xFFFE) == 0xFFFE)
98 state
= (ctrl
& 0xF0) * 256;
//Last row: 15 bits of the state word hold payload here, hence the wider mask.
101 tmp
= (state
& 0x7FFF) * 64 + ch
- 128;
//Tests for a value below 0x10000, above 0x10FFFD, or with low 16 bits 0xFFFE/0xFFFF.
//NOTE(review): the statement guarded by this test is not visible.
102 if(tmp
< 0x10000 || tmp
> 0x10FFFD || (tmp
& 0xFFFE) == 0xFFFE)
104 state
= (ctrl
& 0xF0) * 256;
111 state
= (ctrl
& 0xF0) * 256;
//From here on the low byte of state is a previously memorized byte and ch is the input after it.
//Memorized byte has bit 7 clear (ASCII): memorize the new byte in its place.
115 if((state
& 0x80) == 0) {
117 state
= 0x6000 + mch
;
//Memorized byte is 0xF8..0xFF (the clamp above stores 0xF8 for bytes >= 248) or 0x80..0x87.
119 } else if((state
& 0xF8) == 0xF8 || (state
& 0xF8) == 0x80) {
120 //Continuation or invalid.
121 state
= 0x6000 + mch
;
//End of input after a memorized lead byte. NOTE(review): the body of this branch is not visible.
123 } else if(iclass
== 0) {
//Anything other than a continuation byte: the memorized lead is dropped and the new byte memorized.
127 } else if(iclass
!= 2) {
129 state
= 0x6000 + mch
;
//Remaining branches: ch is a continuation byte (iclass == 2) and the memorized byte is a lead byte.
131 } else if((state
& 0xE0) == 0xC0) {
132 //Complete 2-byte sequence.
133 tmp
= (state
& 0x1F) * 64 + (ch
& 0x3F);
138 } else if((state
& 0xF0) == 0xE0) {
139 //First 2 bytes of 3-byte sequence.
//0x4000 selects row 4 of the table (S_3_3).
140 state
= 0x4000 + (state
& 0x0F) * 64 + (ch
& 0x3F);
142 } else if((state
& 0xF8) == 0xF0) {
143 //First 2 bytes of 4-byte sequence.
//0x5000 selects row 5 of the table (S_3_4).
144 state
= 0x5000 + (state
& 0x07) * 64 + (ch
& 0x3F);
151 size_t strlen(const std::string
& str
) throw()
153 uint16_t s
= initial_state
;
155 for(size_t i
= 0; i
< str
.length(); i
++)
156 if(parse_byte(static_cast<uint8_t>(str
[i
]), s
) >= 0)
158 if(parse_byte(-1, s
) >= 0)
163 std::u32string
to32(const std::string
& utf8
)
166 x
.resize(strlen(utf8
));
167 to32i(utf8
.begin(), utf8
.end(), x
.begin());
//Encode a UTF-32 string as UTF-8.
//  utf32: code points to encode.
//Returns the concatenated 1- to 4-byte encodings. Values above 0x10FFFF cannot be represented and
//are skipped. No filtering of surrogates or noncharacters is performed; each value is encoded
//purely by magnitude.
std::string to8(const std::u32string& utf32)
{
	std::string out;
	//Every code point needs at least one byte, so this is a lower bound that avoids early regrowth.
	out.reserve(utf32.size());
	//Append the low 8 bits of v as one byte.
	auto put = [&out](char32_t v) {
		out.push_back(static_cast<char>(static_cast<unsigned char>(v)));
	};
	for(const char32_t cp : utf32) {
		if(cp < 0x80) {
			put(cp);
		} else if(cp < 0x800) {
			put(0xC0 + (cp >> 6));
			put(0x80 + (cp & 0x3F));
		} else if(cp < 0x10000) {
			put(0xE0 + (cp >> 12));
			put(0x80 + ((cp >> 6) & 0x3F));
			put(0x80 + (cp & 0x3F));
		} else if(cp <= 0x10FFFF) {
			//Inclusive bound: U+10FFFF is the last code point and has a valid 4-byte form.
			put(0xF0 + (cp >> 18));
			put(0x80 + ((cp >> 12) & 0x3F));
			put(0x80 + ((cp >> 6) & 0x3F));
			put(0x80 + (cp & 0x3F));
		}
	}
	return out;
}
//Render a 16-bit value as upper-case hexadecimal, zero-padded to at least four digits ("%04X"),
//into buf. The driver further down uses it to print the decoder's status word.
//NOTE(review): the declaration of buf and the return statement are not among the visible lines;
//since a char* is returned, buf presumably has static storage duration - confirm.
193 char* format_dword(uint16_t s
)
196 sprintf(buf
, "%04X", s
);
202 uint16_t s
= utf8::initial_state
;
207 d
= utf8::parse_byte(c
, s
);
208 std::cout
<< "> " << d
<< " (status word=" << format_dword(s
) << ")" << std::endl
;
209 if(c
== -1 && d
== -1)