16 //Second (low) nibble values select the action; the first (high) nibble is the next state:
17 //0 => Return NO CHARACTER and transition to another state with substate 0.
18 //1 => Return the character and transition to another state with substate 0.
19 //2 => Return invalid character and transition to another state with substate 0.
20 //3 => Memorize character minus 192, return NO CHARACTER and transition to another state.
21 //4 => Memorize character minus 224, return NO CHARACTER and transition to another state.
22 //5 => Memorize character minus 240, return NO CHARACTER and transition to another state.
23 //6 => Memorize byte, return invalid character and transition to another state.
24 //7 => Return 2-byte value and transition to another state.
25 //8 => Combine memorized, return NO CHARACTER and transition to another state.
26 //9 => Return 3-byte value and transition to another state.
27 //A => Return 4-byte value and transition to another state.
28 //B => Handle memorized character and EOF.
29 //C => Handle memorized character and continuation.
//State transition table of the UTF-8 decoder, indexed as transitions[row * 7 + column].
//Rows are decoder states in the order named at the end of each line (INITIAL = 0 ... S_4_4 = 7).
//Columns are input classes 0-6, abbreviated in the header line below:
//  E = negative input, 1 = 0-127, C = 128-191, 2 = 192-223, 3 = 224-239, 4 = 240-247,
//  I = the remaining column (NOTE(review): presumably inputs of 248 and above -- confirm).
//In every entry the high nibble is the next state (it ends up in the top 4 bits of the status
//word) and the low nibble is the action to perform.
30 const unsigned char transitions
[] = {
31 //E //1 //C //2 //3 //4 //I
32 0x00, 0x01, 0x02, 0x13, 0x24, 0x35, 0x02, //INITIAL
33 0x01, 0x66, 0x07, 0x66, 0x66, 0x66, 0x66, //S_2_2
34 0x01, 0x66, 0x48, 0x66, 0x66, 0x66, 0x66, //S_2_3
35 0x01, 0x66, 0x58, 0x66, 0x66, 0x66, 0x66, //S_2_4
36 0x01, 0x66, 0x09, 0x66, 0x66, 0x66, 0x66, //S_3_3
37 0x01, 0x66, 0x88, 0x66, 0x66, 0x66, 0x66, //S_3_4
38 0x0B, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C, 0x6C, //INIT_RE
39 0x01, 0x66, 0x0A, 0x66, 0x66, 0x66, 0x66 //S_4_4
//Status word a decoder starts from. Its top 4 bits are 0, which selects the INITIAL row of the
//transition table.
43 extern const uint16_t utf8_initial_state
= 0;
//Feed one input value to the table-driven UTF-8 decoder.
//
//Parameters:
//  ch:    The value to process. Negative values select input class 0 (the 'E' column of the
//         transition table); values 0-247 are classified by range into classes 1-5.
//  state: Decoder status word, read and updated in place. The top 4 bits select the table row
//         (values above 7 are treated as 7); the lower bits carry memorized data between calls.
//
//Returns: an int32_t that callers in this file compare against 0 (>= 0 is counted as a
//  character, -1 is tested by the driver). NOTE(review): presumably the decoded code point or a
//  negative "no character" marker -- confirm against the return statements.
//
//NOTE(review): the dynamic exception specification throw() is deprecated since C++11 and removed
//  in C++20; noexcept is the modern spelling.
45 int32_t utf8_parse_byte(int ch
, uint16_t& state
) throw()
//Copy of ch in which every value of 248 and above collapses to 248.
47 unsigned char mch
= (ch
< 248) ? ch
: 248;
//Row selector: the top 4 bits of the status word, capped at 7 (the last table row).
48 uint32_t astate
= state
>> 12;
51 if(astate
> 7) astate
= 7;
//Classify the input into a table column.
52 if(ch
< 0) iclass
= 0;
53 else if(ch
< 128) iclass
= 1;
54 else if(ch
< 192) iclass
= 2;
55 else if(ch
< 224) iclass
= 3;
56 else if(ch
< 240) iclass
= 4;
57 else if(ch
< 248) iclass
= 5;
//Control byte for this (state, input class) pair: high nibble = next state, low nibble = action.
59 unsigned char ctrl
= transitions
[astate
* 7 + iclass
];
//In the assignments below, (ctrl & 0xF0) * 256 moves the next-state nibble into the top 4 bits
//of the status word; anything added to it is data memorized for the following call.
//NOTE(review): each assignment presumably belongs to one action (low nibble of ctrl) -- confirm.
63 state
= (ctrl
& 0xF0) * 256;
66 state
= (ctrl
& 0xF0) * 256;
69 state
= (ctrl
& 0xF0) * 256;
72 state
= (ctrl
& 0xF0) * 256 + ch
- 192;
75 state
= (ctrl
& 0xF0) * 256 + ch
- 224;
78 state
= (ctrl
& 0xF0) * 256 + ch
- 240;
81 state
= (ctrl
& 0xF0) * 256 + mch
;
//Append the 6 payload bits of the byte (ch - 128) to the memorized low 12 bits.
84 tmp
= (state
& 0xFFF) * 64 + ch
- 128;
87 state
= (ctrl
& 0xF0) * 256;
//Keep the combined value inside the status word; it is added to the next-state bits, so a large
//value carries into the top nibble (rows above 7 are later read as row 7).
90 state
= (ctrl
& 0xF0) * 256 + (state
& 0xFFF) * 64 + ch
- 128;
93 tmp
= (state
& 0xFFF) * 64 + ch
- 128;
//True for values below 0x800, in 0xD800-0xDFFF, or whose low 16 bits are 0xFFFE/0xFFFF.
//NOTE(review): presumably these are reported as invalid -- confirm.
94 if(tmp
< 0x800 || (tmp
& 0xF800) == 0xD800 || (tmp
& 0xFFFE) == 0xFFFE)
96 state
= (ctrl
& 0xF0) * 256;
//Here 15 low bits of the status word hold the memorized value.
99 tmp
= (state
& 0x7FFF) * 64 + ch
- 128;
//True for values below 0x10000, above 0x10FFFD, or whose low 16 bits are 0xFFFE/0xFFFF.
//NOTE(review): presumably these are reported as invalid -- confirm.
100 if(tmp
< 0x10000 || tmp
> 0x10FFFD || (tmp
& 0xFFFE) == 0xFFFE)
102 state
= (ctrl
& 0xF0) * 256;
109 state
= (ctrl
& 0xF0) * 256;
//From here on the low byte of the status word is inspected as a previously memorized byte, and
//0x6000 + mch memorizes the current input in row 6 (INIT_RE) for the next call.
113 if((state
& 0x80) == 0) {
115 state
= 0x6000 + mch
;
117 } else if((state
& 0xF8) == 0xF8 || (state
& 0xF8) == 0x80) {
118 //Continuation or invalid.
119 state
= 0x6000 + mch
;
121 } else if(iclass
== 0) {
125 } else if(iclass
!= 2) {
127 state
= 0x6000 + mch
;
129 } else if((state
& 0xE0) == 0xC0) {
130 //Complete 2-byte sequence.
131 tmp
= (state
& 0x1F) * 64 + (ch
& 0x3F);
136 } else if((state
& 0xF0) == 0xE0) {
137 //First 2 bytes of 3-byte sequence.
138 state
= 0x4000 + (state
& 0x0F) * 64 + (ch
& 0x3F);
140 } else if((state
& 0xF8) == 0xF0) {
141 //First 2 bytes of 4-byte sequence.
142 state
= 0x5000 + (state
& 0x07) * 64 + (ch
& 0x3F);
//Run the decoder over every byte of str, starting from utf8_initial_state, and then feed one
//final -1 after the last byte.
//
//Parameters:
//  str: The byte string to scan.
//
//NOTE(review): each ">= 0" result presumably increments the returned character count -- confirm
//  against the guarded statements.
149 size_t utf8_strlen(const std::string
& str
) throw()
151 uint16_t s
= utf8_initial_state
;
153 for(size_t i
= 0; i
< str
.length(); i
++)
//Cast to uint8_t so bytes with the high bit set are passed as 128-255 rather than as negative
//values, which the decoder classifies separately.
154 if(utf8_parse_byte(static_cast<uint8_t>(str
[i
]), s
) >= 0)
//Final call with a negative input, made once after all bytes have been fed.
156 if(utf8_parse_byte(-1, s
) >= 0)
//Convert a UTF-8 byte string into a std::u32string.
//
//Parameters:
//  utf8: The input byte string.
//
//The output is first sized to utf8_strlen(utf8) elements and then filled by copy_from_utf8 over
//the whole input range.
//NOTE(review): copy_from_utf8 is defined elsewhere; presumably it writes exactly as many
//  elements as utf8_strlen counts -- confirm.
161 std::u32string
to_u32string(const std::string
& utf8
)
164 x
.resize(utf8_strlen(utf8
));
165 copy_from_utf8(utf8
.begin(), utf8
.end(), x
.begin());
//Encode a sequence of 32-bit code points as UTF-8 bytes.
//
//Parameters:
//  utf32: The code points to encode.
//
//Each element is written to an output string stream as a 1-, 2-, 3- or 4-byte sequence.
169 std::string
to_u8string(const std::u32string
& utf32
)
171 std::ostringstream s
;
172 for(auto i
: utf32
) {
//One byte: the value itself.
174 s
<< (unsigned char)i
;
//Two bytes: 0xC0 lead with the bits above the low 6, then one 0x80 continuation byte.
176 s
<< (unsigned char)(0xC0 + (i
>> 6)) << (unsigned char)(0x80 + (i
& 0x3F));
//Three bytes: 0xE0 lead with the bits above the low 12, then two 0x80 continuation bytes.
178 s
<< (unsigned char)(0xE0 + (i
>> 12)) << (unsigned char)(0x80 + ((i
>> 6) & 0x3F))
179 << (unsigned char)(0x80 + (i
& 0x3F));
//Four bytes: 0xF0 lead with the bits above the low 18, then three 0x80 continuation bytes.
//NOTE(review): the bound is exclusive, so the value 0x10FFFF itself is not encoded by this
//  branch -- confirm that this is intended.
180 else if(i
< 0x10FFFF)
181 s
<< (unsigned char)(0xF0 + (i
>> 18)) << (unsigned char)(0x80 + ((i
>> 12) & 0x3F))
182 << (unsigned char)(0x80 + ((i
>> 6) & 0x3F))
183 << (unsigned char)(0x80 + (i
& 0x3F));
//Format a 16-bit status word as uppercase hexadecimal, zero-padded to at least 4 digits.
//
//Parameters:
//  s: The value to format.
//
//NOTE(review): the result is written into buf with sprintf; presumably buf is a static buffer
//  whose address is returned, so the text would be overwritten by the next call -- confirm.
190 char* format_dword(uint16_t s
)
193 sprintf(buf
, "%04X", s
);
//Interactive check of utf8_parse_byte: start from the decoder's initial status word.
199 uint16_t s
= utf8_initial_state
;
//Decode one input value c, then print the result and the updated status word as hex.
204 d
= utf8_parse_byte(c
, s
);
205 std::cout
<< "> " << d
<< " (status word=" << format_dword(s
) << ")" << std::endl
;
//Input -1 that also produced -1.
//NOTE(review): presumably this is the condition that ends the driver -- confirm.
206 if(c
== -1 && d
== -1)