Use std::u32string instead of std::vector<uint32_t> for UTF-32 strings
[lsnes.git] / include / library / utf8.hpp
blob9c05a8a7cdf97d3cb6bde68a58a8d55cf42d60ed
1 #ifndef _library__utf8__hpp__included__
2 #define _library__utf8__hpp__included__
4 #include <iostream>
5 #include <cstdint>
6 #include <cstdlib>
7 #include <string>
/**
 * Initial state for UTF-8 parser.
 *
 * Use this value to initialize the state variable passed to utf8_parse_byte().
 */
extern const uint16_t utf8_initial_state;
/**
 * Parse a byte.
 *
 * Parameter ch: The character to parse. -1 for end of string.
 * Parameter state: The state. Mutated.
 * Returns: The codepoint, or -1 if no codepoint emitted.
 *
 * Note: When called with EOF, max 1 codepoint can be emitted.
 * Note: Declared noexcept; this is compatible with a definition that still uses the
 *       older non-throwing specification throw().
 */
int32_t utf8_parse_byte(int ch, uint16_t& state) noexcept;
/**
 * Return length of string in UTF-8 codepoints.
 *
 * Parameter str: The string.
 * Returns: The length in codepoints.
 *
 * Note: Declared noexcept; this is compatible with a definition that still uses the
 *       older non-throwing specification throw().
 */
size_t utf8_strlen(const std::string& str) noexcept;
/**
 * Transform UTF-8 into UTF-32.
 *
 * Parameter utf8: The UTF-8 encoded string.
 * Returns: The string as UTF-32.
 */
std::u32string to_u32string(const std::string& utf8);
/**
 * Transform UTF-32 into UTF-8.
 *
 * Parameter utf32: The UTF-32 string.
 * Returns: The string encoded as UTF-8.
 */
std::string to_u8string(const std::u32string& utf32);
41 /**
42 * Iterator copy from UTF-8 to UTF-32
44 template<typename srcitr, typename dstitr>
45 inline void copy_from_utf8(srcitr begin, srcitr end, dstitr target)
47 uint16_t state = utf8_initial_state;
48 for(srcitr i = begin; i != end; i++) {
49 int32_t x = utf8_parse_byte((unsigned char)*i, state);
50 if(x >= 0) {
51 *target = x;
52 ++target;
55 int32_t x = utf8_parse_byte(-1, state);
56 if(x >= 0) {
57 *target = x;
58 ++target;
62 #endif