1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/utf_string_conversion_utils.h"
7 #include "base/third_party/icu/icu_utf.h"
11 // ReadUnicodeCharacter --------------------------------------------------------
// Decodes the code point found at |src[*char_index]| with CBU8_NEXT, stores
// it in |*code_point_out| converted to uint32, and returns the result of
// IsValidCodepoint() on the decoded value.
// NOTE(review): |src_len|, |char_index| and the local |code_point| are used
// below but their declarations are not visible in this view; confirm their
// types against the header before relying on this description.
13 bool ReadUnicodeCharacter(const char* src
,
16 uint32
* code_point_out
) {
17 // U8_NEXT expects to be able to use -1 to signal an error, so we must
18 // use a signed type for code_point. But this function returns false
19 // on error anyway, so code_point_out is unsigned.
21 CBU8_NEXT(src
, *char_index
, src_len
, code_point
);
22 *code_point_out
= static_cast<uint32
>(code_point
);
24 // The ICU macro above moves to the next char, we want to point to the last
28 // Validate the decoded value.
29 return IsValidCodepoint(code_point
);
// Reads one code point from the 16-bit buffer |src| starting at
// |*char_index| and stores it in |*code_point|. A surrogate at that position
// is only accepted as the lead half of a pair whose trail half is the next
// element and lies inside |src_len|; any other element is taken as-is.
// Returns the result of IsValidCodepoint() on the stored value.
// NOTE(review): the statements executed for an invalid pair, and any update
// of |*char_index| after a valid pair, are not visible in this view.
32 bool ReadUnicodeCharacter(const char16
* src
,
36 if (CBU16_IS_SURROGATE(src
[*char_index
])) {
37 if (!CBU16_IS_SURROGATE_LEAD(src
[*char_index
]) ||
38 *char_index
+ 1 >= src_len
||
39 !CBU16_IS_TRAIL(src
[*char_index
+ 1])) {
40 // Invalid surrogate pair: not a lead surrogate, no element left after it,
// or the following element is not a trail surrogate.
44 // Valid surrogate pair: combine the lead and trail halves.
45 *code_point
= CBU16_GET_SUPPLEMENTARY(src
[*char_index
],
46 src
[*char_index
+ 1]);
49 // Not a surrogate, just one 16-bit word.
50 *code_point
= src
[*char_index
];
53 return IsValidCodepoint(*code_point
);
56 #if defined(WCHAR_T_IS_UTF32)
// wchar_t overload, compiled only when WCHAR_T_IS_UTF32 is defined. Copies
// the element at |src[*char_index]| into |*code_point| without any decoding
// and returns the result of IsValidCodepoint() on it.
57 bool ReadUnicodeCharacter(const wchar_t* src
,
61 // Conversion is easy since the source is 32-bit.
62 *code_point
= src
[*char_index
];
64 // Validate the value.
65 return IsValidCodepoint(*code_point
);
67 #endif // defined(WCHAR_T_IS_UTF32)
69 // WriteUnicodeCharacter -------------------------------------------------------
// Appends |code_point| to |*output| in its 8-bit encoded form.
// Values up to 0x7f are pushed as a single byte. For anything larger the
// string is temporarily grown by CBU8_MAX_LENGTH, CBU8_APPEND_UNSAFE writes
// at the old end, and the string is then trimmed to the offset the macro
// left behind. The value returned on that path is the number of bytes added.
// NOTE(review): the return statement of the one-byte fast path is not
// visible in this view.
71 size_t WriteUnicodeCharacter(uint32 code_point
, std::string
* output
) {
72 if (code_point
<= 0x7f) {
73 // Fast path the common case of one byte.
74 output
->push_back(code_point
);
79 // CBU8_APPEND_UNSAFE can append up to 4 bytes, so make room for the
// maximum first and remember where the string ended before the append.
80 size_t char_offset
= output
->length();
81 size_t original_char_offset
= char_offset
;
82 output
->resize(char_offset
+ CBU8_MAX_LENGTH
);
84 CBU8_APPEND_UNSAFE(&(*output
)[0], char_offset
, code_point
);
86 // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
87 // it will represent the new length of the string.
88 output
->resize(char_offset
);
89 return char_offset
- original_char_offset
;
// Appends |code_point| to |*output| as 16-bit units.
// When CBU16_LENGTH(code_point) is 1 the value is pushed as a single char16.
// Otherwise the string is grown by CBU16_MAX_LENGTH, CBU16_APPEND_UNSAFE
// fills the new elements, and CBU16_MAX_LENGTH is returned.
// NOTE(review): the return statement of the single-unit path is not visible
// in this view.
92 size_t WriteUnicodeCharacter(uint32 code_point
, string16
* output
) {
93 if (CBU16_LENGTH(code_point
) == 1) {
94 // This code point is in the Basic Multilingual Plane (BMP).
95 output
->push_back(static_cast<char16
>(code_point
));
98 // Non-BMP characters use a double-character encoding.
99 size_t char_offset
= output
->length();
100 output
->resize(char_offset
+ CBU16_MAX_LENGTH
);
101 CBU16_APPEND_UNSAFE(&(*output
)[0], char_offset
, code_point
);
102 return CBU16_MAX_LENGTH
;
105 // Generalized Unicode converter -----------------------------------------------
// Reserves capacity in |*output| ahead of a conversion to UTF-8: |src_len|
// bytes when the input is assumed to be ASCII, |src_len| * 3 bytes when it
// is assumed to be entirely non-ASCII.
// NOTE(review): the condition that chooses between the two reservations is
// not visible in this view.
107 template<typename CHAR
>
108 void PrepareForUTF8Output(const CHAR
* src
,
110 std::string
* output
) {
115 // Assume that the entire input will be ASCII.
116 output
->reserve(src_len
);
118 // Assume that the entire input is non-ASCII and will have 3 bytes per char.
119 output
->reserve(src_len
* 3);
123 // Instantiate versions we know callers will need (wchar_t and char16
// input).
124 template void PrepareForUTF8Output(const wchar_t*, size_t, std::string
*);
125 template void PrepareForUTF8Output(const char16
*, size_t, std::string
*);
// Reserves capacity in |*output| ahead of a conversion from the 8-bit input
// |src|. The first byte alone decides the estimate: below 0x80 the input is
// assumed to be all ASCII and |src_len| elements are reserved, otherwise
// |src_len| / 2 elements are reserved.
// NOTE(review): the declaration of |output| and any handling that precedes
// the first-byte test are not visible in this view.
127 template<typename STRING
>
128 void PrepareForUTF16Or32Output(const char* src
,
134 if (static_cast<unsigned char>(src
[0]) < 0x80) {
135 // Assume the input is all ASCII, which means 1:1 correspondence.
136 output
->reserve(src_len
);
138 // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
140 output
->reserve(src_len
/ 2);
144 // Instantiate versions we know callers will need (std::wstring and
// string16 output).
145 template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring
*);
146 template void PrepareForUTF16Or32Output(const char*, size_t, string16
*);