1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/icu_string_conversions.h"
9 #include "base/basictypes.h"
10 #include "base/logging.h"
11 #include "base/string_util.h"
12 #include "unicode/ucnv.h"
13 #include "unicode/ucnv_cb.h"
14 #include "unicode/ucnv_err.h"
15 #include "unicode/ustring.h"
20 // ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
21 // in source/common/ucnv_err.c.
23 // Copyright (c) 1995-2006 International Business Machines Corporation
26 // All rights reserved.
29 // Permission is hereby granted, free of charge, to any person obtaining a
30 // copy of this software and associated documentation files (the "Software"),
31 // to deal in the Software without restriction, including without limitation
32 // the rights to use, copy, modify, merge, publish, distribute, and/or
33 // sell copies of the Software, and to permit persons to whom the Software
34 // is furnished to do so, provided that the above copyright notice(s) and
35 // this permission notice appear in all copies of the Software and that
36 // both the above copyright notice(s) and this permission notice appear in
37 // supporting documentation.
39 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
40 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
41 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
42 // OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
43 // INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
44 // OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
45 // OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
46 // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
47 // OR PERFORMANCE OF THIS SOFTWARE.
49 // Except as contained in this notice, the name of a copyright holder
50 // shall not be used in advertising or otherwise to promote the sale, use
51 // or other dealings in this Software without prior written authorization
52 // of the copyright holder.
54 // ___________________________________________________________________________
56 // All trademarks and registered trademarks mentioned herein are the property
57 // of their respective owners.
// Converter-to-Unicode callback that writes a single U+FFFD in place of the
// offending input.
// The replacement is written through ucnv_cbToUWriteUChars() only when
// |reason| <= UCNV_IRREGULAR and either |context| is NULL, or the first byte
// of |context| is 'i' and |reason| is UCNV_UNASSIGNED. In every other case
// nothing is written by the code shown here.
// NOTE(review): |err| is used below but no declaration of it is visible in
// this listing, and the closing braces are not shown either -- confirm the
// full signature and body against the original file.
59 void ToUnicodeCallbackSubstitute(const void* context
,
60 UConverterToUnicodeArgs
*to_args
,
61 const char* code_units
,
63 UConverterCallbackReason reason
,
// U+FFFD REPLACEMENT CHARACTER; exactly one code unit of it is written below.
65 static const UChar kReplacementChar
= 0xFFFD;
// Reasons above UCNV_IRREGULAR are skipped (see the trailing comment about
// reset, close and clone calls).
66 if (reason
<= UCNV_IRREGULAR
) {
// |context| is read as a pointer to char and only its first byte is examined.
67 if (context
== NULL
||
68 (*(reinterpret_cast<const char*>(context
)) == 'i' &&
69 reason
== UCNV_UNASSIGNED
)) {
// Append one UChar (length 1, offset 0) to the converter's output.
71 ucnv_cbToUWriteUChars(to_args
, &kReplacementChar
, 1, 0, err
);
73 // else the caller must have set the error code accordingly.
75 // else ignore the reset, close and clone calls.
// Converts |uchar_len| UTF-16 code units starting at |uchar_src| into bytes
// using |converter|, storing the result in |encoded|.
//
// |converter| is closed with ucnv_close() inside this function, so the caller
// must not use it afterwards.
// |on_error| selects the from-Unicode callback: FAIL installs
// UCNV_FROM_U_CALLBACK_STOP; SKIP and SUBSTITUTE both install
// UCNV_FROM_U_CALLBACK_SKIP.
// |encoded| is first grown to the worst-case size, then shrunk to the size
// reported by ucnv_fromUChars(); it is cleared on the error path.
// NOTE(review): the switch header, the trailing arguments of both
// ucnv_setFromUCallBack() calls and the return statements are not visible in
// this listing -- confirm against the original file.
78 bool ConvertFromUTF16(UConverter
* converter
, const UChar
* uchar_src
,
79 int uchar_len
, OnStringConversionError::Type on_error
,
80 std::string
* encoded
) {
// Upper bound on the output size: derived from the input length and the
// converter's maximum bytes per character.
81 int encoded_max_length
= UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len
,
82 ucnv_getMaxCharSize(converter
));
83 encoded
->resize(encoded_max_length
);
85 UErrorCode status
= U_ZERO_ERROR
;
87 // Set up our error handler.
89 case OnStringConversionError::FAIL
:
90 ucnv_setFromUCallBack(converter
, UCNV_FROM_U_CALLBACK_STOP
, 0,
// SKIP and SUBSTITUTE share one handler in this direction.
93 case OnStringConversionError::SKIP
:
94 case OnStringConversionError::SUBSTITUTE
:
95 ucnv_setFromUCallBack(converter
, UCNV_FROM_U_CALLBACK_SKIP
, 0,
102 // ucnv_fromUChars returns the size, not including the terminating null.
// The conversion writes directly into the storage of |encoded|.
103 int actual_size
= ucnv_fromUChars(converter
, &(*encoded
)[0],
104 encoded_max_length
, uchar_src
, uchar_len
, &status
);
105 encoded
->resize(actual_size
);
106 ucnv_close(converter
);
// NOTE(review): the statement controlled by this `if` is not visible in this
// listing; the clear() below is documented as the error path -- confirm.
107 if (U_SUCCESS(status
))
109 encoded
->clear(); // Make sure the output is empty on error.
113 // Set up our error handler for to-UTF-16 converters.
// Installs a to-Unicode callback on |converter| according to |on_error|:
//   FAIL       -> UCNV_TO_U_CALLBACK_STOP
//   SKIP       -> UCNV_TO_U_CALLBACK_SKIP
//   SUBSTITUTE -> ToUnicodeCallbackSubstitute
// Each call passes 0 as the new callback context.
// NOTE(review): the switch header and the trailing arguments of each
// ucnv_setToUCallBack() call are not visible in this listing; |status| is
// presumably forwarded as the error-code argument -- confirm.
114 void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error
,
115 UConverter
* converter
, UErrorCode
* status
) {
117 case OnStringConversionError::FAIL
:
118 ucnv_setToUCallBack(converter
, UCNV_TO_U_CALLBACK_STOP
, 0,
121 case OnStringConversionError::SKIP
:
122 ucnv_setToUCallBack(converter
, UCNV_TO_U_CALLBACK_SKIP
, 0,
125 case OnStringConversionError::SUBSTITUTE
:
126 ucnv_setToUCallBack(converter
, ToUnicodeCallbackSubstitute
, 0,
// Returns a UTF-32 converter type: either UCNV_UTF32_BigEndian or
// UCNV_UTF32_LittleEndian.
// NOTE(review): the condition that selects between the two returns is not
// visible in this listing (presumably a compile-time check of the platform
// byte order, going by the function name) -- confirm.
134 inline UConverterType
utf32_platform_endian() {
136 return UCNV_UTF32_BigEndian
;
138 return UCNV_UTF32_LittleEndian
;
// String names of four character encodings, stored as NUL-terminated char
// arrays.
// NOTE(review): presumably meant to be passed as the |codepage_name| argument
// of the conversion functions in this file; no use is visible here -- confirm
// against callers.
const char kCodepageLatin1[] = "ISO-8859-1";
const char kCodepageUTF8[] = "UTF-8";
const char kCodepageUTF16BE[] = "UTF-16BE";
const char kCodepageUTF16LE[] = "UTF-16LE";
149 // Codepage <-> Wide/UTF-16 ---------------------------------------------------
// Encodes the UTF-16 string |utf16| into the code page named |codepage_name|
// and stores the bytes in |encoded|.
// Opens an ICU converter with ucnv_open() and hands it, together with the
// characters of |utf16| and |on_error|, to ConvertFromUTF16(); that function's
// result is returned.
// NOTE(review): the statement guarded by the ucnv_open() failure check is not
// visible in this listing -- confirm what is returned when opening fails.
151 bool UTF16ToCodepage(const string16
& utf16
,
152 const char* codepage_name
,
153 OnStringConversionError::Type on_error
,
154 std::string
* encoded
) {
157 UErrorCode status
= U_ZERO_ERROR
;
158 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
159 if (!U_SUCCESS(status
))
// The string length is narrowed to int for ConvertFromUTF16().
162 return ConvertFromUTF16(converter
, utf16
.c_str(),
163 static_cast<int>(utf16
.length()), on_error
, encoded
);
// Decodes |encoded|, interpreted in the code page named |codepage_name|, into
// UTF-16 stored in |utf16|.
// Opens a converter with ucnv_open(), installs the to-Unicode error handler
// chosen by |on_error|, converts with ucnv_toUChars() into a buffer of
// encoded.length() + 1 code units, closes the converter, then either clears
// |utf16| (conversion failed) or resizes it to the converted length.
// NOTE(review): the parameter that declares |utf16|, the statement guarded by
// the ucnv_open() failure check and the return statements are not visible in
// this listing -- confirm against the original file.
166 bool CodepageToUTF16(const std::string
& encoded
,
167 const char* codepage_name
,
168 OnStringConversionError::Type on_error
,
172 UErrorCode status
= U_ZERO_ERROR
;
173 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
174 if (!U_SUCCESS(status
))
177 // Even in the worst case, the maximum length in 2-byte units of UTF-16
178 // output would be at most the same as the number of bytes in input. There
179 // is no single-byte encoding in which a character is mapped to a
180 // non-BMP character requiring two 2-byte units.
182 // Moreover, non-BMP characters in legacy multibyte encodings
183 // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
184 // BOCU and SCSU, but we don't care about them.
// One extra code unit is reserved beyond the input byte count.
185 size_t uchar_max_length
= encoded
.length() + 1;
187 SetUpErrorHandlerForToUChars(on_error
, converter
, &status
);
// WriteInto() supplies the destination buffer inside |utf16|.
188 int actual_size
= ucnv_toUChars(converter
, WriteInto(utf16
, uchar_max_length
),
189 static_cast<int>(uchar_max_length
), encoded
.data(),
190 static_cast<int>(encoded
.length()), &status
);
191 ucnv_close(converter
);
192 if (!U_SUCCESS(status
)) {
193 utf16
->clear(); // Make sure the output is empty on error.
// Trim the buffer to the number of code units reported by ucnv_toUChars().
197 utf16
->resize(actual_size
);
// Encodes the wide string |wide| into the code page named |codepage_name| and
// stores the bytes in |encoded|.
// With WCHAR_T_IS_UTF16 this forwards directly to UTF16ToCodepage(). With
// WCHAR_T_IS_UTF32 it opens a converter, converts |wide| to a temporary UChar
// buffer with u_strFromWCS(), and returns the result of ConvertFromUTF16().
// NOTE(review): the declaration of |utf16_len| and the statement guarded by
// the ucnv_open() failure check are not visible in this listing -- confirm
// against the original file.
201 bool WideToCodepage(const std::wstring
& wide
,
202 const char* codepage_name
,
203 OnStringConversionError::Type on_error
,
204 std::string
* encoded
) {
205 #if defined(WCHAR_T_IS_UTF16)
206 return UTF16ToCodepage(wide
, codepage_name
, on_error
, encoded
);
207 #elif defined(WCHAR_T_IS_UTF32)
210 UErrorCode status
= U_ZERO_ERROR
;
211 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
212 if (!U_SUCCESS(status
))
216 // When wchar_t is wider than UChar (16 bits), transform |wide| into a
217 // UChar* string. Size the UChar* buffer to be large enough to hold twice
218 // as many UTF-16 code units (UChars) as there are Unicode code points,
219 // in case each code point translates to a UTF-16 surrogate pair,
220 // and leave room for a NUL terminator.
221 std::vector
<UChar
> utf16(wide
.length() * 2 + 1);
// u_strFromWCS() reports the converted length through |utf16_len|.
222 u_strFromWCS(&utf16
[0], utf16
.size(), &utf16_len
,
223 wide
.c_str(), wide
.length(), &status
);
// A failed wide-to-UTF-16 conversion is only checked with DCHECK here.
224 DCHECK(U_SUCCESS(status
)) << "failed to convert wstring to UChar*";
226 return ConvertFromUTF16(converter
, &utf16
[0], utf16_len
, on_error
, encoded
);
227 #endif // defined(WCHAR_T_IS_UTF32)
// Decodes |encoded|, interpreted in the code page named |codepage_name|, into
// the wide string |wide|.
// With WCHAR_T_IS_UTF16 this forwards directly to CodepageToUTF16(). With
// WCHAR_T_IS_UTF32 it opens a converter, installs the to-Unicode error handler
// chosen by |on_error|, and converts straight into the storage of |wide| with
// ucnv_toAlgorithmic() using the type from utf32_platform_endian(). The
// converter is closed afterwards, and |wide| is either cleared (conversion
// failed) or resized to the converted length.
// NOTE(review): the statement guarded by the ucnv_open() failure check and the
// return statements are not visible in this listing -- confirm against the
// original file.
230 bool CodepageToWide(const std::string
& encoded
,
231 const char* codepage_name
,
232 OnStringConversionError::Type on_error
,
233 std::wstring
* wide
) {
234 #if defined(WCHAR_T_IS_UTF16)
235 return CodepageToUTF16(encoded
, codepage_name
, on_error
, wide
);
236 #elif defined(WCHAR_T_IS_UTF32)
239 UErrorCode status
= U_ZERO_ERROR
;
240 UConverter
* converter
= ucnv_open(codepage_name
, &status
);
241 if (!U_SUCCESS(status
))
244 // The maximum length in 4-byte units of UTF-32 output would be
245 // at most the same as the number of bytes in input. In the worst
246 // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),
247 // this can be 4 times larger than actually needed.
// One extra wchar_t is reserved beyond the input byte count.
248 size_t wchar_max_length
= encoded
.length() + 1;
250 SetUpErrorHandlerForToUChars(on_error
, converter
, &status
);
// The destination is addressed as raw bytes, so the capacity passed to ICU is
// the wchar_t count multiplied by sizeof(wchar_t).
251 int actual_size
= ucnv_toAlgorithmic(utf32_platform_endian(), converter
,
252 reinterpret_cast<char*>(WriteInto(wide
, wchar_max_length
)),
253 static_cast<int>(wchar_max_length
) * sizeof(wchar_t), encoded
.data(),
254 static_cast<int>(encoded
.length()), &status
);
255 ucnv_close(converter
);
256 if (!U_SUCCESS(status
)) {
257 wide
->clear(); // Make sure the output is empty on error.
261 // actual_size is the number of bytes, so convert it to a wchar_t count.
262 wide
->resize(actual_size
/ sizeof(wchar_t));
264 #endif // defined(WCHAR_T_IS_UTF32)