1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // ICU integration functions.
10 #include "base/lazy_instance.h"
11 #include "base/logging.h"
12 #include "third_party/icu/source/common/unicode/ucnv.h"
13 #include "third_party/icu/source/common/unicode/ucnv_cb.h"
14 #include "third_party/icu/source/common/unicode/uidna.h"
15 #include "url/url_canon_icu.h"
16 #include "url/url_canon_internal.h" // for _itoa_s
22 // Called when converting a character that cannot be represented. This will
23 // append an escaped version of the numerical character reference for that code
24 // point. It is of the form "&#1234;" and we will escape the non-digits to
25 // "%26%231234%3B". Why? This is what Netscape did back in the olden days.
26 void appendURLEscapedChar(const void* context
,
27 UConverterFromUnicodeArgs
* from_args
,
28 const UChar
* code_units
,
31 UConverterCallbackReason reason
,
33 if (reason
== UCNV_UNASSIGNED
) {
36 const static int prefix_len
= 6;
37 const static char prefix
[prefix_len
+ 1] = "%26%23"; // "&#" percent-escaped
38 ucnv_cbFromUWriteBytes(from_args
, prefix
, prefix_len
, 0, err
);
40 DCHECK(code_point
< 0x110000);
41 char number
[8]; // Max Unicode code point is 7 digits.
42 _itoa_s(code_point
, number
, 10);
43 int number_len
= static_cast<int>(strlen(number
));
44 ucnv_cbFromUWriteBytes(from_args
, number
, number_len
, 0, err
);
46 const static int postfix_len
= 3;
47 const static char postfix
[postfix_len
+ 1] = "%3B"; // ";" percent-escaped
48 ucnv_cbFromUWriteBytes(from_args
, postfix
, postfix_len
, 0, err
);
52 // A class for scoping the installation of the invalid character callback.
53 class AppendHandlerInstaller
{
55 // The owner of this object must ensure that the converter is alive for the
56 // duration of this object's lifetime.
57 AppendHandlerInstaller(UConverter
* converter
) : converter_(converter
) {
58 UErrorCode err
= U_ZERO_ERROR
;
59 ucnv_setFromUCallBack(converter_
, appendURLEscapedChar
, 0,
60 &old_callback_
, &old_context_
, &err
);
63 ~AppendHandlerInstaller() {
64 UErrorCode err
= U_ZERO_ERROR
;
65 ucnv_setFromUCallBack(converter_
, old_callback_
, old_context_
, 0, 0, &err
);
69 UConverter
* converter_
;
71 UConverterFromUCallback old_callback_
;
72 const void* old_context_
;
75 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
76 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().
78 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned
79 // code points allowed) to IDNA 2008 with
80 // backward compatibility in mind. What it does:
82 // 1. Use the up-to-date Unicode data.
83 // 2. Define a case folding/mapping with the up-to-date Unicode data as
85 // 3. Use transitional mechanism for 4 deviation characters (sharp-s,
86 // final sigma, ZWJ and ZWNJ) for now.
87 // 4. Continue to allow symbols and punctuation.
88 // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDi rules.
89 // 6. Do not apply STD3 rules.
90 // 7. Do not allow unassigned code points.
92 // It also closely matches what IE 10 does except for the BiDi check (
93 // http://goo.gl/3XBhqw ).
94 // See http://unicode.org/reports/tr46/ and references therein
98 UErrorCode err
= U_ZERO_ERROR
;
99 // TODO(jungshik): Change options as different parties (browsers,
100 // registrars, search engines) converge toward a consensus.
101 value
= uidna_openUTS46(UIDNA_CHECK_BIDI
, &err
);
111 ICUCharsetConverter::ICUCharsetConverter(UConverter
* converter
)
112 : converter_(converter
) {
115 ICUCharsetConverter::~ICUCharsetConverter() {
118 void ICUCharsetConverter::ConvertFromUTF16(const base::char16
* input
,
120 CanonOutput
* output
) {
121 // Install our error handler. It will be called for characters that cannot
122 // be represented in the destination character set.
123 AppendHandlerInstaller
handler(converter_
);
125 int begin_offset
= output
->length();
126 int dest_capacity
= output
->capacity() - begin_offset
;
127 output
->set_length(output
->length());
130 UErrorCode err
= U_ZERO_ERROR
;
131 char* dest
= &output
->data()[begin_offset
];
132 int required_capacity
= ucnv_fromUChars(converter_
, dest
, dest_capacity
,
133 input
, input_len
, &err
);
134 if (err
!= U_BUFFER_OVERFLOW_ERROR
) {
135 output
->set_length(begin_offset
+ required_capacity
);
139 // Output didn't fit, expand
140 dest_capacity
= required_capacity
;
141 output
->Resize(begin_offset
+ dest_capacity
);
145 static base::LazyInstance
<UIDNAWrapper
>::Leaky
146 g_uidna
= LAZY_INSTANCE_INITIALIZER
;
148 // Converts the Unicode input representing a hostname to ASCII using IDN rules.
149 // The output must be ASCII, but is represented as wide characters.
151 // On success, the output will be filled with the ASCII host name and it will
152 // return true. Unlike most other canonicalization functions, this assumes that
153 // the output is empty. The beginning of the host will be at offset 0, and
154 // the length of the output will be set to the length of the new host name.
156 // On error, this will return false. The output in this case is undefined.
157 // TODO(jungshik): use UTF-8/ASCII version of nameToASCII.
158 // Change the function signature and callers accordingly to avoid unnecessary
159 // conversions in our code. In addition, consider using icu::IDNA's UTF-8/ASCII
160 // version with StringByteSink. That way, we can avoid C wrappers and additional
161 // string conversion.
162 bool IDNToASCII(const base::char16
* src
, int src_len
, CanonOutputW
* output
) {
163 DCHECK(output
->length() == 0); // Output buffer is assumed empty.
165 UIDNA
* uidna
= g_uidna
.Get().value
;
166 DCHECK(uidna
!= NULL
);
168 UErrorCode err
= U_ZERO_ERROR
;
169 UIDNAInfo info
= UIDNA_INFO_INITIALIZER
;
170 int output_length
= uidna_nameToASCII(uidna
, src
, src_len
, output
->data(),
171 output
->capacity(), &info
, &err
);
172 if (U_SUCCESS(err
) && info
.errors
== 0) {
173 output
->set_length(output_length
);
177 // TODO(jungshik): Look at info.errors to handle them on a case-by-case basis.
179 if (err
!= U_BUFFER_OVERFLOW_ERROR
|| info
.errors
!= 0)
180 return false; // Unknown error, give up.
182 // Not enough room in our buffer, expand.
183 output
->Resize(output_length
);
187 bool ReadUTFChar(const char* str
, int* begin
, int length
,
188 unsigned* code_point_out
) {
189 int code_point
; // Avoids warning when U8_NEXT writes -1 to it.
190 U8_NEXT(str
, *begin
, length
, code_point
);
191 *code_point_out
= static_cast<unsigned>(code_point
);
193 // The ICU macro above moves to the next char, we want to point to the last
197 // Validate the decoded value.
198 if (U_IS_UNICODE_CHAR(code_point
))
200 *code_point_out
= kUnicodeReplacementCharacter
;
204 bool ReadUTFChar(const base::char16
* str
, int* begin
, int length
,
205 unsigned* code_point
) {
206 if (U16_IS_SURROGATE(str
[*begin
])) {
207 if (!U16_IS_SURROGATE_LEAD(str
[*begin
]) || *begin
+ 1 >= length
||
208 !U16_IS_TRAIL(str
[*begin
+ 1])) {
209 // Invalid surrogate pair.
210 *code_point
= kUnicodeReplacementCharacter
;
213 // Valid surrogate pair.
214 *code_point
= U16_GET_SUPPLEMENTARY(str
[*begin
], str
[*begin
+ 1]);
218 // Not a surrogate, just one 16-bit word.
219 *code_point
= str
[*begin
];
222 if (U_IS_UNICODE_CHAR(*code_point
))
225 // Invalid code point.
226 *code_point
= kUnicodeReplacementCharacter
;
230 } // namespace url_canon