1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // ICU integration functions.
10 #include "base/logging.h"
11 #include "third_party/icu/source/common/unicode/ucnv.h"
12 #include "third_party/icu/source/common/unicode/ucnv_cb.h"
13 #include "third_party/icu/source/common/unicode/uidna.h"
14 #include "url/url_canon_icu.h"
15 #include "url/url_canon_internal.h" // for _itoa_s
21 // Called when converting a character that can not be represented, this will
22 // append an escaped version of the numerical character reference for that code
23 // point. It is of the form "&#1234;" and we will escape the non-digits to
24 // "%26%231234%3B". Why? This is what Netscape did back in the olden days.
// Converter callback. When `reason` is UCNV_UNASSIGNED it writes three byte
// runs to `from_args` through ucnv_cbFromUWriteBytes: the prefix "%26%23",
// the decimal digits of `code_point`, and the postfix "%3B".
//
// NOTE(review): `code_point` and `err` are used below but are not declared in
// the parameter list visible here, and the closing braces of the `if` and of
// the function are absent. This listing appears to have dropped lines --
// confirm against the original file before relying on it.
25 void appendURLEscapedChar(const void* context
,
26 UConverterFromUnicodeArgs
* from_args
,
27 const UChar
* code_units
,
30 UConverterCallbackReason reason
,
// Only the UCNV_UNASSIGNED reason is handled by the code visible here.
32 if (reason
== UCNV_UNASSIGNED
) {
// The arrays are sized length + 1 so the string literal's terminating NUL
// fits; only `prefix_len` / `postfix_len` bytes are passed to the writer.
35 const static int prefix_len
= 6;
36 const static char prefix
[prefix_len
+ 1] = "%26%23"; // "&#" percent-escaped
37 ucnv_cbFromUWriteBytes(from_args
, prefix
, prefix_len
, 0, err
);
39 DCHECK(code_point
< 0x110000);
// 8 bytes: up to 7 decimal digits plus the terminating NUL that strlen()
// below relies on.
40 char number
[8]; // Max Unicode code point is 7 digits.
// Format the code point as text; the last argument (10) is the radix.
41 _itoa_s(code_point
, number
, 10);
42 int number_len
= static_cast<int>(strlen(number
));
43 ucnv_cbFromUWriteBytes(from_args
, number
, number_len
, 0, err
);
45 const static int postfix_len
= 3;
46 const static char postfix
[postfix_len
+ 1] = "%3B"; // ";" percent-escaped
47 ucnv_cbFromUWriteBytes(from_args
, postfix
, postfix_len
, 0, err
);
51 // A class for scoping the installation of the invalid character callback.
// The constructor installs appendURLEscapedChar as the converter's
// from-Unicode callback and saves the previously installed callback and
// context; the destructor installs that saved pair again.
//
// NOTE(review): access specifiers and the closing braces of the constructor,
// destructor and class are not visible in this listing -- confirm against the
// original file.
52 class AppendHandlerInstaller
{
54 // The owner of this object must ensure that the converter is alive for the
55 // duration of this object's lifetime.
56 AppendHandlerInstaller(UConverter
* converter
) : converter_(converter
) {
57 UErrorCode err
= U_ZERO_ERROR
;
// Passes 0 as the new callback's context and receives the previous
// callback/context through the two out-pointers.
58 ucnv_setFromUCallBack(converter_
, appendURLEscapedChar
, 0,
59 &old_callback_
, &old_context_
, &err
);
62 ~AppendHandlerInstaller() {
63 UErrorCode err
= U_ZERO_ERROR
;
// Re-installs the saved callback and context; the two 0 arguments mean the
// callback/context being replaced are not captured.
64 ucnv_setFromUCallBack(converter_
, old_callback_
, old_context_
, 0, 0, &err
);
// Converter whose callback is swapped; stored as a raw pointer.
68 UConverter
* converter_
;
// Callback and context that were installed before the constructor ran.
70 UConverterFromUCallback old_callback_
;
71 const void* old_context_
;
// Stores the given `converter` pointer in converter_.
// NOTE(review): the end of the constructor body is not visible in this
// listing.
76 ICUCharsetConverter::ICUCharsetConverter(UConverter
* converter
)
77 : converter_(converter
) {
// Destructor. NOTE(review): its body and closing brace are not visible in
// this listing.
80 ICUCharsetConverter::~ICUCharsetConverter() {
// Converts `input` with converter_ via ucnv_fromUChars and places the result
// in `output`, starting at the length `output` had on entry.
//
// NOTE(review): `input_len` is used below but is not declared in the visible
// parameter list, and the loop construct, `return` statement and closing
// braces that the retry path implies are not visible in this listing --
// confirm against the original file.
83 void ICUCharsetConverter::ConvertFromUTF16(const base::char16
* input
,
85 CanonOutput
* output
) {
86 // Install our error handler. It will be called for any character that can not
87 // be represented in the destination character set.
88 AppendHandlerInstaller
handler(converter_
);
// New bytes go after whatever `output` already holds; the space available
// for them is the current capacity minus that offset.
90 int begin_offset
= output
->length();
91 int dest_capacity
= output
->capacity() - begin_offset
;
// Sets the length to its current value.
// NOTE(review): the purpose of this call is not evident from this code.
92 output
->set_length(output
->length());
95 UErrorCode err
= U_ZERO_ERROR
;
96 char* dest
= &output
->data()[begin_offset
];
97 int required_capacity
= ucnv_fromUChars(converter_
, dest
, dest_capacity
,
98 input
, input_len
, &err
);
// Any status other than buffer overflow ends the attempt: the output length
// becomes the starting offset plus the value ucnv_fromUChars returned.
99 if (err
!= U_BUFFER_OVERFLOW_ERROR
) {
100 output
->set_length(begin_offset
+ required_capacity
);
104 // Output didn't fit, expand
// The value returned by ucnv_fromUChars becomes the new destination capacity.
105 dest_capacity
= required_capacity
;
106 output
->Resize(begin_offset
+ dest_capacity
);
110 // Converts the Unicode input representing a hostname to ASCII using IDN rules.
111 // The output must be ASCII, but is represented as wide characters.
113 // On success, the output will be filled with the ASCII host name and it will
114 // return true. Unlike most other canonicalization functions, this assumes that
115 // the output is empty. The beginning of the host will be at offset 0, and
116 // the length of the output will be set to the length of the new host name.
118 // On error, this will return false. The output in this case is undefined.
// NOTE(review): original line 127 (between `output->data(),` and
// `UIDNA_ALLOW_UNASSIGNED`), the retry loop, the success `return` and the
// closing braces are not visible in this listing -- confirm against the
// original file.
119 bool IDNToASCII(const base::char16
* src
, int src_len
, CanonOutputW
* output
) {
120 DCHECK(output
->length() == 0); // Output buffer is assumed empty.
122 // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
123 // the spec (which do exist). This does not present any risk and is a
124 // little more future proof.
125 UErrorCode err
= U_ZERO_ERROR
;
126 int num_converted
= uidna_IDNToASCII(src
, src_len
, output
->data(),
128 UIDNA_ALLOW_UNASSIGNED
, NULL
, &err
);
// Only an exact U_ZERO_ERROR status takes this branch; the value returned by
// uidna_IDNToASCII becomes the output length.
129 if (err
== U_ZERO_ERROR
) {
130 output
->set_length(num_converted
);
// Every remaining status except buffer overflow is reported as failure.
133 if (err
!= U_BUFFER_OVERFLOW_ERROR
)
134 return false; // Unknown error, give up.
136 // Not enough room in our buffer, expand.
// Doubles the current capacity.
137 output
->Resize(output
->capacity() * 2);
// Reads one code point from `str` with ICU's U8_NEXT macro, which is given
// `*begin` as the position and `length` as the limit, and stores the result,
// cast to unsigned, in *code_point_out.
//
// NOTE(review): the statement guarded by the U_IS_UNICODE_CHAR check, the
// adjustment of *begin that the comment below refers to, the `return`
// statements and the closing brace are not visible in this listing --
// confirm against the original file.
141 bool ReadUTFChar(const char* str
, int* begin
, int length
,
142 unsigned* code_point_out
) {
143 int code_point
; // Avoids warning when U8_NEXT writes -1 to it.
144 U8_NEXT(str
, *begin
, length
, code_point
);
145 *code_point_out
= static_cast<unsigned>(code_point
);
147 // The ICU macro above moves to the next char, we want to point to the last
// NOTE(review): the rest of the sentence above and the statement it describes
// are missing from this listing.
151 // Validate the decoded value.
152 if (U_IS_UNICODE_CHAR(code_point
))
// NOTE(review): the statement guarded by the `if` above is missing here; the
// assignment below is presumably the fall-through path for values that fail
// the check -- verify against the original file.
154 *code_point_out
= kUnicodeReplacementCharacter
;
// Reads one code point from the 16-bit buffer `str` at index *begin into
// *code_point:
//  - a surrogate that is not a lead surrogate, that is the last unit
//    (*begin + 1 >= length), or that is not followed by a trail surrogate
//    yields kUnicodeReplacementCharacter;
//  - a lead/trail pair is combined with U16_GET_SUPPLEMENTARY;
//  - a non-surrogate unit is copied as-is.
// kUnicodeReplacementCharacter is also assigned on the "invalid code point"
// path at the end.
//
// NOTE(review): the `else` branches, any advancement of *begin, the `return`
// statements and the closing braces are not visible in this listing --
// confirm against the original file.
158 bool ReadUTFChar(const base::char16
* str
, int* begin
, int length
,
159 unsigned* code_point
) {
160 if (U16_IS_SURROGATE(str
[*begin
])) {
// The bounds test (*begin + 1 >= length) comes before the read of
// str[*begin + 1], so short-circuit evaluation skips that read when no
// following unit exists.
161 if (!U16_IS_SURROGATE_LEAD(str
[*begin
]) || *begin
+ 1 >= length
||
162 !U16_IS_TRAIL(str
[*begin
+ 1])) {
163 // Invalid surrogate pair.
164 *code_point
= kUnicodeReplacementCharacter
;
167 // Valid surrogate pair.
168 *code_point
= U16_GET_SUPPLEMENTARY(str
[*begin
], str
[*begin
+ 1]);
172 // Not a surrogate, just one 16-bit word.
173 *code_point
= str
[*begin
];
// NOTE(review): the statement guarded by this check is missing from the
// listing.
176 if (U_IS_UNICODE_CHAR(*code_point
))
179 // Invalid code point.
180 *code_point
= kUnicodeReplacementCharacter
;
184 } // namespace url_canon