1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/macros.h"
6 #include "testing/gtest/include/gtest/gtest.h"
7 #include "third_party/icu/source/common/unicode/ucnv.h"
8 #include "url/url_canon.h"
9 #include "url/url_canon_icu.h"
10 #include "url/url_canon_stdstring.h"
11 #include "url/url_test_utils.h"
15 using test_utils::WStringToUTF16
;
19 // Wrapper around a UConverter object that manages creation and destruction.
// Constructor: opens an ICU converter for |charset_name| with ucnv_open() and
// keeps the returned handle in converter_. The tests in this file check
// converter() for NULL right after constructing a scoper.
22 explicit UConvScoper(const char* charset_name
) {
23 UErrorCode err
= U_ZERO_ERROR
;
24 converter_
= ucnv_open(charset_name
, &err
);
// Closes the wrapped converter handle through ucnv_close().
29 ucnv_close(converter_
);
32 // Returns the converter object, may be NULL.
33 UConverter
* converter() const { return converter_
; }
// The wrapped ICU converter handle, assigned from ucnv_open() above.
36 UConverter
* converter_
;
// Verifies ICUCharsetConverter::ConvertFromUTF16 in two ways: a table of
// (input, encoding, expected bytes) cases, followed by inputs whose lengths
// sit on either side of a 16-element RawCanonOutput.
39 TEST(URLCanonIcuTest
, ICUCharsetConverter
) {
// Each case below is: wide-string input, ICU charset name, expected output.
46 {L
"Hello, world", "utf-8", "Hello, world"},
47 {L
"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
// Surrogate pair (0xd800 0xdf00); expected as one 4-byte UTF-8 sequence.
49 {L
"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
51 {L
"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
52 // Unrepresentable character in the destination set.
// The expected text carries 0x06de as "%26%231758%3B", which is the
// percent-escaped form of the numeric entity "&#1758;".
53 {L
"hello\x4f60\x06de\x597dworld", "big5",
54 "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
// Run every table case through a converter opened for that case's encoding.
57 for (size_t i
= 0; i
< arraysize(icu_cases
); i
++) {
58 UConvScoper
conv(icu_cases
[i
].encoding
);
// Fatal assertion: nothing below is meaningful without a converter.
59 ASSERT_TRUE(conv
.converter() != NULL
);
60 ICUCharsetConverter
converter(conv
.converter());
// |output| is constructed over |str|; |str| is what gets compared below.
63 StdStringCanonOutput
output(&str
);
65 base::string16
input_str(WStringToUTF16(icu_cases
[i
].input
));
66 int input_len
= static_cast<int>(input_str
.length());
67 converter
.ConvertFromUTF16(input_str
.c_str(), input_len
, &output
);
// Compare the collected bytes with the expected string for this case.
70 EXPECT_STREQ(icu_cases
[i
].expected
, str
.c_str());
73 // Test string sizes around the resize boundary for the output to make sure
74 // the converter resizes as needed.
75 const int static_size
= 16;
76 UConvScoper
conv("utf-8");
77 ASSERT_TRUE(conv
.converter());
78 ICUCharsetConverter
converter(conv
.converter());
// Input lengths run from static_size - 2 to static_size + 2 (14 through 18).
79 for (int i
= static_size
- 2; i
<= static_size
+ 2; i
++) {
80 // Make a string with the appropriate length.
82 for (int ch
= 0; ch
< i
; ch
++)
// Output buffer whose template argument is static_size.
85 RawCanonOutput
<static_size
> output
;
86 converter
.ConvertFromUTF16(input
.c_str(), static_cast<int>(input
.length()),
// The output length is expected to equal the input length.
88 EXPECT_EQ(input
.length(), static_cast<size_t>(output
.length()));
// Runs CanonicalizeQuery over a table of cases with a charset converter, once
// for the 8-bit input and once for the 16-bit input of each case, and compares
// the result with the expected query string. Ends with a separate check on an
// 8-bit input that contains an embedded zero byte.
92 TEST(URLCanonIcuTest
, QueryWithConverter
) {
95 const wchar_t* input16
;
// Each case below is: 8-bit input, wide input, charset name, expected output.
99 // Regular ASCII case in some different encodings.
100 {"foo=bar", L
"foo=bar", "utf-8", "?foo=bar"},
101 {"foo=bar", L
"foo=bar", "shift_jis", "?foo=bar"},
102 {"foo=bar", L
"foo=bar", "gb2312", "?foo=bar"},
103 // Chinese input/output
104 {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L
"q=\x4f60\x597d", "gb2312",
106 {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L
"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
107 // Unencodable character in the destination character set should be
108 // escaped. The escape sequence unescapes to be the entity name:
// here "%26%2365319%3B" is the percent-escaped form of "&#65319;" (0xff27).
110 {"q=Chinese\xef\xbc\xa7", L
"q=Chinese\xff27", "iso-8859-1",
111 "?q=Chinese%26%2365319%3B"},
// Run both input forms of every case through the same converter.
114 for (size_t i
= 0; i
< arraysize(query_cases
); i
++) {
117 UConvScoper
conv(query_cases
[i
].encoding
);
// A missing converter is tolerated only when the case names no encoding.
118 ASSERT_TRUE(!query_cases
[i
].encoding
|| conv
.converter());
119 ICUCharsetConverter
converter(conv
.converter());
// 8-bit path: the component spans the whole input (0 .. strlen).
121 if (query_cases
[i
].input8
) {
122 int len
= static_cast<int>(strlen(query_cases
[i
].input8
));
123 Component
in_comp(0, len
);
126 StdStringCanonOutput
output(&out_str
);
127 CanonicalizeQuery(query_cases
[i
].input8
, in_comp
, &converter
, &output
,
131 EXPECT_EQ(query_cases
[i
].expected
, out_str
);
// 16-bit path: same case, with the wide input passed through WStringToUTF16.
134 if (query_cases
[i
].input16
) {
135 base::string16
input16(WStringToUTF16(query_cases
[i
].input16
));
136 int len
= static_cast<int>(input16
.length());
137 Component
in_comp(0, len
);
140 StdStringCanonOutput
output(&out_str
);
141 CanonicalizeQuery(input16
.c_str(), in_comp
, &converter
, &output
,
145 EXPECT_EQ(query_cases
[i
].expected
, out_str
);
149 // Extra test for input with an embedded NUL byte. The component length is
// given explicitly as 5 so the bytes after the NUL are included, and NULL is
// passed in the converter position.
151 StdStringCanonOutput
output(&out_str
);
153 CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL
, &output
, &out_comp
);
// The space, the zero byte and 0x01 are all expected to come out %-escaped.
155 EXPECT_EQ("?a%20%00z%01", out_str
);