url/url_canon_icu_unittest.cc

   1 // Copyright 2014 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/macros.h"
   6 #include "testing/gtest/include/gtest/gtest.h"
   7 #include "third_party/icu/source/common/unicode/ucnv.h"
   8 #include "url/url_canon.h"
   9 #include "url/url_canon_icu.h"
  10 #include "url/url_canon_stdstring.h"
  11 #include "url/url_test_utils.h"
  12
  13 namespace url {
  14
  15 using test_utils::WStringToUTF16;
  16
  17 namespace {
  18
  19 // Wrapper around a UConverter object that managers creation and destruction.
  20 class UConvScoper {
  21  public:
  22   explicit UConvScoper(const char* charset_name) {
  23     UErrorCode err = U_ZERO_ERROR;
  24     converter_ = ucnv_open(charset_name, &err);
  25   }
  26
  27   ~UConvScoper() {
  28     if (converter_)
  29       ucnv_close(converter_);
  30   }
  31
  32   // Returns the converter object, may be NULL.
  33   UConverter* converter() const { return converter_; }
  34
  35  private:
  36   UConverter* converter_;
  37 };
  38
  39 TEST(URLCanonIcuTest, ICUCharsetConverter) {
  40   struct ICUCase {
  41     const wchar_t* input;
  42     const char* encoding;
  43     const char* expected;
  44   } icu_cases[] = {
  45       // UTF-8.
  46     {L"Hello, world", "utf-8", "Hello, world"},
  47     {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
  48       // Non-BMP UTF-8.
  49     {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
  50       // Big5
  51     {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
  52       // Unrepresentable character in the destination set.
  53     {L"hello\x4f60\x06de\x597dworld", "big5",
  54       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
  55   };
  56
  57   for (size_t i = 0; i < arraysize(icu_cases); i++) {
  58     UConvScoper conv(icu_cases[i].encoding);
  59     ASSERT_TRUE(conv.converter() != NULL);
  60     ICUCharsetConverter converter(conv.converter());
  61
  62     std::string str;
  63     StdStringCanonOutput output(&str);
  64
  65     base::string16 input_str(WStringToUTF16(icu_cases[i].input));
  66     int input_len = static_cast<int>(input_str.length());
  67     converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
  68     output.Complete();
  69
  70     EXPECT_STREQ(icu_cases[i].expected, str.c_str());
  71   }
  72
  73   // Test string sizes around the resize boundary for the output to make sure
  74   // the converter resizes as needed.
  75   const int static_size = 16;
  76   UConvScoper conv("utf-8");
  77   ASSERT_TRUE(conv.converter());
  78   ICUCharsetConverter converter(conv.converter());
  79   for (int i = static_size - 2; i <= static_size + 2; i++) {
  80     // Make a string with the appropriate length.
  81     base::string16 input;
  82     for (int ch = 0; ch < i; ch++)
  83       input.push_back('a');
  84
  85     RawCanonOutput<static_size> output;
  86     converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
  87                                &output);
  88     EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
  89   }
  90 }
  91
  92 TEST(URLCanonIcuTest, QueryWithConverter) {
  93   struct QueryCase {
  94     const char* input8;
  95     const wchar_t* input16;
  96     const char* encoding;
  97     const char* expected;
  98   } query_cases[] = {
  99       // Regular ASCII case in some different encodings.
 100     {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
 101     {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
 102     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
 103       // Chinese input/output
 104     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
 105       "?q=%C4%E3%BA%C3"},
 106     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
 107       // Unencodable character in the destination character set should be
 108       // escaped. The escape sequence unescapes to be the entity name:
 109       // "?q=&#20320;"
 110     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
 111       "?q=Chinese%26%2365319%3B"},
 112   };
 113
 114   for (size_t i = 0; i < arraysize(query_cases); i++) {
 115     Component out_comp;
 116
 117     UConvScoper conv(query_cases[i].encoding);
 118     ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
 119     ICUCharsetConverter converter(conv.converter());
 120
 121     if (query_cases[i].input8) {
 122       int len = static_cast<int>(strlen(query_cases[i].input8));
 123       Component in_comp(0, len);
 124       std::string out_str;
 125
 126       StdStringCanonOutput output(&out_str);
 127       CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
 128                         &out_comp);
 129       output.Complete();
 130
 131       EXPECT_EQ(query_cases[i].expected, out_str);
 132     }
 133
 134     if (query_cases[i].input16) {
 135       base::string16 input16(WStringToUTF16(query_cases[i].input16));
 136       int len = static_cast<int>(input16.length());
 137       Component in_comp(0, len);
 138       std::string out_str;
 139
 140       StdStringCanonOutput output(&out_str);
 141       CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
 142                         &out_comp);
 143       output.Complete();
 144
 145       EXPECT_EQ(query_cases[i].expected, out_str);
 146     }
 147   }
 148
 149   // Extra test for input with embedded NULL;
 150   std::string out_str;
 151   StdStringCanonOutput output(&out_str);
 152   Component out_comp;
 153   CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
 154   output.Complete();
 155   EXPECT_EQ("?a%20%00z%01", out_str);
 156 }
 157
 158 }  // namespace
 159
 160 }  // namespace url