1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "url/url_canon.h"
6 #include "url/url_canon_internal.h"
8 // Query canonicalization in IE
9 // ----------------------------
10 // IE is very permissive for query parameters specified in links on the page
11 // (in contrast to links that it constructs itself based on form data). It does
12 // not unescape any character. It does not reject any escape sequence (be they
13 // invalid like "%2y" or freaky like %00).
15 // IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
16 // LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
17 // layer since they are removed from all portions of the URL). All other
18 // characters are passed unmodified. Invalid UTF-16 sequences are preserved as
19 // well, with each character in the input being converted to UTF-8. It is the
20 // server's job to make sense of this invalid query.
22 // Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
23 // are converted to the invalid character and sent as unescaped UTF-8 (0xef,
24 // 0xbf, 0xbd). This may not be canonicalization; the parser may generate these
25 // strings before the URL handler ever sees them.
27 // Our query canonicalization
28 // --------------------------
29 // We escape all non-ASCII characters and control characters, like Firefox.
30 // This is more conformant to the URL spec, and there do not seem to be many
31 // problems relating to Firefox's behavior.
33 // Like IE, we will never unescape (although the application may want to try
34 // unescaping to present the user with a more understandable URL). We will
35 // replace all invalid sequences (including invalid UTF-16 sequences, which IE
36 // doesn't) with the "invalid character," and we will escape it.
42 // Returns true if the characters starting at |begin| and going until |end|
43 // (non-inclusive) are all representable in 7-bits.
44 template<typename CHAR
, typename UCHAR
>
45 bool IsAllASCII(const CHAR
* spec
, const Component
& query
) {
46 int end
= query
.end();
47 for (int i
= query
.begin
; i
< end
; i
++) {
48 if (static_cast<UCHAR
>(spec
[i
]) >= 0x80)
54 // Appends the given string to the output, escaping characters that do not
55 // match the given |type| in SharedCharTypes. This version will accept 8 or 16
56 // bit characters, but assumes that they have only 7-bit values. It also assumes
57 // that all UTF-8 values are correct, so doesn't bother checking
58 template<typename CHAR
>
59 void AppendRaw8BitQueryString(const CHAR
* source
, int length
,
60 CanonOutput
* output
) {
61 for (int i
= 0; i
< length
; i
++) {
62 if (!IsQueryChar(static_cast<unsigned char>(source
[i
])))
63 AppendEscapedChar(static_cast<unsigned char>(source
[i
]), output
);
64 else // Doesn't need escaping.
65 output
->push_back(static_cast<char>(source
[i
]));
69 // Runs the converter on the given UTF-8 input. Since the converter expects
70 // UTF-16, we have to convert first. The converter must be non-NULL.
71 void RunConverter(const char* spec
,
72 const Component
& query
,
73 CharsetConverter
* converter
,
74 CanonOutput
* output
) {
75 // This function will replace any misencoded values with the invalid
76 // character. This is what we want so we don't have to check for error.
77 RawCanonOutputW
<1024> utf16
;
78 ConvertUTF8ToUTF16(&spec
[query
.begin
], query
.len
, &utf16
);
79 converter
->ConvertFromUTF16(utf16
.data(), utf16
.length(), output
);
82 // Runs the converter with the given UTF-16 input. We don't have to do
83 // anything, but this overridden function allows us to use the same code
84 // for both UTF-8 and UTF-16 input.
85 void RunConverter(const base::char16
* spec
,
86 const Component
& query
,
87 CharsetConverter
* converter
,
88 CanonOutput
* output
) {
89 converter
->ConvertFromUTF16(&spec
[query
.begin
], query
.len
, output
);
// Appends the |query| range of |spec| to |output| in query-encoded form.
// Three strategies are visible in the body:
//   - input that IsAllASCII() accepts goes straight to
//     AppendRaw8BitQueryString();
//   - other input is run through |converter| into a temporary 8-bit buffer,
//     and that buffer is then given to AppendRaw8BitQueryString();
//   - the "no converter" case hands the input to AppendStringOfType() with
//     CHAR_QUERY.
// NOTE(review): the lines that separate these three cases (the else/if headers
// and the closing braces) are absent from this copy of the file -- see the
// gaps in the embedded numbering at 100-101, 103, 105, 109-110 and after 112.
// The inline comments suggest an if / else-if-converter / else chain; confirm
// against the upstream file before relying on it.
92 template<typename CHAR
, typename UCHAR
>
93 void DoConvertToQueryEncoding(const CHAR
* spec
,
94 const Component
& query
,
95 CharsetConverter
* converter
,
96 CanonOutput
* output
) {
97 if (IsAllASCII
<CHAR
, UCHAR
>(spec
, query
)) {
98 // Easy: the input can just be appended with no character set conversions.
99 AppendRaw8BitQueryString(&spec
[query
.begin
], query
.len
, output
);
102 // Harder: convert to the proper encoding first.
104 // Run the converter to get an 8-bit string, then append it with escaping.
// The temporary buffer receives the converter's 8-bit result.
106 RawCanonOutput
<1024> eight_bit
;
107 RunConverter(spec
, query
, converter
, &eight_bit
);
108 AppendRaw8BitQueryString(eight_bit
.data(), eight_bit
.length(), output
);
111 // No converter, do our own UTF-8 conversion.
112 AppendStringOfType(&spec
[query
.begin
], query
.len
, CHAR_QUERY
, output
);
// Shared implementation behind the CanonicalizeQuery() overloads. The visible
// statements write a '?' to |output|, append the query-encoded form of the
// |query| range of |spec| via DoConvertToQueryEncoding(), and describe the
// appended text (excluding the '?') in |*out_query|.
// NOTE(review): this copy of the file has lost lines here. |output| is used
// below but is not among the visible parameters (embedded line 121 is
// missing), and embedded lines 123 and 125-127 around the "*out_query =
// Component()" reset are missing too. Presumably that reset belongs to a
// guarded early-exit path rather than running unconditionally -- confirm
// against the upstream file.
117 template<typename CHAR
, typename UCHAR
>
118 void DoCanonicalizeQuery(const CHAR
* spec
,
119 const Component
& query
,
120 CharsetConverter
* converter
,
122 Component
* out_query
) {
// Reset the output component to a default-constructed Component.
124 *out_query
= Component();
// Emit the query separator; it is written before |begin| is recorded, so it
// is not counted as part of the output component.
128 output
->push_back('?');
129 out_query
->begin
= output
->length();
131 DoConvertToQueryEncoding
<CHAR
, UCHAR
>(spec
, query
, converter
, output
);
// The component length is however much was appended after |begin|.
133 out_query
->len
= output
->length() - out_query
->begin
;
// Canonicalizes a query given as 8-bit characters by forwarding to
// DoCanonicalizeQuery() instantiated with CHAR = char and
// UCHAR = unsigned char.
// NOTE(review): embedded lines 141 and 144-146 are missing from this copy, so
// one parameter line and the tail of the forwarding call are not visible.
// Presumably the lost parameter is the output buffer -- confirm against the
// upstream file.
138 void CanonicalizeQuery(const char* spec
,
139 const Component
& query
,
140 CharsetConverter
* converter
,
142 Component
* out_query
) {
143 DoCanonicalizeQuery
<char, unsigned char>(spec
, query
, converter
,
// Canonicalizes a query given as UTF-16 by forwarding to
// DoCanonicalizeQuery() instantiated with both CHAR and UCHAR set to
// base::char16.
// NOTE(review): embedded lines 150 and 153-155 are missing from this copy, so
// one parameter line and the tail of the forwarding call are not visible.
// Presumably the lost parameter is the output buffer -- confirm against the
// upstream file.
147 void CanonicalizeQuery(const base::char16
* spec
,
148 const Component
& query
,
149 CharsetConverter
* converter
,
151 Component
* out_query
) {
152 DoCanonicalizeQuery
<base::char16
, base::char16
>(spec
, query
, converter
,
156 void ConvertUTF16ToQueryEncoding(const base::char16
* input
,
157 const Component
& query
,
158 CharsetConverter
* converter
,
159 CanonOutput
* output
) {
160 DoConvertToQueryEncoding
<base::char16
, base::char16
>(input
, query
,