// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Functions to canonicalize "standard" URLs, which are ones that have an
// authority section including a host name.
8 #include "url/url_canon.h"
9 #include "url/url_canon_internal.h"
10 #include "url/url_constants.h"
16 template<typename CHAR
, typename UCHAR
>
17 bool DoCanonicalizeStandardURL(const URLComponentSource
<CHAR
>& source
,
19 CharsetConverter
* query_converter
,
22 // Scheme: this will append the colon.
23 bool success
= CanonicalizeScheme(source
.scheme
, parsed
.scheme
,
24 output
, &new_parsed
->scheme
);
26 // Authority (username, password, host, port)
28 if (parsed
.username
.is_valid() || parsed
.password
.is_valid() ||
29 parsed
.host
.is_nonempty() || parsed
.port
.is_valid()) {
30 have_authority
= true;
32 // Only write the authority separators when we have a scheme.
33 if (parsed
.scheme
.is_valid()) {
34 output
->push_back('/');
35 output
->push_back('/');
38 // User info: the canonicalizer will handle the : and @.
39 success
&= CanonicalizeUserInfo(source
.username
, parsed
.username
,
40 source
.password
, parsed
.password
,
42 &new_parsed
->username
,
43 &new_parsed
->password
);
45 success
&= CanonicalizeHost(source
.host
, parsed
.host
,
46 output
, &new_parsed
->host
);
48 // Host must not be empty for standard URLs.
49 if (!parsed
.host
.is_nonempty())
52 // Port: the port canonicalizer will handle the colon.
53 int default_port
= DefaultPortForScheme(
54 &output
->data()[new_parsed
->scheme
.begin
], new_parsed
->scheme
.len
);
55 success
&= CanonicalizePort(source
.port
, parsed
.port
, default_port
,
56 output
, &new_parsed
->port
);
58 // No authority, clear the components.
59 have_authority
= false;
60 new_parsed
->host
.reset();
61 new_parsed
->username
.reset();
62 new_parsed
->password
.reset();
63 new_parsed
->port
.reset();
64 success
= false; // Standard URLs must have an authority.
68 if (parsed
.path
.is_valid()) {
69 success
&= CanonicalizePath(source
.path
, parsed
.path
,
70 output
, &new_parsed
->path
);
71 } else if (have_authority
||
72 parsed
.query
.is_valid() || parsed
.ref
.is_valid()) {
73 // When we have an empty path, make up a path when we have an authority
74 // or something following the path. The only time we allow an empty
75 // output path is when there is nothing else.
76 new_parsed
->path
= Component(output
->length(), 1);
77 output
->push_back('/');
80 new_parsed
->path
.reset();
84 CanonicalizeQuery(source
.query
, parsed
.query
, query_converter
,
85 output
, &new_parsed
->query
);
87 // Ref: ignore failure for this, since the page can probably still be loaded.
88 CanonicalizeRef(source
.ref
, parsed
.ref
, output
, &new_parsed
->ref
);
96 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
97 // if the scheme is unknown.
98 int DefaultPortForScheme(const char* scheme
, int scheme_len
) {
99 int default_port
= PORT_UNSPECIFIED
;
100 switch (scheme_len
) {
102 if (!strncmp(scheme
, kHttpScheme
, scheme_len
))
106 if (!strncmp(scheme
, kHttpsScheme
, scheme_len
))
110 if (!strncmp(scheme
, kFtpScheme
, scheme_len
))
112 else if (!strncmp(scheme
, kWssScheme
, scheme_len
))
116 if (!strncmp(scheme
, kGopherScheme
, scheme_len
))
120 if (!strncmp(scheme
, kWsScheme
, scheme_len
))
127 bool CanonicalizeStandardURL(const char* spec
,
129 const Parsed
& parsed
,
130 CharsetConverter
* query_converter
,
132 Parsed
* new_parsed
) {
133 return DoCanonicalizeStandardURL
<char, unsigned char>(
134 URLComponentSource
<char>(spec
), parsed
, query_converter
,
138 bool CanonicalizeStandardURL(const base::char16
* spec
,
140 const Parsed
& parsed
,
141 CharsetConverter
* query_converter
,
143 Parsed
* new_parsed
) {
144 return DoCanonicalizeStandardURL
<base::char16
, base::char16
>(
145 URLComponentSource
<base::char16
>(spec
), parsed
, query_converter
,
149 // It might be nice in the future to optimize this so unchanged components don't
150 // need to be recanonicalized. This is especially true since the common case for
151 // ReplaceComponents is removing things we don't want, like reference fragments
152 // and usernames. These cases can become more efficient if we can assume the
153 // rest of the URL is OK with these removed (or only the modified parts
154 // recanonicalized). This would be much more complex to implement, however.
156 // You would also need to update DoReplaceComponents in url_util.cc which
157 // relies on this re-checking everything (see the comment there for why).
158 bool ReplaceStandardURL(const char* base
,
159 const Parsed
& base_parsed
,
160 const Replacements
<char>& replacements
,
161 CharsetConverter
* query_converter
,
163 Parsed
* new_parsed
) {
164 URLComponentSource
<char> source(base
);
165 Parsed
parsed(base_parsed
);
166 SetupOverrideComponents(base
, replacements
, &source
, &parsed
);
167 return DoCanonicalizeStandardURL
<char, unsigned char>(
168 source
, parsed
, query_converter
, output
, new_parsed
);
171 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
172 // regular codepath can be used.
173 bool ReplaceStandardURL(const char* base
,
174 const Parsed
& base_parsed
,
175 const Replacements
<base::char16
>& replacements
,
176 CharsetConverter
* query_converter
,
178 Parsed
* new_parsed
) {
179 RawCanonOutput
<1024> utf8
;
180 URLComponentSource
<char> source(base
);
181 Parsed
parsed(base_parsed
);
182 SetupUTF16OverrideComponents(base
, replacements
, &utf8
, &source
, &parsed
);
183 return DoCanonicalizeStandardURL
<char, unsigned char>(
184 source
, parsed
, query_converter
, output
, new_parsed
);