// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Functions to canonicalize "standard" URLs, which are ones that have an
// authority section including a host name.
#include "url/url_canon.h"

#include <string.h>

#include "url/url_canon_internal.h"
15 template<typename CHAR
, typename UCHAR
>
16 bool DoCanonicalizeStandardURL(const URLComponentSource
<CHAR
>& source
,
17 const url_parse::Parsed
& parsed
,
18 CharsetConverter
* query_converter
,
20 url_parse::Parsed
* new_parsed
) {
21 // Scheme: this will append the colon.
22 bool success
= CanonicalizeScheme(source
.scheme
, parsed
.scheme
,
23 output
, &new_parsed
->scheme
);
25 // Authority (username, password, host, port)
27 if (parsed
.username
.is_valid() || parsed
.password
.is_valid() ||
28 parsed
.host
.is_nonempty() || parsed
.port
.is_valid()) {
29 have_authority
= true;
31 // Only write the authority separators when we have a scheme.
32 if (parsed
.scheme
.is_valid()) {
33 output
->push_back('/');
34 output
->push_back('/');
37 // User info: the canonicalizer will handle the : and @.
38 success
&= CanonicalizeUserInfo(source
.username
, parsed
.username
,
39 source
.password
, parsed
.password
,
41 &new_parsed
->username
,
42 &new_parsed
->password
);
44 success
&= CanonicalizeHost(source
.host
, parsed
.host
,
45 output
, &new_parsed
->host
);
47 // Host must not be empty for standard URLs.
48 if (!parsed
.host
.is_nonempty())
51 // Port: the port canonicalizer will handle the colon.
52 int default_port
= DefaultPortForScheme(
53 &output
->data()[new_parsed
->scheme
.begin
], new_parsed
->scheme
.len
);
54 success
&= CanonicalizePort(source
.port
, parsed
.port
, default_port
,
55 output
, &new_parsed
->port
);
57 // No authority, clear the components.
58 have_authority
= false;
59 new_parsed
->host
.reset();
60 new_parsed
->username
.reset();
61 new_parsed
->password
.reset();
62 new_parsed
->port
.reset();
63 success
= false; // Standard URLs must have an authority.
67 if (parsed
.path
.is_valid()) {
68 success
&= CanonicalizePath(source
.path
, parsed
.path
,
69 output
, &new_parsed
->path
);
70 } else if (have_authority
||
71 parsed
.query
.is_valid() || parsed
.ref
.is_valid()) {
72 // When we have an empty path, make up a path when we have an authority
73 // or something following the path. The only time we allow an empty
74 // output path is when there is nothing else.
75 new_parsed
->path
= url_parse::Component(output
->length(), 1);
76 output
->push_back('/');
79 new_parsed
->path
.reset();
83 CanonicalizeQuery(source
.query
, parsed
.query
, query_converter
,
84 output
, &new_parsed
->query
);
86 // Ref: ignore failure for this, since the page can probably still be loaded.
87 CanonicalizeRef(source
.ref
, parsed
.ref
, output
, &new_parsed
->ref
);
95 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
96 // if the scheme is unknown.
97 int DefaultPortForScheme(const char* scheme
, int scheme_len
) {
98 int default_port
= url_parse::PORT_UNSPECIFIED
;
101 if (!strncmp(scheme
, "http", scheme_len
))
105 if (!strncmp(scheme
, "https", scheme_len
))
109 if (!strncmp(scheme
, "ftp", scheme_len
))
111 else if (!strncmp(scheme
, "wss", scheme_len
))
115 if (!strncmp(scheme
, "gopher", scheme_len
))
119 if (!strncmp(scheme
, "ws", scheme_len
))
126 bool CanonicalizeStandardURL(const char* spec
,
128 const url_parse::Parsed
& parsed
,
129 CharsetConverter
* query_converter
,
131 url_parse::Parsed
* new_parsed
) {
132 return DoCanonicalizeStandardURL
<char, unsigned char>(
133 URLComponentSource
<char>(spec
), parsed
, query_converter
,
137 bool CanonicalizeStandardURL(const base::char16
* spec
,
139 const url_parse::Parsed
& parsed
,
140 CharsetConverter
* query_converter
,
142 url_parse::Parsed
* new_parsed
) {
143 return DoCanonicalizeStandardURL
<base::char16
, base::char16
>(
144 URLComponentSource
<base::char16
>(spec
), parsed
, query_converter
,
148 // It might be nice in the future to optimize this so unchanged components don't
149 // need to be recanonicalized. This is especially true since the common case for
150 // ReplaceComponents is removing things we don't want, like reference fragments
151 // and usernames. These cases can become more efficient if we can assume the
152 // rest of the URL is OK with these removed (or only the modified parts
153 // recanonicalized). This would be much more complex to implement, however.
155 // You would also need to update DoReplaceComponents in url_util.cc which
156 // relies on this re-checking everything (see the comment there for why).
157 bool ReplaceStandardURL(const char* base
,
158 const url_parse::Parsed
& base_parsed
,
159 const Replacements
<char>& replacements
,
160 CharsetConverter
* query_converter
,
162 url_parse::Parsed
* new_parsed
) {
163 URLComponentSource
<char> source(base
);
164 url_parse::Parsed
parsed(base_parsed
);
165 SetupOverrideComponents(base
, replacements
, &source
, &parsed
);
166 return DoCanonicalizeStandardURL
<char, unsigned char>(
167 source
, parsed
, query_converter
, output
, new_parsed
);
170 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
171 // regular codepath can be used.
172 bool ReplaceStandardURL(const char* base
,
173 const url_parse::Parsed
& base_parsed
,
174 const Replacements
<base::char16
>& replacements
,
175 CharsetConverter
* query_converter
,
177 url_parse::Parsed
* new_parsed
) {
178 RawCanonOutput
<1024> utf8
;
179 URLComponentSource
<char> source(base
);
180 url_parse::Parsed
parsed(base_parsed
);
181 SetupUTF16OverrideComponents(base
, replacements
, &utf8
, &source
, &parsed
);
182 return DoCanonicalizeStandardURL
<char, unsigned char>(
183 source
, parsed
, query_converter
, output
, new_parsed
);
186 } // namespace url_canon