url/url_canon_stdurl.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Functions to canonicalize "standard" URLs, which are ones that have an
   6 // authority section including a host name.
   7
   8 #include "url/url_canon.h"
   9 #include "url/url_canon_internal.h"
  10
  11 namespace url_canon {
  12
  13 namespace {
  14
  15 template<typename CHAR, typename UCHAR>
  16 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
  17                                const url_parse::Parsed& parsed,
  18                                CharsetConverter* query_converter,
  19                                CanonOutput* output,
  20                                url_parse::Parsed* new_parsed) {
  21   // Scheme: this will append the colon.
  22   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
  23                                     output, &new_parsed->scheme);
  24
  25   // Authority (username, password, host, port)
  26   bool have_authority;
  27   if (parsed.username.is_valid() || parsed.password.is_valid() ||
  28       parsed.host.is_nonempty() || parsed.port.is_valid()) {
  29     have_authority = true;
  30
  31     // Only write the authority separators when we have a scheme.
  32     if (parsed.scheme.is_valid()) {
  33       output->push_back('/');
  34       output->push_back('/');
  35     }
  36
  37     // User info: the canonicalizer will handle the : and @.
  38     success &= CanonicalizeUserInfo(source.username, parsed.username,
  39                                     source.password, parsed.password,
  40                                     output,
  41                                     &new_parsed->username,
  42                                     &new_parsed->password);
  43
  44     success &= CanonicalizeHost(source.host, parsed.host,
  45                                 output, &new_parsed->host);
  46
  47     // Host must not be empty for standard URLs.
  48     if (!parsed.host.is_nonempty())
  49       success = false;
  50
  51     // Port: the port canonicalizer will handle the colon.
  52     int default_port = DefaultPortForScheme(
  53         &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
  54     success &= CanonicalizePort(source.port, parsed.port, default_port,
  55                                 output, &new_parsed->port);
  56   } else {
  57     // No authority, clear the components.
  58     have_authority = false;
  59     new_parsed->host.reset();
  60     new_parsed->username.reset();
  61     new_parsed->password.reset();
  62     new_parsed->port.reset();
  63     success = false;  // Standard URLs must have an authority.
  64   }
  65
  66   // Path
  67   if (parsed.path.is_valid()) {
  68     success &= CanonicalizePath(source.path, parsed.path,
  69                                 output, &new_parsed->path);
  70   } else if (have_authority ||
  71              parsed.query.is_valid() || parsed.ref.is_valid()) {
  72     // When we have an empty path, make up a path when we have an authority
  73     // or something following the path. The only time we allow an empty
  74     // output path is when there is nothing else.
  75     new_parsed->path = url_parse::Component(output->length(), 1);
  76     output->push_back('/');
  77   } else {
  78     // No path at all
  79     new_parsed->path.reset();
  80   }
  81
  82   // Query
  83   CanonicalizeQuery(source.query, parsed.query, query_converter,
  84                     output, &new_parsed->query);
  85
  86   // Ref: ignore failure for this, since the page can probably still be loaded.
  87   CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
  88
  89   return success;
  90 }
  91
  92 }  // namespace
  93
  94
  95 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
  96 // if the scheme is unknown.
  97 int DefaultPortForScheme(const char* scheme, int scheme_len) {
  98   int default_port = url_parse::PORT_UNSPECIFIED;
  99   switch (scheme_len) {
 100     case 4:
 101       if (!strncmp(scheme, "http", scheme_len))
 102         default_port = 80;
 103       break;
 104     case 5:
 105       if (!strncmp(scheme, "https", scheme_len))
 106         default_port = 443;
 107       break;
 108     case 3:
 109       if (!strncmp(scheme, "ftp", scheme_len))
 110         default_port = 21;
 111       else if (!strncmp(scheme, "wss", scheme_len))
 112         default_port = 443;
 113       break;
 114     case 6:
 115       if (!strncmp(scheme, "gopher", scheme_len))
 116         default_port = 70;
 117       break;
 118     case 2:
 119       if (!strncmp(scheme, "ws", scheme_len))
 120         default_port = 80;
 121       break;
 122   }
 123   return default_port;
 124 }
 125
 126 bool CanonicalizeStandardURL(const char* spec,
 127                              int spec_len,
 128                              const url_parse::Parsed& parsed,
 129                              CharsetConverter* query_converter,
 130                              CanonOutput* output,
 131                              url_parse::Parsed* new_parsed) {
 132   return DoCanonicalizeStandardURL<char, unsigned char>(
 133       URLComponentSource<char>(spec), parsed, query_converter,
 134       output, new_parsed);
 135 }
 136
 137 bool CanonicalizeStandardURL(const base::char16* spec,
 138                              int spec_len,
 139                              const url_parse::Parsed& parsed,
 140                              CharsetConverter* query_converter,
 141                              CanonOutput* output,
 142                              url_parse::Parsed* new_parsed) {
 143   return DoCanonicalizeStandardURL<base::char16, base::char16>(
 144       URLComponentSource<base::char16>(spec), parsed, query_converter,
 145       output, new_parsed);
 146 }
 147
 148 // It might be nice in the future to optimize this so unchanged components don't
 149 // need to be recanonicalized. This is especially true since the common case for
 150 // ReplaceComponents is removing things we don't want, like reference fragments
 151 // and usernames. These cases can become more efficient if we can assume the
 152 // rest of the URL is OK with these removed (or only the modified parts
 153 // recanonicalized). This would be much more complex to implement, however.
 154 //
 155 // You would also need to update DoReplaceComponents in url_util.cc which
 156 // relies on this re-checking everything (see the comment there for why).
 157 bool ReplaceStandardURL(const char* base,
 158                         const url_parse::Parsed& base_parsed,
 159                         const Replacements<char>& replacements,
 160                         CharsetConverter* query_converter,
 161                         CanonOutput* output,
 162                         url_parse::Parsed* new_parsed) {
 163   URLComponentSource<char> source(base);
 164   url_parse::Parsed parsed(base_parsed);
 165   SetupOverrideComponents(base, replacements, &source, &parsed);
 166   return DoCanonicalizeStandardURL<char, unsigned char>(
 167       source, parsed, query_converter, output, new_parsed);
 168 }
 169
 170 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
 171 // regular codepath can be used.
 172 bool ReplaceStandardURL(const char* base,
 173                         const url_parse::Parsed& base_parsed,
 174                         const Replacements<base::char16>& replacements,
 175                         CharsetConverter* query_converter,
 176                         CanonOutput* output,
 177                         url_parse::Parsed* new_parsed) {
 178   RawCanonOutput<1024> utf8;
 179   URLComponentSource<char> source(base);
 180   url_parse::Parsed parsed(base_parsed);
 181   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
 182   return DoCanonicalizeStandardURL<char, unsigned char>(
 183       source, parsed, query_converter, output, new_parsed);
 184 }
 185
 186 }  // namespace url_canon