extensions/common/url_pattern.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "extensions/common/url_pattern.h"
   6
   7 #include <ostream>
   8
   9 #include "base/strings/pattern.h"
  10 #include "base/strings/string_number_conversions.h"
  11 #include "base/strings/string_piece.h"
  12 #include "base/strings/string_split.h"
  13 #include "base/strings/string_util.h"
  14 #include "base/strings/stringprintf.h"
  15 #include "content/public/common/url_constants.h"
  16 #include "extensions/common/constants.h"
  17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
  18 #include "url/gurl.h"
  19 #include "url/url_util.h"
  20
  21 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
  22
  23 namespace {
  24
  25 // TODO(aa): What about more obscure schemes like data: and javascript: ?
  26 // Note: keep this array in sync with kValidSchemeMasks.
  27 const char* kValidSchemes[] = {
  28     url::kHttpScheme,
  29     url::kHttpsScheme,
  30     url::kFileScheme,
  31     url::kFtpScheme,
  32     content::kChromeUIScheme,
  33     extensions::kExtensionScheme,
  34     url::kFileSystemScheme,
  35 };
  36
  37 const int kValidSchemeMasks[] = {
  38   URLPattern::SCHEME_HTTP,
  39   URLPattern::SCHEME_HTTPS,
  40   URLPattern::SCHEME_FILE,
  41   URLPattern::SCHEME_FTP,
  42   URLPattern::SCHEME_CHROMEUI,
  43   URLPattern::SCHEME_EXTENSION,
  44   URLPattern::SCHEME_FILESYSTEM,
  45 };
  46
  47 static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
  48               "must keep these arrays in sync");
  49
  50 const char kParseSuccess[] = "Success.";
  51 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
  52 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
  53 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
  54 const char kParseErrorEmptyHost[] = "Host can not be empty.";
  55 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
  56 const char kParseErrorEmptyPath[] = "Empty path.";
  57 const char kParseErrorInvalidPort[] = "Invalid port.";
  58 const char kParseErrorInvalidHost[] = "Invalid host.";
  59
  60 // Message explaining each URLPattern::ParseResult.
  61 const char* const kParseResultMessages[] = {
  62   kParseSuccess,
  63   kParseErrorMissingSchemeSeparator,
  64   kParseErrorInvalidScheme,
  65   kParseErrorWrongSchemeType,
  66   kParseErrorEmptyHost,
  67   kParseErrorInvalidHostWildcard,
  68   kParseErrorEmptyPath,
  69   kParseErrorInvalidPort,
  70   kParseErrorInvalidHost,
  71 };
  72
  73 static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
  74               "must add message for each parse result");
  75
  76 const char kPathSeparator[] = "/";
  77
  78 bool IsStandardScheme(const std::string& scheme) {
  79   // "*" gets the same treatment as a standard scheme.
  80   if (scheme == "*")
  81     return true;
  82
  83   return url::IsStandard(scheme.c_str(),
  84                          url::Component(0, static_cast<int>(scheme.length())));
  85 }
  86
  87 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
  88   if (port == "*")
  89     return true;
  90
  91   // Only accept non-wildcard ports if the scheme uses ports.
  92   if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
  93       url::PORT_UNSPECIFIED) {
  94     return false;
  95   }
  96
  97   int parsed_port = url::PORT_UNSPECIFIED;
  98   if (!base::StringToInt(port, &parsed_port))
  99     return false;
 100   return (parsed_port >= 0) && (parsed_port < 65536);
 101 }
 102
 103 // Returns |path| with the trailing wildcard stripped if one existed.
 104 //
 105 // The functions that rely on this (OverlapsWith and Contains) are only
 106 // called for the patterns inside URLPatternSet. In those cases, we know that
 107 // the path will have only a single wildcard at the end. This makes figuring
 108 // out overlap much easier. It seems like there is probably a computer-sciency
 109 // way to solve the general case, but we don't need that yet.
 110 std::string StripTrailingWildcard(const std::string& path) {
 111   size_t wildcard_index = path.find('*');
 112   size_t path_last = path.size() - 1;
 113   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
 114   return wildcard_index == path_last ? path.substr(0, path_last) : path;
 115 }
 116
 117 }  // namespace
 118
 119 // static
 120 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
 121   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 122     if (scheme == kValidSchemes[i])
 123       return true;
 124   }
 125   return false;
 126 }
 127
 128 URLPattern::URLPattern()
 129     : valid_schemes_(SCHEME_NONE),
 130       match_all_urls_(false),
 131       match_subdomains_(false),
 132       port_("*") {}
 133
 134 URLPattern::URLPattern(int valid_schemes)
 135     : valid_schemes_(valid_schemes),
 136       match_all_urls_(false),
 137       match_subdomains_(false),
 138       port_("*") {}
 139
 140 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
 141     // Strict error checking is used, because this constructor is only
 142     // appropriate when we know |pattern| is valid.
 143     : valid_schemes_(valid_schemes),
 144       match_all_urls_(false),
 145       match_subdomains_(false),
 146       port_("*") {
 147   ParseResult result = Parse(pattern);
 148   if (PARSE_SUCCESS != result)
 149     NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
 150 }
 151
 152 URLPattern::~URLPattern() {
 153 }
 154
 155 bool URLPattern::operator<(const URLPattern& other) const {
 156   return GetAsString() < other.GetAsString();
 157 }
 158
 159 bool URLPattern::operator>(const URLPattern& other) const {
 160   return GetAsString() > other.GetAsString();
 161 }
 162
 163 bool URLPattern::operator==(const URLPattern& other) const {
 164   return GetAsString() == other.GetAsString();
 165 }
 166
 167 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
 168   return out << '"' << url_pattern.GetAsString() << '"';
 169 }
 170
 171 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
 172   spec_.clear();
 173   SetMatchAllURLs(false);
 174   SetMatchSubdomains(false);
 175   SetPort("*");
 176
 177   // Special case pattern to match every valid URL.
 178   if (pattern == kAllUrlsPattern) {
 179     SetMatchAllURLs(true);
 180     return PARSE_SUCCESS;
 181   }
 182
 183   // Parse out the scheme.
 184   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
 185   bool has_standard_scheme_separator = true;
 186
 187   // Some urls also use ':' alone as the scheme separator.
 188   if (scheme_end_pos == std::string::npos) {
 189     scheme_end_pos = pattern.find(':');
 190     has_standard_scheme_separator = false;
 191   }
 192
 193   if (scheme_end_pos == std::string::npos)
 194     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
 195
 196   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
 197     return PARSE_ERROR_INVALID_SCHEME;
 198
 199   bool standard_scheme = IsStandardScheme(scheme_);
 200   if (standard_scheme != has_standard_scheme_separator)
 201     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
 202
 203   // Advance past the scheme separator.
 204   scheme_end_pos +=
 205       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
 206   if (scheme_end_pos >= pattern.size())
 207     return PARSE_ERROR_EMPTY_HOST;
 208
 209   // Parse out the host and path.
 210   size_t host_start_pos = scheme_end_pos;
 211   size_t path_start_pos = 0;
 212
 213   if (!standard_scheme) {
 214     path_start_pos = host_start_pos;
 215   } else if (scheme_ == url::kFileScheme) {
 216     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 217     if (host_end_pos == std::string::npos) {
 218       // Allow hostname omission.
 219       // e.g. file://* is interpreted as file:///*,
 220       // file://foo* is interpreted as file:///foo*.
 221       path_start_pos = host_start_pos - 1;
 222     } else {
 223       // Ignore hostname if scheme is file://.
 224       // e.g. file://localhost/foo is equal to file:///foo.
 225       path_start_pos = host_end_pos;
 226     }
 227   } else {
 228     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 229
 230     // Host is required.
 231     if (host_start_pos == host_end_pos)
 232       return PARSE_ERROR_EMPTY_HOST;
 233
 234     if (host_end_pos == std::string::npos)
 235       return PARSE_ERROR_EMPTY_PATH;
 236
 237     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
 238
 239     // The first component can optionally be '*' to match all subdomains.
 240     std::vector<std::string> host_components = base::SplitString(
 241         host_, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
 242
 243     // Could be empty if the host only consists of whitespace characters.
 244     if (host_components.empty() ||
 245         (host_components.size() == 1 && host_components[0].empty()))
 246       return PARSE_ERROR_EMPTY_HOST;
 247
 248     if (host_components[0] == "*") {
 249       match_subdomains_ = true;
 250       host_components.erase(host_components.begin(),
 251                             host_components.begin() + 1);
 252     }
 253     host_ = base::JoinString(host_components, ".");
 254
 255     path_start_pos = host_end_pos;
 256   }
 257
 258   SetPath(pattern.substr(path_start_pos));
 259
 260   size_t port_pos = host_.find(':');
 261   if (port_pos != std::string::npos) {
 262     if (!SetPort(host_.substr(port_pos + 1)))
 263       return PARSE_ERROR_INVALID_PORT;
 264     host_ = host_.substr(0, port_pos);
 265   }
 266
 267   // No other '*' can occur in the host, though. This isn't necessary, but is
 268   // done as a convenience to developers who might otherwise be confused and
 269   // think '*' works as a glob in the host.
 270   if (host_.find('*') != std::string::npos)
 271     return PARSE_ERROR_INVALID_HOST_WILDCARD;
 272
 273   // Null characters are not allowed in hosts.
 274   if (host_.find('\0') != std::string::npos)
 275     return PARSE_ERROR_INVALID_HOST;
 276
 277   return PARSE_SUCCESS;
 278 }
 279
 280 void URLPattern::SetValidSchemes(int valid_schemes) {
 281   spec_.clear();
 282   valid_schemes_ = valid_schemes;
 283 }
 284
 285 void URLPattern::SetHost(const std::string& host) {
 286   spec_.clear();
 287   host_ = host;
 288 }
 289
 290 void URLPattern::SetMatchAllURLs(bool val) {
 291   spec_.clear();
 292   match_all_urls_ = val;
 293
 294   if (val) {
 295     match_subdomains_ = true;
 296     scheme_ = "*";
 297     host_.clear();
 298     SetPath("/*");
 299   }
 300 }
 301
 302 void URLPattern::SetMatchSubdomains(bool val) {
 303   spec_.clear();
 304   match_subdomains_ = val;
 305 }
 306
 307 bool URLPattern::SetScheme(const std::string& scheme) {
 308   spec_.clear();
 309   scheme_ = scheme;
 310   if (scheme_ == "*") {
 311     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
 312   } else if (!IsValidScheme(scheme_)) {
 313     return false;
 314   }
 315   return true;
 316 }
 317
 318 bool URLPattern::IsValidScheme(const std::string& scheme) const {
 319   if (valid_schemes_ == SCHEME_ALL)
 320     return true;
 321
 322   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 323     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
 324       return true;
 325   }
 326
 327   return false;
 328 }
 329
 330 void URLPattern::SetPath(const std::string& path) {
 331   spec_.clear();
 332   path_ = path;
 333   path_escaped_ = path_;
 334   base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
 335   base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
 336 }
 337
 338 bool URLPattern::SetPort(const std::string& port) {
 339   spec_.clear();
 340   if (IsValidPortForScheme(scheme_, port)) {
 341     port_ = port;
 342     return true;
 343   }
 344   return false;
 345 }
 346
 347 bool URLPattern::MatchesURL(const GURL& test) const {
 348   const GURL* test_url = &test;
 349   bool has_inner_url = test.inner_url() != NULL;
 350
 351   if (has_inner_url) {
 352     if (!test.SchemeIsFileSystem())
 353       return false;  // The only nested URLs we handle are filesystem URLs.
 354     test_url = test.inner_url();
 355   }
 356
 357   if (!MatchesScheme(test_url->scheme()))
 358     return false;
 359
 360   if (match_all_urls_)
 361     return true;
 362
 363   std::string path_for_request = test.PathForRequest();
 364   if (has_inner_url)
 365     path_for_request = test_url->path() + path_for_request;
 366
 367   return MatchesSecurityOriginHelper(*test_url) &&
 368          MatchesPath(path_for_request);
 369 }
 370
 371 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
 372   const GURL* test_url = &test;
 373   bool has_inner_url = test.inner_url() != NULL;
 374
 375   if (has_inner_url) {
 376     if (!test.SchemeIsFileSystem())
 377       return false;  // The only nested URLs we handle are filesystem URLs.
 378     test_url = test.inner_url();
 379   }
 380
 381   if (!MatchesScheme(test_url->scheme()))
 382     return false;
 383
 384   if (match_all_urls_)
 385     return true;
 386
 387   return MatchesSecurityOriginHelper(*test_url);
 388 }
 389
 390 bool URLPattern::MatchesScheme(const std::string& test) const {
 391   if (!IsValidScheme(test))
 392     return false;
 393
 394   return scheme_ == "*" || test == scheme_;
 395 }
 396
 397 bool URLPattern::MatchesHost(const std::string& host) const {
 398   std::string test(url::kHttpScheme);
 399   test += url::kStandardSchemeSeparator;
 400   test += host;
 401   test += "/";
 402   return MatchesHost(GURL(test));
 403 }
 404
 405 bool URLPattern::MatchesHost(const GURL& test) const {
 406   // If the hosts are exactly equal, we have a match.
 407   if (test.host() == host_)
 408     return true;
 409
 410   // If we're matching subdomains, and we have no host in the match pattern,
 411   // that means that we're matching all hosts, which means we have a match no
 412   // matter what the test host is.
 413   if (match_subdomains_ && host_.empty())
 414     return true;
 415
 416   // Otherwise, we can only match if our match pattern matches subdomains.
 417   if (!match_subdomains_)
 418     return false;
 419
 420   // We don't do subdomain matching against IP addresses, so we can give up now
 421   // if the test host is an IP address.
 422   if (test.HostIsIPAddress())
 423     return false;
 424
 425   // Check if the test host is a subdomain of our host.
 426   if (test.host().length() <= (host_.length() + 1))
 427     return false;
 428
 429   if (test.host().compare(test.host().length() - host_.length(),
 430                           host_.length(), host_) != 0)
 431     return false;
 432
 433   return test.host()[test.host().length() - host_.length() - 1] == '.';
 434 }
 435
 436 bool URLPattern::ImpliesAllHosts() const {
 437   // Check if it matches all urls or is a pattern like http://*/*.
 438   if (match_all_urls_ ||
 439       (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
 440     return true;
 441   }
 442
 443   // If this doesn't even match subdomains, it can't possibly imply all hosts.
 444   if (!match_subdomains_)
 445     return false;
 446
 447   // If |host_| is a recognized TLD, this will be 0. We don't include private
 448   // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
 449   size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
 450       host_,
 451       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
 452       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
 453   // If there was more than just a TLD in the host (e.g., *.foobar.com), it
 454   // doesn't imply all hosts.
 455   if (registry_length > 0)
 456     return false;
 457
 458   // At this point the host could either be just a TLD ("com") or some unknown
 459   // TLD-like string ("notatld"). To disambiguate between them construct a
 460   // fake URL, and check the registry. This returns 0 if the TLD is
 461   // unrecognized, or the length of the recognized TLD.
 462   registry_length = net::registry_controlled_domains::GetRegistryLength(
 463       base::StringPrintf("foo.%s", host_.c_str()),
 464       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
 465       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
 466   // If we recognized this TLD, then this is a pattern like *.com, and it
 467   // should imply all hosts. Otherwise, this doesn't imply all hosts.
 468   return registry_length > 0;
 469 }
 470
 471 bool URLPattern::MatchesSingleOrigin() const {
 472   // Strictly speaking, the port is part of the origin, but in URLPattern it
 473   // defaults to *. It's not very interesting anyway, so leave it out.
 474   return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
 475 }
 476
 477 bool URLPattern::MatchesPath(const std::string& test) const {
 478   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
 479   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
 480   if (test + "/*" == path_escaped_)
 481     return true;
 482
 483   return base::MatchPattern(test, path_escaped_);
 484 }
 485
 486 const std::string& URLPattern::GetAsString() const {
 487   if (!spec_.empty())
 488     return spec_;
 489
 490   if (match_all_urls_) {
 491     spec_ = kAllUrlsPattern;
 492     return spec_;
 493   }
 494
 495   bool standard_scheme = IsStandardScheme(scheme_);
 496
 497   std::string spec = scheme_ +
 498       (standard_scheme ? url::kStandardSchemeSeparator : ":");
 499
 500   if (scheme_ != url::kFileScheme && standard_scheme) {
 501     if (match_subdomains_) {
 502       spec += "*";
 503       if (!host_.empty())
 504         spec += ".";
 505     }
 506
 507     if (!host_.empty())
 508       spec += host_;
 509
 510     if (port_ != "*") {
 511       spec += ":";
 512       spec += port_;
 513     }
 514   }
 515
 516   if (!path_.empty())
 517     spec += path_;
 518
 519   spec_ = spec;
 520   return spec_;
 521 }
 522
 523 bool URLPattern::OverlapsWith(const URLPattern& other) const {
 524   if (match_all_urls() || other.match_all_urls())
 525     return true;
 526   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
 527           other.MatchesAnyScheme(GetExplicitSchemes()))
 528       && (MatchesHost(other.host()) || other.MatchesHost(host()))
 529       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
 530       && (MatchesPath(StripTrailingWildcard(other.path())) ||
 531           other.MatchesPath(StripTrailingWildcard(path())));
 532 }
 533
 534 bool URLPattern::Contains(const URLPattern& other) const {
 535   if (match_all_urls())
 536     return true;
 537   return MatchesAllSchemes(other.GetExplicitSchemes())
 538       && MatchesHost(other.host())
 539       && MatchesPortPattern(other.port())
 540       && MatchesPath(StripTrailingWildcard(other.path()));
 541 }
 542
 543 bool URLPattern::MatchesAnyScheme(
 544     const std::vector<std::string>& schemes) const {
 545   for (std::vector<std::string>::const_iterator i = schemes.begin();
 546        i != schemes.end(); ++i) {
 547     if (MatchesScheme(*i))
 548       return true;
 549   }
 550
 551   return false;
 552 }
 553
 554 bool URLPattern::MatchesAllSchemes(
 555     const std::vector<std::string>& schemes) const {
 556   for (std::vector<std::string>::const_iterator i = schemes.begin();
 557        i != schemes.end(); ++i) {
 558     if (!MatchesScheme(*i))
 559       return false;
 560   }
 561
 562   return true;
 563 }
 564
 565 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
 566   // Ignore hostname if scheme is file://.
 567   if (scheme_ != url::kFileScheme && !MatchesHost(test))
 568     return false;
 569
 570   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
 571     return false;
 572
 573   return true;
 574 }
 575
 576 bool URLPattern::MatchesPortPattern(const std::string& port) const {
 577   return port_ == "*" || port_ == port;
 578 }
 579
 580 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
 581   std::vector<std::string> result;
 582
 583   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
 584     result.push_back(scheme_);
 585     return result;
 586   }
 587
 588   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 589     if (MatchesScheme(kValidSchemes[i])) {
 590       result.push_back(kValidSchemes[i]);
 591     }
 592   }
 593
 594   return result;
 595 }
 596
 597 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
 598   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
 599   std::vector<URLPattern> result;
 600
 601   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
 602        i != explicit_schemes.end(); ++i) {
 603     URLPattern temp = *this;
 604     temp.SetScheme(*i);
 605     temp.SetMatchAllURLs(false);
 606     result.push_back(temp);
 607   }
 608
 609   return result;
 610 }
 611
 612 // static
 613 const char* URLPattern::GetParseResultString(
 614     URLPattern::ParseResult parse_result) {
 615   return kParseResultMessages[parse_result];
 616 }