url/gurl.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifdef WIN32
   6 #include <windows.h>
   7 #else
   8 #include <pthread.h>
   9 #endif
  10
  11 #include <algorithm>
  12 #include <ostream>
  13
  14 #include "url/gurl.h"
  15
  16 #include "base/logging.h"
  17 #include "base/strings/string_piece.h"
  18 #include "base/strings/string_util.h"
  19 #include "url/url_canon_stdstring.h"
  20 #include "url/url_util.h"
  21
  22 namespace {
  23
  24 static std::string* empty_string = NULL;
  25 static GURL* empty_gurl = NULL;
  26
  27 #ifdef WIN32
  28
  29 // Returns a static reference to an empty string for returning a reference
  30 // when there is no underlying string.
  31 const std::string& EmptyStringForGURL() {
  32   // Avoid static object construction/destruction on startup/shutdown.
  33   if (!empty_string) {
  34     // Create the string. Be careful that we don't break in the case that this
  35     // is being called from multiple threads. Statics are not threadsafe.
  36     std::string* new_empty_string = new std::string;
  37     if (InterlockedCompareExchangePointer(
  38         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
  39       // The old value was non-NULL, so no replacement was done. Another
  40       // thread did the initialization out from under us.
  41       delete new_empty_string;
  42     }
  43   }
  44   return *empty_string;
  45 }
  46
  47 #else
  48
  49 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
  50 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
  51
  52 void EmptyStringForGURLOnce(void) {
  53   empty_string = new std::string;
  54 }
  55
  56 const std::string& EmptyStringForGURL() {
  57   // Avoid static object construction/destruction on startup/shutdown.
  58   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
  59   return *empty_string;
  60 }
  61
  62 #endif  // WIN32
  63
  64 } // namespace
  65
  66 GURL::GURL() : is_valid_(false) {
  67 }
  68
  69 GURL::GURL(const GURL& other)
  70     : spec_(other.spec_),
  71       is_valid_(other.is_valid_),
  72       parsed_(other.parsed_) {
  73   if (other.inner_url_)
  74     inner_url_.reset(new GURL(*other.inner_url_));
  75   // Valid filesystem urls should always have an inner_url_.
  76   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
  77 }
  78
  79 GURL::GURL(const std::string& url_string) {
  80   InitCanonical(url_string, true);
  81 }
  82
  83 GURL::GURL(const base::string16& url_string) {
  84   InitCanonical(url_string, true);
  85 }
  86
  87 GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
  88   InitCanonical(url_string, false);
  89 }
  90
  91 GURL::GURL(const char* canonical_spec,
  92            size_t canonical_spec_len,
  93            const url::Parsed& parsed,
  94            bool is_valid)
  95     : spec_(canonical_spec, canonical_spec_len),
  96       is_valid_(is_valid),
  97       parsed_(parsed) {
  98   InitializeFromCanonicalSpec();
  99 }
 100
 101 GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid)
 102     : is_valid_(is_valid),
 103       parsed_(parsed) {
 104   spec_.swap(canonical_spec);
 105   InitializeFromCanonicalSpec();
 106 }
 107
 108 template<typename STR>
 109 void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
 110   // Reserve enough room in the output for the input, plus some extra so that
 111   // we have room if we have to escape a few things without reallocating.
 112   spec_.reserve(input_spec.size() + 32);
 113   url::StdStringCanonOutput output(&spec_);
 114   is_valid_ = url::Canonicalize(
 115       input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
 116       NULL, &output, &parsed_);
 117
 118   output.Complete();  // Must be done before using string.
 119   if (is_valid_ && SchemeIsFileSystem()) {
 120     inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
 121                               *parsed_.inner_parsed(), true));
 122   }
 123 }
 124
 125 void GURL::InitializeFromCanonicalSpec() {
 126   if (is_valid_ && SchemeIsFileSystem()) {
 127     inner_url_.reset(
 128         new GURL(spec_.data(), parsed_.Length(),
 129                  *parsed_.inner_parsed(), true));
 130   }
 131
 132 #ifndef NDEBUG
 133   // For testing purposes, check that the parsed canonical URL is identical to
 134   // what we would have produced. Skip checking for invalid URLs have no meaning
 135   // and we can't always canonicalize then reproducabely.
 136   if (is_valid_) {
 137     url::Component scheme;
 138     // We can't do this check on the inner_url of a filesystem URL, as
 139     // canonical_spec actually points to the start of the outer URL, so we'd
 140     // end up with infinite recursion in this constructor.
 141     if (!url::FindAndCompareScheme(spec_.data(), spec_.length(),
 142                                    url::kFileSystemScheme, &scheme) ||
 143         scheme.begin == parsed_.scheme.begin) {
 144       // We need to retain trailing whitespace on path URLs, as the |parsed_|
 145       // spec we originally received may legitimately contain trailing white-
 146       // space on the path or  components e.g. if the #ref has been
 147       // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
 148       GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
 149
 150       DCHECK(test_url.is_valid_ == is_valid_);
 151       DCHECK(test_url.spec_ == spec_);
 152
 153       DCHECK(test_url.parsed_.scheme == parsed_.scheme);
 154       DCHECK(test_url.parsed_.username == parsed_.username);
 155       DCHECK(test_url.parsed_.password == parsed_.password);
 156       DCHECK(test_url.parsed_.host == parsed_.host);
 157       DCHECK(test_url.parsed_.port == parsed_.port);
 158       DCHECK(test_url.parsed_.path == parsed_.path);
 159       DCHECK(test_url.parsed_.query == parsed_.query);
 160       DCHECK(test_url.parsed_.ref == parsed_.ref);
 161     }
 162   }
 163 #endif
 164 }
 165
 166 GURL::~GURL() {
 167 }
 168
 169 GURL& GURL::operator=(GURL other) {
 170   Swap(&other);
 171   return *this;
 172 }
 173
 174 const std::string& GURL::spec() const {
 175   if (is_valid_ || spec_.empty())
 176     return spec_;
 177
 178   DCHECK(false) << "Trying to get the spec of an invalid URL!";
 179   return EmptyStringForGURL();
 180 }
 181
 182 bool GURL::operator==(const GURL& other) const {
 183   return spec_ == other.spec_;
 184 }
 185
 186 bool GURL::operator!=(const GURL& other) const {
 187   return spec_ != other.spec_;
 188 }
 189
 190 bool GURL::operator<(const GURL& other) const {
 191   return spec_ < other.spec_;
 192 }
 193
 194 bool GURL::operator>(const GURL& other) const {
 195   return spec_ > other.spec_;
 196 }
 197
 198 // Note: code duplicated below (it's inconvenient to use a template here).
 199 GURL GURL::Resolve(const std::string& relative) const {
 200   // Not allowed for invalid URLs.
 201   if (!is_valid_)
 202     return GURL();
 203
 204   GURL result;
 205
 206   // Reserve enough room in the output for the input, plus some extra so that
 207   // we have room if we have to escape a few things without reallocating.
 208   result.spec_.reserve(spec_.size() + 32);
 209   url::StdStringCanonOutput output(&result.spec_);
 210
 211   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 212                             parsed_, relative.data(),
 213                             static_cast<int>(relative.length()),
 214                             nullptr, &output, &result.parsed_)) {
 215     // Error resolving, return an empty URL.
 216     return GURL();
 217   }
 218
 219   output.Complete();
 220   result.is_valid_ = true;
 221   if (result.SchemeIsFileSystem()) {
 222     result.inner_url_.reset(
 223         new GURL(result.spec_.data(), result.parsed_.Length(),
 224                  *result.parsed_.inner_parsed(), true));
 225   }
 226   return result;
 227 }
 228
 229 // Note: code duplicated above (it's inconvenient to use a template here).
 230 GURL GURL::Resolve(const base::string16& relative) const {
 231   // Not allowed for invalid URLs.
 232   if (!is_valid_)
 233     return GURL();
 234
 235   GURL result;
 236
 237   // Reserve enough room in the output for the input, plus some extra so that
 238   // we have room if we have to escape a few things without reallocating.
 239   result.spec_.reserve(spec_.size() + 32);
 240   url::StdStringCanonOutput output(&result.spec_);
 241
 242   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 243                             parsed_, relative.data(),
 244                             static_cast<int>(relative.length()),
 245                             nullptr, &output, &result.parsed_)) {
 246     // Error resolving, return an empty URL.
 247     return GURL();
 248   }
 249
 250   output.Complete();
 251   result.is_valid_ = true;
 252   if (result.SchemeIsFileSystem()) {
 253     result.inner_url_.reset(
 254         new GURL(result.spec_.data(), result.parsed_.Length(),
 255                  *result.parsed_.inner_parsed(), true));
 256   }
 257   return result;
 258 }
 259
 260 // Note: code duplicated below (it's inconvenient to use a template here).
 261 GURL GURL::ReplaceComponents(
 262     const url::Replacements<char>& replacements) const {
 263   GURL result;
 264
 265   // Not allowed for invalid URLs.
 266   if (!is_valid_)
 267     return GURL();
 268
 269   // Reserve enough room in the output for the input, plus some extra so that
 270   // we have room if we have to escape a few things without reallocating.
 271   result.spec_.reserve(spec_.size() + 32);
 272   url::StdStringCanonOutput output(&result.spec_);
 273
 274   result.is_valid_ = url::ReplaceComponents(
 275       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 276       NULL, &output, &result.parsed_);
 277
 278   output.Complete();
 279   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 280     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 281                                      *result.parsed_.inner_parsed(), true));
 282   }
 283   return result;
 284 }
 285
 286 // Note: code duplicated above (it's inconvenient to use a template here).
 287 GURL GURL::ReplaceComponents(
 288     const url::Replacements<base::char16>& replacements) const {
 289   GURL result;
 290
 291   // Not allowed for invalid URLs.
 292   if (!is_valid_)
 293     return GURL();
 294
 295   // Reserve enough room in the output for the input, plus some extra so that
 296   // we have room if we have to escape a few things without reallocating.
 297   result.spec_.reserve(spec_.size() + 32);
 298   url::StdStringCanonOutput output(&result.spec_);
 299
 300   result.is_valid_ = url::ReplaceComponents(
 301       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 302       NULL, &output, &result.parsed_);
 303
 304   output.Complete();
 305   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 306     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 307                                      *result.parsed_.inner_parsed(), true));
 308   }
 309   return result;
 310 }
 311
 312 GURL GURL::GetOrigin() const {
 313   // This doesn't make sense for invalid or nonstandard URLs, so return
 314   // the empty URL
 315   if (!is_valid_ || !IsStandard())
 316     return GURL();
 317
 318   if (SchemeIsFileSystem())
 319     return inner_url_->GetOrigin();
 320
 321   url::Replacements<char> replacements;
 322   replacements.ClearUsername();
 323   replacements.ClearPassword();
 324   replacements.ClearPath();
 325   replacements.ClearQuery();
 326   replacements.ClearRef();
 327
 328   return ReplaceComponents(replacements);
 329 }
 330
 331 GURL GURL::GetAsReferrer() const {
 332   if (!is_valid_ || !SchemeIsHTTPOrHTTPS())
 333     return GURL();
 334
 335   if (!has_ref() && !has_username() && !has_password())
 336     return GURL(*this);
 337
 338   url::Replacements<char> replacements;
 339   replacements.ClearRef();
 340   replacements.ClearUsername();
 341   replacements.ClearPassword();
 342   return ReplaceComponents(replacements);
 343 }
 344
 345 GURL GURL::GetWithEmptyPath() const {
 346   // This doesn't make sense for invalid or nonstandard URLs, so return
 347   // the empty URL.
 348   if (!is_valid_ || !IsStandard())
 349     return GURL();
 350
 351   // We could optimize this since we know that the URL is canonical, and we are
 352   // appending a canonical path, so avoiding re-parsing.
 353   GURL other(*this);
 354   if (parsed_.path.len == 0)
 355     return other;
 356
 357   // Clear everything after the path.
 358   other.parsed_.query.reset();
 359   other.parsed_.ref.reset();
 360
 361   // Set the path, since the path is longer than one, we can just set the
 362   // first character and resize.
 363   other.spec_[other.parsed_.path.begin] = '/';
 364   other.parsed_.path.len = 1;
 365   other.spec_.resize(other.parsed_.path.begin + 1);
 366   return other;
 367 }
 368
 369 bool GURL::IsStandard() const {
 370   return url::IsStandard(spec_.data(), parsed_.scheme);
 371 }
 372
 373 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
 374   if (parsed_.scheme.len <= 0)
 375     return lower_ascii_scheme == NULL;
 376   return base::LowerCaseEqualsASCII(
 377       base::StringPiece(spec_.data() + parsed_.scheme.begin,
 378                         parsed_.scheme.len),
 379       lower_ascii_scheme);
 380 }
 381
 382 bool GURL::SchemeIsHTTPOrHTTPS() const {
 383   return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
 384 }
 385
 386 bool GURL::SchemeIsWSOrWSS() const {
 387   return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
 388 }
 389
 390 int GURL::IntPort() const {
 391   if (parsed_.port.is_nonempty())
 392     return url::ParsePort(spec_.data(), parsed_.port);
 393   return url::PORT_UNSPECIFIED;
 394 }
 395
 396 int GURL::EffectiveIntPort() const {
 397   int int_port = IntPort();
 398   if (int_port == url::PORT_UNSPECIFIED && IsStandard())
 399     return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
 400                                      parsed_.scheme.len);
 401   return int_port;
 402 }
 403
 404 std::string GURL::ExtractFileName() const {
 405   url::Component file_component;
 406   url::ExtractFileName(spec_.data(), parsed_.path, &file_component);
 407   return ComponentString(file_component);
 408 }
 409
 410 std::string GURL::PathForRequest() const {
 411   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
 412   if (parsed_.ref.len >= 0) {
 413     // Clip off the reference when it exists. The reference starts after the #
 414     // sign, so we have to subtract one to also remove it.
 415     return std::string(spec_, parsed_.path.begin,
 416                        parsed_.ref.begin - parsed_.path.begin - 1);
 417   }
 418   // Compute the actual path length, rather than depending on the spec's
 419   // terminator.  If we're an inner_url, our spec continues on into our outer
 420   // url's path/query/ref.
 421   int path_len = parsed_.path.len;
 422   if (parsed_.query.is_valid())
 423     path_len = parsed_.query.end() - parsed_.path.begin;
 424
 425   return std::string(spec_, parsed_.path.begin, path_len);
 426 }
 427
 428 std::string GURL::HostNoBrackets() const {
 429   // If host looks like an IPv6 literal, strip the square brackets.
 430   url::Component h(parsed_.host);
 431   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
 432     h.begin++;
 433     h.len -= 2;
 434   }
 435   return ComponentString(h);
 436 }
 437
 438 std::string GURL::GetContent() const {
 439   return is_valid_ ? ComponentString(parsed_.GetContent()) : std::string();
 440 }
 441
 442 bool GURL::HostIsIPAddress() const {
 443   if (!is_valid_ || spec_.empty())
 444      return false;
 445
 446   url::RawCanonOutputT<char, 128> ignored_output;
 447   url::CanonHostInfo host_info;
 448   url::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, &ignored_output,
 449                              &host_info);
 450   return host_info.IsIPAddress();
 451 }
 452
 453 #ifdef WIN32
 454
 455 const GURL& GURL::EmptyGURL() {
 456   // Avoid static object construction/destruction on startup/shutdown.
 457   if (!empty_gurl) {
 458     // Create the string. Be careful that we don't break in the case that this
 459     // is being called from multiple threads.
 460     GURL* new_empty_gurl = new GURL;
 461     if (InterlockedCompareExchangePointer(
 462         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
 463       // The old value was non-NULL, so no replacement was done. Another
 464       // thread did the initialization out from under us.
 465       delete new_empty_gurl;
 466     }
 467   }
 468   return *empty_gurl;
 469 }
 470
 471 #else
 472
 473 void EmptyGURLOnce(void) {
 474   empty_gurl = new GURL;
 475 }
 476
 477 const GURL& GURL::EmptyGURL() {
 478   // Avoid static object construction/destruction on startup/shutdown.
 479   pthread_once(&empty_gurl_once, EmptyGURLOnce);
 480   return *empty_gurl;
 481 }
 482
 483 #endif  // WIN32
 484
 485 bool GURL::DomainIs(base::StringPiece lower_ascii_domain) const {
 486   if (!is_valid_ || lower_ascii_domain.empty())
 487     return false;
 488
 489   // FileSystem URLs have empty parsed_.host, so check this first.
 490   if (SchemeIsFileSystem() && inner_url_)
 491     return inner_url_->DomainIs(lower_ascii_domain);
 492
 493   if (!parsed_.host.is_nonempty())
 494     return false;
 495
 496   // If the host name ends with a dot but the input domain doesn't,
 497   // then we ignore the dot in the host name.
 498   const char* host_last_pos = spec_.data() + parsed_.host.end() - 1;
 499   int host_len = parsed_.host.len;
 500   int domain_len = lower_ascii_domain.length();
 501   if ('.' == *host_last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
 502     host_last_pos--;
 503     host_len--;
 504   }
 505
 506   if (host_len < domain_len)
 507     return false;
 508
 509   // |host_first_pos| is the start of the compared part of the host name, not
 510   // start of the whole host name.
 511   const char* host_first_pos = spec_.data() + parsed_.host.begin +
 512                                host_len - domain_len;
 513
 514   if (!base::LowerCaseEqualsASCII(
 515            base::StringPiece(host_first_pos, domain_len), lower_ascii_domain))
 516     return false;
 517
 518   // Make sure there aren't extra characters in host before the compared part;
 519   // if the host name is longer than the input domain name, then the character
 520   // immediately before the compared part should be a dot. For example,
 521   // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
 522   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
 523       '.' != *(host_first_pos - 1))
 524     return false;
 525
 526   return true;
 527 }
 528
 529 void GURL::Swap(GURL* other) {
 530   spec_.swap(other->spec_);
 531   std::swap(is_valid_, other->is_valid_);
 532   std::swap(parsed_, other->parsed_);
 533   inner_url_.swap(other->inner_url_);
 534 }
 535
 536 std::ostream& operator<<(std::ostream& out, const GURL& url) {
 537   return out << url.possibly_invalid_spec();
 538 }