url/gurl.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifdef WIN32
   6 #include <windows.h>
   7 #else
   8 #include <pthread.h>
   9 #endif
  10
  11 #include <algorithm>
  12 #include <ostream>
  13
  14 #include "url/gurl.h"
  15
  16 #include "base/logging.h"
  17 #include "base/strings/string_util.h"
  18 #include "url/url_canon_stdstring.h"
  19 #include "url/url_util.h"
  20
  21 namespace {
  22
  23 static std::string* empty_string = NULL;
  24 static GURL* empty_gurl = NULL;
  25
  26 #ifdef WIN32
  27
  28 // Returns a static reference to an empty string for returning a reference
  29 // when there is no underlying string.
  30 const std::string& EmptyStringForGURL() {
  31   // Avoid static object construction/destruction on startup/shutdown.
  32   if (!empty_string) {
  33     // Create the string. Be careful that we don't break in the case that this
  34     // is being called from multiple threads. Statics are not threadsafe.
  35     std::string* new_empty_string = new std::string;
  36     if (InterlockedCompareExchangePointer(
  37         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
  38       // The old value was non-NULL, so no replacement was done. Another
  39       // thread did the initialization out from under us.
  40       delete new_empty_string;
  41     }
  42   }
  43   return *empty_string;
  44 }
  45
  46 #else
  47
  48 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
  49 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
  50
  51 void EmptyStringForGURLOnce(void) {
  52   empty_string = new std::string;
  53 }
  54
  55 const std::string& EmptyStringForGURL() {
  56   // Avoid static object construction/destruction on startup/shutdown.
  57   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
  58   return *empty_string;
  59 }
  60
  61 #endif  // WIN32
  62
  63 } // namespace
  64
  65 GURL::GURL() : is_valid_(false) {
  66 }
  67
  68 GURL::GURL(const GURL& other)
  69     : spec_(other.spec_),
  70       is_valid_(other.is_valid_),
  71       parsed_(other.parsed_) {
  72   if (other.inner_url_)
  73     inner_url_.reset(new GURL(*other.inner_url_));
  74   // Valid filesystem urls should always have an inner_url_.
  75   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
  76 }
  77
  78 GURL::GURL(const std::string& url_string) {
  79   InitCanonical(url_string, true);
  80 }
  81
  82 GURL::GURL(const base::string16& url_string) {
  83   InitCanonical(url_string, true);
  84 }
  85
  86 GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
  87   InitCanonical(url_string, false);
  88 }
  89
  90 GURL::GURL(const char* canonical_spec,
  91            size_t canonical_spec_len,
  92            const url::Parsed& parsed,
  93            bool is_valid)
  94     : spec_(canonical_spec, canonical_spec_len),
  95       is_valid_(is_valid),
  96       parsed_(parsed) {
  97   InitializeFromCanonicalSpec();
  98 }
  99
 100 GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid)
 101     : is_valid_(is_valid),
 102       parsed_(parsed) {
 103   spec_.swap(canonical_spec);
 104   InitializeFromCanonicalSpec();
 105 }
 106
 107 template<typename STR>
 108 void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
 109   // Reserve enough room in the output for the input, plus some extra so that
 110   // we have room if we have to escape a few things without reallocating.
 111   spec_.reserve(input_spec.size() + 32);
 112   url::StdStringCanonOutput output(&spec_);
 113   is_valid_ = url::Canonicalize(
 114       input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
 115       NULL, &output, &parsed_);
 116
 117   output.Complete();  // Must be done before using string.
 118   if (is_valid_ && SchemeIsFileSystem()) {
 119     inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
 120                               *parsed_.inner_parsed(), true));
 121   }
 122 }
 123
 124 void GURL::InitializeFromCanonicalSpec() {
 125   if (is_valid_ && SchemeIsFileSystem()) {
 126     inner_url_.reset(
 127         new GURL(spec_.data(), parsed_.Length(),
 128                  *parsed_.inner_parsed(), true));
 129   }
 130
 131 #ifndef NDEBUG
 132   // For testing purposes, check that the parsed canonical URL is identical to
 133   // what we would have produced. Skip checking for invalid URLs have no meaning
 134   // and we can't always canonicalize then reproducabely.
 135   if (is_valid_) {
 136     url::Component scheme;
 137     // We can't do this check on the inner_url of a filesystem URL, as
 138     // canonical_spec actually points to the start of the outer URL, so we'd
 139     // end up with infinite recursion in this constructor.
 140     if (!url::FindAndCompareScheme(spec_.data(), spec_.length(),
 141                                    url::kFileSystemScheme, &scheme) ||
 142         scheme.begin == parsed_.scheme.begin) {
 143       // We need to retain trailing whitespace on path URLs, as the |parsed_|
 144       // spec we originally received may legitimately contain trailing white-
 145       // space on the path or  components e.g. if the #ref has been
 146       // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
 147       GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
 148
 149       DCHECK(test_url.is_valid_ == is_valid_);
 150       DCHECK(test_url.spec_ == spec_);
 151
 152       DCHECK(test_url.parsed_.scheme == parsed_.scheme);
 153       DCHECK(test_url.parsed_.username == parsed_.username);
 154       DCHECK(test_url.parsed_.password == parsed_.password);
 155       DCHECK(test_url.parsed_.host == parsed_.host);
 156       DCHECK(test_url.parsed_.port == parsed_.port);
 157       DCHECK(test_url.parsed_.path == parsed_.path);
 158       DCHECK(test_url.parsed_.query == parsed_.query);
 159       DCHECK(test_url.parsed_.ref == parsed_.ref);
 160     }
 161   }
 162 #endif
 163 }
 164
 165 GURL::~GURL() {
 166 }
 167
 168 GURL& GURL::operator=(GURL other) {
 169   Swap(&other);
 170   return *this;
 171 }
 172
 173 const std::string& GURL::spec() const {
 174   if (is_valid_ || spec_.empty())
 175     return spec_;
 176
 177   DCHECK(false) << "Trying to get the spec of an invalid URL!";
 178   return EmptyStringForGURL();
 179 }
 180
 181 bool GURL::operator==(const GURL& other) const {
 182   return spec_ == other.spec_;
 183 }
 184
 185 bool GURL::operator!=(const GURL& other) const {
 186   return spec_ != other.spec_;
 187 }
 188
 189 bool GURL::operator<(const GURL& other) const {
 190   return spec_ < other.spec_;
 191 }
 192
 193 bool GURL::operator>(const GURL& other) const {
 194   return spec_ > other.spec_;
 195 }
 196
 197 GURL GURL::Resolve(const std::string& relative) const {
 198   return ResolveWithCharsetConverter(relative, NULL);
 199 }
 200 GURL GURL::Resolve(const base::string16& relative) const {
 201   return ResolveWithCharsetConverter(relative, NULL);
 202 }
 203
 204 // Note: code duplicated below (it's inconvenient to use a template here).
 205 GURL GURL::ResolveWithCharsetConverter(
 206     const std::string& relative,
 207     url::CharsetConverter* charset_converter) const {
 208   // Not allowed for invalid URLs.
 209   if (!is_valid_)
 210     return GURL();
 211
 212   GURL result;
 213
 214   // Reserve enough room in the output for the input, plus some extra so that
 215   // we have room if we have to escape a few things without reallocating.
 216   result.spec_.reserve(spec_.size() + 32);
 217   url::StdStringCanonOutput output(&result.spec_);
 218
 219   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 220                             parsed_, relative.data(),
 221                             static_cast<int>(relative.length()),
 222                             charset_converter, &output, &result.parsed_)) {
 223     // Error resolving, return an empty URL.
 224     return GURL();
 225   }
 226
 227   output.Complete();
 228   result.is_valid_ = true;
 229   if (result.SchemeIsFileSystem()) {
 230     result.inner_url_.reset(
 231         new GURL(result.spec_.data(), result.parsed_.Length(),
 232                  *result.parsed_.inner_parsed(), true));
 233   }
 234   return result;
 235 }
 236
 237 // Note: code duplicated above (it's inconvenient to use a template here).
 238 GURL GURL::ResolveWithCharsetConverter(
 239     const base::string16& relative,
 240     url::CharsetConverter* charset_converter) const {
 241   // Not allowed for invalid URLs.
 242   if (!is_valid_)
 243     return GURL();
 244
 245   GURL result;
 246
 247   // Reserve enough room in the output for the input, plus some extra so that
 248   // we have room if we have to escape a few things without reallocating.
 249   result.spec_.reserve(spec_.size() + 32);
 250   url::StdStringCanonOutput output(&result.spec_);
 251
 252   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 253                             parsed_, relative.data(),
 254                             static_cast<int>(relative.length()),
 255                             charset_converter, &output, &result.parsed_)) {
 256     // Error resolving, return an empty URL.
 257     return GURL();
 258   }
 259
 260   output.Complete();
 261   result.is_valid_ = true;
 262   if (result.SchemeIsFileSystem()) {
 263     result.inner_url_.reset(
 264         new GURL(result.spec_.data(), result.parsed_.Length(),
 265                  *result.parsed_.inner_parsed(), true));
 266   }
 267   return result;
 268 }
 269
 270 // Note: code duplicated below (it's inconvenient to use a template here).
 271 GURL GURL::ReplaceComponents(
 272     const url::Replacements<char>& replacements) const {
 273   GURL result;
 274
 275   // Not allowed for invalid URLs.
 276   if (!is_valid_)
 277     return GURL();
 278
 279   // Reserve enough room in the output for the input, plus some extra so that
 280   // we have room if we have to escape a few things without reallocating.
 281   result.spec_.reserve(spec_.size() + 32);
 282   url::StdStringCanonOutput output(&result.spec_);
 283
 284   result.is_valid_ = url::ReplaceComponents(
 285       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 286       NULL, &output, &result.parsed_);
 287
 288   output.Complete();
 289   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 290     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 291                                      *result.parsed_.inner_parsed(), true));
 292   }
 293   return result;
 294 }
 295
 296 // Note: code duplicated above (it's inconvenient to use a template here).
 297 GURL GURL::ReplaceComponents(
 298     const url::Replacements<base::char16>& replacements) const {
 299   GURL result;
 300
 301   // Not allowed for invalid URLs.
 302   if (!is_valid_)
 303     return GURL();
 304
 305   // Reserve enough room in the output for the input, plus some extra so that
 306   // we have room if we have to escape a few things without reallocating.
 307   result.spec_.reserve(spec_.size() + 32);
 308   url::StdStringCanonOutput output(&result.spec_);
 309
 310   result.is_valid_ = url::ReplaceComponents(
 311       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 312       NULL, &output, &result.parsed_);
 313
 314   output.Complete();
 315   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 316     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 317                                      *result.parsed_.inner_parsed(), true));
 318   }
 319   return result;
 320 }
 321
 322 GURL GURL::GetOrigin() const {
 323   // This doesn't make sense for invalid or nonstandard URLs, so return
 324   // the empty URL
 325   if (!is_valid_ || !IsStandard())
 326     return GURL();
 327
 328   if (SchemeIsFileSystem())
 329     return inner_url_->GetOrigin();
 330
 331   url::Replacements<char> replacements;
 332   replacements.ClearUsername();
 333   replacements.ClearPassword();
 334   replacements.ClearPath();
 335   replacements.ClearQuery();
 336   replacements.ClearRef();
 337
 338   return ReplaceComponents(replacements);
 339 }
 340
 341 GURL GURL::GetAsReferrer() const {
 342   if (!is_valid_ || !SchemeIsHTTPOrHTTPS())
 343     return GURL();
 344
 345   if (!has_ref() && !has_username() && !has_password())
 346     return GURL(*this);
 347
 348   url::Replacements<char> replacements;
 349   replacements.ClearRef();
 350   replacements.ClearUsername();
 351   replacements.ClearPassword();
 352   return ReplaceComponents(replacements);
 353 }
 354
 355 GURL GURL::GetWithEmptyPath() const {
 356   // This doesn't make sense for invalid or nonstandard URLs, so return
 357   // the empty URL.
 358   if (!is_valid_ || !IsStandard())
 359     return GURL();
 360
 361   // We could optimize this since we know that the URL is canonical, and we are
 362   // appending a canonical path, so avoiding re-parsing.
 363   GURL other(*this);
 364   if (parsed_.path.len == 0)
 365     return other;
 366
 367   // Clear everything after the path.
 368   other.parsed_.query.reset();
 369   other.parsed_.ref.reset();
 370
 371   // Set the path, since the path is longer than one, we can just set the
 372   // first character and resize.
 373   other.spec_[other.parsed_.path.begin] = '/';
 374   other.parsed_.path.len = 1;
 375   other.spec_.resize(other.parsed_.path.begin + 1);
 376   return other;
 377 }
 378
 379 bool GURL::IsStandard() const {
 380   return url::IsStandard(spec_.data(), parsed_.scheme);
 381 }
 382
 383 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
 384   if (parsed_.scheme.len <= 0)
 385     return lower_ascii_scheme == NULL;
 386   return base::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
 387                                     spec_.data() + parsed_.scheme.end(),
 388                                     lower_ascii_scheme);
 389 }
 390
 391 bool GURL::SchemeIsHTTPOrHTTPS() const {
 392   return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
 393 }
 394
 395 bool GURL::SchemeIsWSOrWSS() const {
 396   return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
 397 }
 398
 399 int GURL::IntPort() const {
 400   if (parsed_.port.is_nonempty())
 401     return url::ParsePort(spec_.data(), parsed_.port);
 402   return url::PORT_UNSPECIFIED;
 403 }
 404
 405 int GURL::EffectiveIntPort() const {
 406   int int_port = IntPort();
 407   if (int_port == url::PORT_UNSPECIFIED && IsStandard())
 408     return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
 409                                      parsed_.scheme.len);
 410   return int_port;
 411 }
 412
 413 std::string GURL::ExtractFileName() const {
 414   url::Component file_component;
 415   url::ExtractFileName(spec_.data(), parsed_.path, &file_component);
 416   return ComponentString(file_component);
 417 }
 418
 419 std::string GURL::PathForRequest() const {
 420   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
 421   if (parsed_.ref.len >= 0) {
 422     // Clip off the reference when it exists. The reference starts after the #
 423     // sign, so we have to subtract one to also remove it.
 424     return std::string(spec_, parsed_.path.begin,
 425                        parsed_.ref.begin - parsed_.path.begin - 1);
 426   }
 427   // Compute the actual path length, rather than depending on the spec's
 428   // terminator.  If we're an inner_url, our spec continues on into our outer
 429   // url's path/query/ref.
 430   int path_len = parsed_.path.len;
 431   if (parsed_.query.is_valid())
 432     path_len = parsed_.query.end() - parsed_.path.begin;
 433
 434   return std::string(spec_, parsed_.path.begin, path_len);
 435 }
 436
 437 std::string GURL::HostNoBrackets() const {
 438   // If host looks like an IPv6 literal, strip the square brackets.
 439   url::Component h(parsed_.host);
 440   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
 441     h.begin++;
 442     h.len -= 2;
 443   }
 444   return ComponentString(h);
 445 }
 446
 447 std::string GURL::GetContent() const {
 448   return is_valid_ ? ComponentString(parsed_.GetContent()) : std::string();
 449 }
 450
 451 bool GURL::HostIsIPAddress() const {
 452   if (!is_valid_ || spec_.empty())
 453      return false;
 454
 455   url::RawCanonOutputT<char, 128> ignored_output;
 456   url::CanonHostInfo host_info;
 457   url::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, &ignored_output,
 458                              &host_info);
 459   return host_info.IsIPAddress();
 460 }
 461
 462 #ifdef WIN32
 463
 464 const GURL& GURL::EmptyGURL() {
 465   // Avoid static object construction/destruction on startup/shutdown.
 466   if (!empty_gurl) {
 467     // Create the string. Be careful that we don't break in the case that this
 468     // is being called from multiple threads.
 469     GURL* new_empty_gurl = new GURL;
 470     if (InterlockedCompareExchangePointer(
 471         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
 472       // The old value was non-NULL, so no replacement was done. Another
 473       // thread did the initialization out from under us.
 474       delete new_empty_gurl;
 475     }
 476   }
 477   return *empty_gurl;
 478 }
 479
 480 #else
 481
 482 void EmptyGURLOnce(void) {
 483   empty_gurl = new GURL;
 484 }
 485
 486 const GURL& GURL::EmptyGURL() {
 487   // Avoid static object construction/destruction on startup/shutdown.
 488   pthread_once(&empty_gurl_once, EmptyGURLOnce);
 489   return *empty_gurl;
 490 }
 491
 492 #endif  // WIN32
 493
 494 bool GURL::DomainIs(const char* lower_ascii_domain,
 495                     int domain_len) const {
 496   // Return false if this URL is not valid or domain is empty.
 497   if (!is_valid_ || !domain_len)
 498     return false;
 499
 500   // FileSystem URLs have empty parsed_.host, so check this first.
 501   if (SchemeIsFileSystem() && inner_url_)
 502     return inner_url_->DomainIs(lower_ascii_domain, domain_len);
 503
 504   if (!parsed_.host.is_nonempty())
 505     return false;
 506
 507   // Check whether the host name is end with a dot. If yes, treat it
 508   // the same as no-dot unless the input comparison domain is end
 509   // with dot.
 510   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
 511   int host_len = parsed_.host.len;
 512   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
 513     last_pos--;
 514     host_len--;
 515   }
 516
 517   // Return false if host's length is less than domain's length.
 518   if (host_len < domain_len)
 519     return false;
 520
 521   // Compare this url whether belong specific domain.
 522   const char* start_pos = spec_.data() + parsed_.host.begin +
 523                           host_len - domain_len;
 524
 525   if (!base::LowerCaseEqualsASCII(start_pos,
 526                                   last_pos + 1,
 527                                   lower_ascii_domain,
 528                                   lower_ascii_domain + domain_len))
 529     return false;
 530
 531   // Check whether host has right domain start with dot, make sure we got
 532   // right domain range. For example www.google.com has domain
 533   // "google.com" but www.iamnotgoogle.com does not.
 534   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
 535       '.' != *(start_pos - 1))
 536     return false;
 537
 538   return true;
 539 }
 540
 541 void GURL::Swap(GURL* other) {
 542   spec_.swap(other->spec_);
 543   std::swap(is_valid_, other->is_valid_);
 544   std::swap(parsed_, other->parsed_);
 545   inner_url_.swap(other->inner_url_);
 546 }
 547
 548 std::ostream& operator<<(std::ostream& out, const GURL& url) {
 549   return out << url.possibly_invalid_spec();
 550 }