url/url_util.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "url/url_util.h"
   6
   7 #include <string.h>
   8 #include <vector>
   9
  10 #include "base/debug/leak_annotations.h"
  11 #include "base/logging.h"
  12 #include "base/strings/string_util.h"
  13 #include "url/url_canon_internal.h"
  14 #include "url/url_file.h"
  15 #include "url/url_util_internal.h"
  16
  17 namespace url {
  18
  19 namespace {
  20
  21 const int kNumStandardURLSchemes = 8;
  22 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
  23   kHttpScheme,
  24   kHttpsScheme,
  25   kFileScheme,  // Yes, file urls can have a hostname!
  26   kFtpScheme,
  27   kGopherScheme,
  28   kWsScheme,    // WebSocket.
  29   kWssScheme,   // WebSocket secure.
  30   kFileSystemScheme,
  31 };
  32
  33 // List of the currently installed standard schemes. This list is lazily
  34 // initialized by InitStandardSchemes and is leaked on shutdown to prevent
  35 // any destructors from being called that will slow us down or cause problems.
  36 std::vector<const char*>* standard_schemes = NULL;
  37
  38 // See the LockStandardSchemes declaration in the header.
  39 bool standard_schemes_locked = false;
  40
  41 // Ensures that the standard_schemes list is initialized, does nothing if it
  42 // already has values.
  43 void InitStandardSchemes() {
  44   if (standard_schemes)
  45     return;
  46   standard_schemes = new std::vector<const char*>;
  47   for (int i = 0; i < kNumStandardURLSchemes; i++)
  48     standard_schemes->push_back(kStandardURLSchemes[i]);
  49 }
  50
  51 // Given a string and a range inside the string, compares it to the given
  52 // lower-case |compare_to| buffer.
  53 template<typename CHAR>
  54 inline bool DoCompareSchemeComponent(const CHAR* spec,
  55                                      const Component& component,
  56                                      const char* compare_to) {
  57   if (!component.is_nonempty())
  58     return compare_to[0] == 0;  // When component is empty, match empty scheme.
  59   return base::LowerCaseEqualsASCII(&spec[component.begin],
  60                                     &spec[component.end()],
  61                                     compare_to);
  62 }
  63
  64 // Returns true if the given scheme identified by |scheme| within |spec| is one
  65 // of the registered "standard" schemes.
  66 template<typename CHAR>
  67 bool DoIsStandard(const CHAR* spec, const Component& scheme) {
  68   if (!scheme.is_nonempty())
  69     return false;  // Empty or invalid schemes are non-standard.
  70
  71   InitStandardSchemes();
  72   for (size_t i = 0; i < standard_schemes->size(); i++) {
  73     if (base::LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
  74                                    standard_schemes->at(i)))
  75       return true;
  76   }
  77   return false;
  78 }
  79
  80 template<typename CHAR>
  81 bool DoFindAndCompareScheme(const CHAR* str,
  82                             int str_len,
  83                             const char* compare,
  84                             Component* found_scheme) {
  85   // Before extracting scheme, canonicalize the URL to remove any whitespace.
  86   // This matches the canonicalization done in DoCanonicalize function.
  87   RawCanonOutputT<CHAR> whitespace_buffer;
  88   int spec_len;
  89   const CHAR* spec = RemoveURLWhitespace(str, str_len,
  90                                          &whitespace_buffer, &spec_len);
  91
  92   Component our_scheme;
  93   if (!ExtractScheme(spec, spec_len, &our_scheme)) {
  94     // No scheme.
  95     if (found_scheme)
  96       *found_scheme = Component();
  97     return false;
  98   }
  99   if (found_scheme)
 100     *found_scheme = our_scheme;
 101   return DoCompareSchemeComponent(spec, our_scheme, compare);
 102 }
 103
 104 template<typename CHAR>
 105 bool DoCanonicalize(const CHAR* in_spec,
 106                     int in_spec_len,
 107                     bool trim_path_end,
 108                     CharsetConverter* charset_converter,
 109                     CanonOutput* output,
 110                     Parsed* output_parsed) {
 111   // Remove any whitespace from the middle of the relative URL, possibly
 112   // copying to the new buffer.
 113   RawCanonOutputT<CHAR> whitespace_buffer;
 114   int spec_len;
 115   const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
 116                                          &whitespace_buffer, &spec_len);
 117
 118   Parsed parsed_input;
 119 #ifdef WIN32
 120   // For Windows, we allow things that look like absolute Windows paths to be
 121   // fixed up magically to file URLs. This is done for IE compatability. For
 122   // example, this will change "c:/foo" into a file URL rather than treating
 123   // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
 124   // There is similar logic in url_canon_relative.cc for
 125   //
 126   // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
 127   // has no meaning as an absolute path name. This is because browsers on Mac
 128   // & Unix don't generally do this, so there is no compatibility reason for
 129   // doing so.
 130   if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
 131       DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
 132     ParseFileURL(spec, spec_len, &parsed_input);
 133     return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter,
 134                                output, output_parsed);
 135   }
 136 #endif
 137
 138   Component scheme;
 139   if (!ExtractScheme(spec, spec_len, &scheme))
 140     return false;
 141
 142   // This is the parsed version of the input URL, we have to canonicalize it
 143   // before storing it in our object.
 144   bool success;
 145   if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
 146     // File URLs are special.
 147     ParseFileURL(spec, spec_len, &parsed_input);
 148     success = CanonicalizeFileURL(spec, spec_len, parsed_input,
 149                                   charset_converter, output, output_parsed);
 150   } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
 151     // Filesystem URLs are special.
 152     ParseFileSystemURL(spec, spec_len, &parsed_input);
 153     success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input,
 154                                         charset_converter, output,
 155                                         output_parsed);
 156
 157   } else if (DoIsStandard(spec, scheme)) {
 158     // All "normal" URLs.
 159     ParseStandardURL(spec, spec_len, &parsed_input);
 160     success = CanonicalizeStandardURL(spec, spec_len, parsed_input,
 161                                       charset_converter, output, output_parsed);
 162
 163   } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
 164     // Mailto are treated like a standard url with only a scheme, path, query
 165     ParseMailtoURL(spec, spec_len, &parsed_input);
 166     success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
 167                                     output_parsed);
 168
 169   } else {
 170     // "Weird" URLs like data: and javascript:
 171     ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
 172     success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
 173                                   output_parsed);
 174   }
 175   return success;
 176 }
 177
 178 template<typename CHAR>
 179 bool DoResolveRelative(const char* base_spec,
 180                        int base_spec_len,
 181                        const Parsed& base_parsed,
 182                        const CHAR* in_relative,
 183                        int in_relative_length,
 184                        CharsetConverter* charset_converter,
 185                        CanonOutput* output,
 186                        Parsed* output_parsed) {
 187   // Remove any whitespace from the middle of the relative URL, possibly
 188   // copying to the new buffer.
 189   RawCanonOutputT<CHAR> whitespace_buffer;
 190   int relative_length;
 191   const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
 192                                              &whitespace_buffer,
 193                                              &relative_length);
 194   bool base_is_authority_based = false;
 195   bool base_is_hierarchical = false;
 196   if (base_spec &&
 197       base_parsed.scheme.is_nonempty()) {
 198     int after_scheme = base_parsed.scheme.end() + 1;  // Skip past the colon.
 199     int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
 200                                               base_spec_len);
 201     base_is_authority_based = num_slashes > 1;
 202     base_is_hierarchical = num_slashes > 0;
 203   }
 204
 205   bool standard_base_scheme =
 206       base_parsed.scheme.is_nonempty() &&
 207       DoIsStandard(base_spec, base_parsed.scheme);
 208
 209   bool is_relative;
 210   Component relative_component;
 211   if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
 212                      (base_is_hierarchical || standard_base_scheme),
 213                      &is_relative, &relative_component)) {
 214     // Error resolving.
 215     return false;
 216   }
 217
 218   // Pretend for a moment that |base_spec| is a standard URL. Normally
 219   // non-standard URLs are treated as PathURLs, but if the base has an
 220   // authority we would like to preserve it.
 221   if (is_relative && base_is_authority_based && !standard_base_scheme) {
 222     Parsed base_parsed_authority;
 223     ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority);
 224     if (base_parsed_authority.host.is_nonempty()) {
 225       RawCanonOutputT<char> temporary_output;
 226       bool did_resolve_succeed =
 227           ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
 228                              relative_component, charset_converter,
 229                              &temporary_output, output_parsed);
 230       // The output_parsed is incorrect at this point (because it was built
 231       // based on base_parsed_authority instead of base_parsed) and needs to be
 232       // re-created.
 233       DoCanonicalize(temporary_output.data(), temporary_output.length(), true,
 234                      charset_converter, output, output_parsed);
 235       return did_resolve_succeed;
 236     }
 237   } else if (is_relative) {
 238     // Relative, resolve and canonicalize.
 239     bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
 240         DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
 241     return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative,
 242                               relative_component, charset_converter, output,
 243                               output_parsed);
 244   }
 245
 246   // Not relative, canonicalize the input.
 247   return DoCanonicalize(relative, relative_length, true, charset_converter,
 248                         output, output_parsed);
 249 }
 250
 251 template<typename CHAR>
 252 bool DoReplaceComponents(const char* spec,
 253                          int spec_len,
 254                          const Parsed& parsed,
 255                          const Replacements<CHAR>& replacements,
 256                          CharsetConverter* charset_converter,
 257                          CanonOutput* output,
 258                          Parsed* out_parsed) {
 259   // If the scheme is overridden, just do a simple string substitution and
 260   // reparse the whole thing. There are lots of edge cases that we really don't
 261   // want to deal with. Like what happens if I replace "http://e:8080/foo"
 262   // with a file. Does it become "file:///E:/8080/foo" where the port number
 263   // becomes part of the path? Parsing that string as a file URL says "yes"
 264   // but almost no sane rule for dealing with the components individually would
 265   // come up with that.
 266   //
 267   // Why allow these crazy cases at all? Programatically, there is almost no
 268   // case for replacing the scheme. The most common case for hitting this is
 269   // in JS when building up a URL using the location object. In this case, the
 270   // JS code expects the string substitution behavior:
 271   //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
 272   if (replacements.IsSchemeOverridden()) {
 273     // Canonicalize the new scheme so it is 8-bit and can be concatenated with
 274     // the existing spec.
 275     RawCanonOutput<128> scheme_replaced;
 276     Component scheme_replaced_parsed;
 277     CanonicalizeScheme(replacements.sources().scheme,
 278                        replacements.components().scheme,
 279                        &scheme_replaced, &scheme_replaced_parsed);
 280
 281     // We can assume that the input is canonicalized, which means it always has
 282     // a colon after the scheme (or where the scheme would be).
 283     int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
 284                                                     : 1;
 285     if (spec_len - spec_after_colon > 0) {
 286       scheme_replaced.Append(&spec[spec_after_colon],
 287                              spec_len - spec_after_colon);
 288     }
 289
 290     // We now need to completely re-parse the resulting string since its meaning
 291     // may have changed with the different scheme.
 292     RawCanonOutput<128> recanonicalized;
 293     Parsed recanonicalized_parsed;
 294     DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
 295                    charset_converter,
 296                    &recanonicalized, &recanonicalized_parsed);
 297
 298     // Recurse using the version with the scheme already replaced. This will now
 299     // use the replacement rules for the new scheme.
 300     //
 301     // Warning: this code assumes that ReplaceComponents will re-check all
 302     // components for validity. This is because we can't fail if DoCanonicalize
 303     // failed above since theoretically the thing making it fail could be
 304     // getting replaced here. If ReplaceComponents didn't re-check everything,
 305     // we wouldn't know if something *not* getting replaced is a problem.
 306     // If the scheme-specific replacers are made more intelligent so they don't
 307     // re-check everything, we should instead recanonicalize the whole thing
 308     // after this call to check validity (this assumes replacing the scheme is
 309     // much much less common than other types of replacements, like clearing the
 310     // ref).
 311     Replacements<CHAR> replacements_no_scheme = replacements;
 312     replacements_no_scheme.SetScheme(NULL, Component());
 313     return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
 314                                recanonicalized_parsed, replacements_no_scheme,
 315                                charset_converter, output, out_parsed);
 316   }
 317
 318   // If we get here, then we know the scheme doesn't need to be replaced, so can
 319   // just key off the scheme in the spec to know how to do the replacements.
 320   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
 321     return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
 322                           out_parsed);
 323   }
 324   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
 325     return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
 326                                 output, out_parsed);
 327   }
 328   if (DoIsStandard(spec, parsed.scheme)) {
 329     return ReplaceStandardURL(spec, parsed, replacements, charset_converter,
 330                               output, out_parsed);
 331   }
 332   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
 333     return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
 334   }
 335
 336   // Default is a path URL.
 337   return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
 338 }
 339
 340 }  // namespace
 341
 342 void Initialize() {
 343   InitStandardSchemes();
 344 }
 345
 346 void Shutdown() {
 347   if (standard_schemes) {
 348     delete standard_schemes;
 349     standard_schemes = NULL;
 350   }
 351 }
 352
 353 void AddStandardScheme(const char* new_scheme) {
 354   // If this assert triggers, it means you've called AddStandardScheme after
 355   // LockStandardSchemes have been called (see the header file for
 356   // LockStandardSchemes for more).
 357   //
 358   // This normally means you're trying to set up a new standard scheme too late
 359   // in your application's init process. Locate where your app does this
 360   // initialization and calls LockStandardScheme, and add your new standard
 361   // scheme there.
 362   DCHECK(!standard_schemes_locked) <<
 363       "Trying to add a standard scheme after the list has been locked.";
 364
 365   size_t scheme_len = strlen(new_scheme);
 366   if (scheme_len == 0)
 367     return;
 368
 369   // Dulicate the scheme into a new buffer and add it to the list of standard
 370   // schemes. This pointer will be leaked on shutdown.
 371   char* dup_scheme = new char[scheme_len + 1];
 372   ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme);
 373   memcpy(dup_scheme, new_scheme, scheme_len + 1);
 374
 375   InitStandardSchemes();
 376   standard_schemes->push_back(dup_scheme);
 377 }
 378
 379 void LockStandardSchemes() {
 380   standard_schemes_locked = true;
 381 }
 382
 383 bool IsStandard(const char* spec, const Component& scheme) {
 384   return DoIsStandard(spec, scheme);
 385 }
 386
 387 bool IsStandard(const base::char16* spec, const Component& scheme) {
 388   return DoIsStandard(spec, scheme);
 389 }
 390
 391 bool FindAndCompareScheme(const char* str,
 392                           int str_len,
 393                           const char* compare,
 394                           Component* found_scheme) {
 395   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
 396 }
 397
 398 bool FindAndCompareScheme(const base::char16* str,
 399                           int str_len,
 400                           const char* compare,
 401                           Component* found_scheme) {
 402   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
 403 }
 404
 405 bool Canonicalize(const char* spec,
 406                   int spec_len,
 407                   bool trim_path_end,
 408                   CharsetConverter* charset_converter,
 409                   CanonOutput* output,
 410                   Parsed* output_parsed) {
 411   return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
 412                         output, output_parsed);
 413 }
 414
 415 bool Canonicalize(const base::char16* spec,
 416                   int spec_len,
 417                   bool trim_path_end,
 418                   CharsetConverter* charset_converter,
 419                   CanonOutput* output,
 420                   Parsed* output_parsed) {
 421   return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
 422                         output, output_parsed);
 423 }
 424
 425 bool ResolveRelative(const char* base_spec,
 426                      int base_spec_len,
 427                      const Parsed& base_parsed,
 428                      const char* relative,
 429                      int relative_length,
 430                      CharsetConverter* charset_converter,
 431                      CanonOutput* output,
 432                      Parsed* output_parsed) {
 433   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
 434                            relative, relative_length,
 435                            charset_converter, output, output_parsed);
 436 }
 437
 438 bool ResolveRelative(const char* base_spec,
 439                      int base_spec_len,
 440                      const Parsed& base_parsed,
 441                      const base::char16* relative,
 442                      int relative_length,
 443                      CharsetConverter* charset_converter,
 444                      CanonOutput* output,
 445                      Parsed* output_parsed) {
 446   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
 447                            relative, relative_length,
 448                            charset_converter, output, output_parsed);
 449 }
 450
 451 bool ReplaceComponents(const char* spec,
 452                        int spec_len,
 453                        const Parsed& parsed,
 454                        const Replacements<char>& replacements,
 455                        CharsetConverter* charset_converter,
 456                        CanonOutput* output,
 457                        Parsed* out_parsed) {
 458   return DoReplaceComponents(spec, spec_len, parsed, replacements,
 459                              charset_converter, output, out_parsed);
 460 }
 461
 462 bool ReplaceComponents(const char* spec,
 463                        int spec_len,
 464                        const Parsed& parsed,
 465                        const Replacements<base::char16>& replacements,
 466                        CharsetConverter* charset_converter,
 467                        CanonOutput* output,
 468                        Parsed* out_parsed) {
 469   return DoReplaceComponents(spec, spec_len, parsed, replacements,
 470                              charset_converter, output, out_parsed);
 471 }
 472
 473 void DecodeURLEscapeSequences(const char* input,
 474                               int length,
 475                               CanonOutputW* output) {
 476   RawCanonOutputT<char> unescaped_chars;
 477   for (int i = 0; i < length; i++) {
 478     if (input[i] == '%') {
 479       unsigned char ch;
 480       if (DecodeEscaped(input, &i, length, &ch)) {
 481         unescaped_chars.push_back(ch);
 482       } else {
 483         // Invalid escape sequence, copy the percent literal.
 484         unescaped_chars.push_back('%');
 485       }
 486     } else {
 487       // Regular non-escaped 8-bit character.
 488       unescaped_chars.push_back(input[i]);
 489     }
 490   }
 491
 492   // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
 493   // JavaScript URLs, but Firefox and Safari do.
 494   for (int i = 0; i < unescaped_chars.length(); i++) {
 495     unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
 496     if (uch < 0x80) {
 497       // Non-UTF-8, just append directly
 498       output->push_back(uch);
 499     } else {
 500       // next_ch will point to the last character of the decoded
 501       // character.
 502       int next_character = i;
 503       unsigned code_point;
 504       if (ReadUTFChar(unescaped_chars.data(), &next_character,
 505                       unescaped_chars.length(), &code_point)) {
 506         // Valid UTF-8 character, convert to UTF-16.
 507         AppendUTF16Value(code_point, output);
 508         i = next_character;
 509       } else {
 510         // If there are any sequences that are not valid UTF-8, we keep
 511         // invalid code points and promote to UTF-16. We copy all characters
 512         // from the current position to the end of the identified sequence.
 513         while (i < next_character) {
 514           output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
 515           i++;
 516         }
 517         output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
 518       }
 519     }
 520   }
 521 }
 522
 523 void EncodeURIComponent(const char* input, int length, CanonOutput* output) {
 524   for (int i = 0; i < length; ++i) {
 525     unsigned char c = static_cast<unsigned char>(input[i]);
 526     if (IsComponentChar(c))
 527       output->push_back(c);
 528     else
 529       AppendEscapedChar(c, output);
 530   }
 531 }
 532
 533 bool CompareSchemeComponent(const char* spec,
 534                             const Component& component,
 535                             const char* compare_to) {
 536   return DoCompareSchemeComponent(spec, component, compare_to);
 537 }
 538
 539 bool CompareSchemeComponent(const base::char16* spec,
 540                             const Component& component,
 541                             const char* compare_to) {
 542   return DoCompareSchemeComponent(spec, component, compare_to);
 543 }
 544
 545 }  // namespace url