content/common/cross_site_document_classifier.cc

   1 // Copyright 2015 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "content/common/cross_site_document_classifier.h"
   6
   7 #include "base/basictypes.h"
   8 #include "base/command_line.h"
   9 #include "base/lazy_instance.h"
  10 #include "base/logging.h"
  11 #include "base/metrics/histogram.h"
  12 #include "base/strings/string_util.h"
  13 #include "content/public/common/content_switches.h"
  14 #include "content/public/common/resource_response_info.h"
  15 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
  16 #include "net/http/http_response_headers.h"
  17
  18 using base::StringPiece;
  19
  20 namespace content {
  21
  22 namespace {
  23
  // MIME type strings compared against by GetCanonicalMimeType(), grouped by
  // the canonical category each one is mapped to.

  // HTML documents.
  const char kTextHtml[] = "text/html";

  // Plain text documents.
  const char kTextPlain[] = "text/plain";

  // JSON documents.
  const char kAppJson[] = "application/json";
  const char kTextJson[] = "text/json";
  const char kTextXjson[] = "text/x-json";

  // XML documents.
  const char kTextXml[] = "text/xml";
  const char kAppRssXml[] = "application/rss+xml";
  const char kAppXml[] = "application/xml";
  33
  34 bool MatchesSignature(StringPiece data,
  35                       const StringPiece signatures[],
  36                       size_t arr_size) {
  37   size_t offset = data.find_first_not_of(" \t\r\n");
  38   // There is no not-whitespace character in this document.
  39   if (offset == base::StringPiece::npos)
  40     return false;
  41
  42   data.remove_prefix(offset);
  43   size_t length = data.length();
  44
  45   for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
  46     const StringPiece& signature = signatures[sig_index];
  47     size_t signature_length = signature.length();
  48     if (length < signature_length)
  49       continue;
  50
  51     if (base::LowerCaseEqualsASCII(
  52             data.begin(), data.begin() + signature_length, signature.data()))
  53       return true;
  54   }
  55   return false;
  56 }
  57
  58 }  // namespace
  59
  60 CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
  61     const std::string& mime_type) {
  62   if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
  63     return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;
  64   }
  65
  66   if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
  67     return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;
  68   }
  69
  70   if (base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
  71       base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
  72       base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
  73     return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;
  74   }
  75
  76   if (base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
  77       base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
  78       base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
  79     return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;
  80   }
  81
  82   return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
  83 }
  84
  85 bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
  86   // We exclude ftp:// from here. FTP doesn't provide a Content-Type
  87   // header which our policy depends on, so we cannot protect any
  88   // document from FTP servers.
  89   return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
  90 }
  91
  92 bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
  93                                              const GURL& response_url) {
  94   if (!frame_origin.is_valid() || !response_url.is_valid())
  95     return false;
  96
  97   if (frame_origin.scheme() != response_url.scheme())
  98     return false;
  99
 100   // SameDomainOrHost() extracts the effective domains (public suffix plus one)
 101   // from the two URLs and compare them.
 102   return net::registry_controlled_domains::SameDomainOrHost(
 103       frame_origin, response_url,
 104       net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
 105 }
 106
 107 // We don't use Webkit's existing CORS policy implementation since
 108 // their policy works in terms of origins, not sites. For example,
 109 // when frame is sub.a.com and it is not allowed to access a document
 110 // with sub1.a.com. But under Site Isolation, it's allowed.
 111 bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
 112     const GURL& frame_origin,
 113     const GURL& website_origin,
 114     const std::string& access_control_origin) {
 115   // Many websites are sending back "\"*\"" instead of "*". This is
 116   // non-standard practice, and not supported by Chrome. Refer to
 117   // CrossOriginAccessControl::passesAccessControlCheck().
 118
 119   // TODO(dsjang): * is not allowed for the response from a request
 120   // with cookies. This allows for more than what the renderer will
 121   // eventually be able to receive, so we won't see illegal cross-site
 122   // documents allowed by this. We have to find a way to see if this
 123   // response is from a cookie-tagged request or not in the future.
 124   if (access_control_origin == "*")
 125     return true;
 126
 127   // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
 128   // "*", but many websites are using just a domain for access_control_origin,
 129   // and this is blocked by Webkit's CORS logic here :
 130   // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
 131   // is_valid() to false when it is created from a URL containing * in the
 132   // domain part.
 133
 134   GURL cors_origin(access_control_origin);
 135   return IsSameSite(frame_origin, cors_origin);
 136 }
 137
 138 // This function is a slight modification of |net::SniffForHTML|.
 139 bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
 140   // The content sniffer used by Chrome and Firefox are using "<!--"
 141   // as one of the HTML signatures, but it also appears in valid
 142   // JavaScript, considered as well-formed JS by the browser.  Since
 143   // we do not want to block any JS, we exclude it from our HTML
 144   // signatures. This can weaken our document block policy, but we can
 145   // break less websites.
 146   // TODO(dsjang): parameterize |net::SniffForHTML| with an option
 147   // that decides whether to include <!-- or not, so that we can
 148   // remove this function.
 149   // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
 150   // process, we should do single-thread checking here for the static
 151   // initializer.
 152   static const StringPiece kHtmlSignatures[] = {
 153       StringPiece("<!doctype html"),  // HTML5 spec
 154       StringPiece("<script"),         // HTML5 spec, Mozilla
 155       StringPiece("<html"),           // HTML5 spec, Mozilla
 156       StringPiece("<head"),           // HTML5 spec, Mozilla
 157       StringPiece("<iframe"),         // Mozilla
 158       StringPiece("<h1"),             // Mozilla
 159       StringPiece("<div"),            // Mozilla
 160       StringPiece("<font"),           // Mozilla
 161       StringPiece("<table"),          // Mozilla
 162       StringPiece("<a"),              // Mozilla
 163       StringPiece("<style"),          // Mozilla
 164       StringPiece("<title"),          // Mozilla
 165       StringPiece("<b"),              // Mozilla
 166       StringPiece("<body"),           // Mozilla
 167       StringPiece("<br"),             // Mozilla
 168       StringPiece("<p")               // Mozilla
 169   };
 170
 171   while (data.length() > 0) {
 172     if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
 173       return true;
 174
 175     // If we cannot find "<!--", we fail sniffing this as HTML.
 176     static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
 177     if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
 178       break;
 179
 180     // Search for --> and do SniffForHTML after that. If we can find the
 181     // comment's end, we start HTML sniffing from there again.
 182     static const char kEndComment[] = "-->";
 183     size_t offset = data.find(kEndComment);
 184     if (offset == base::StringPiece::npos)
 185       break;
 186
 187     // Proceed to the index next to the ending comment (-->).
 188     data.remove_prefix(offset + strlen(kEndComment));
 189   }
 190
 191   return false;
 192 }
 193
 194 bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
 195   // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
 196   // this signature. However, XML is case-sensitive. Don't we have to
 197   // be more lenient only to block documents starting with the exact
 198   // string <?xml rather than <?XML ?
 199   // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
 200   // process, we should do single-thread checking here for the static
 201   // initializer.
 202   static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
 203   return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
 204 }
 205
 206 bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
 207   // TODO(dsjang): We have to come up with a better way to sniff
 208   // JSON. However, even RE cannot help us that much due to the fact
 209   // that we don't do full parsing.  This DFA starts with state 0, and
 210   // finds {, "/' and : in that order. We're avoiding adding a
 211   // dependency on a regular expression library.
 212   enum {
 213     kStartState,
 214     kLeftBraceState,
 215     kLeftQuoteState,
 216     kColonState,
 217     kTerminalState,
 218   } state = kStartState;
 219
 220   size_t length = data.length();
 221   for (size_t i = 0; i < length && state < kColonState; ++i) {
 222     const char c = data[i];
 223     if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
 224       continue;
 225
 226     switch (state) {
 227       case kStartState:
 228         if (c == '{')
 229           state = kLeftBraceState;
 230         else
 231           state = kTerminalState;
 232         break;
 233       case kLeftBraceState:
 234         if (c == '\"' || c == '\'')
 235           state = kLeftQuoteState;
 236         else
 237           state = kTerminalState;
 238         break;
 239       case kLeftQuoteState:
 240         if (c == ':')
 241           state = kColonState;
 242         break;
 243       case kColonState:
 244       case kTerminalState:
 245         NOTREACHED();
 246         break;
 247     }
 248   }
 249   return state == kColonState;
 250 }
 251
 252 }  // namespace content