[patch 6 of 6] CrossSiteDocumentClassifier bug fixes.
[chromium-blink-merge.git] / content / common / cross_site_document_classifier.cc
blobc57c2f403219e87e669a5ca09c36fe612d7b744f
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/common/cross_site_document_classifier.h"
7 #include "base/basictypes.h"
8 #include "base/command_line.h"
9 #include "base/lazy_instance.h"
10 #include "base/logging.h"
11 #include "base/metrics/histogram.h"
12 #include "base/strings/string_util.h"
13 #include "content/public/common/content_switches.h"
14 #include "content/public/common/resource_response_info.h"
15 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
16 #include "net/http/http_response_headers.h"
18 using base::StringPiece;
20 namespace content {
22 namespace {
// MIME type strings recognized by the classifier, grouped by the kind of
// document they denote.

// HTML documents.
const char kTextHtml[] = "text/html";

// Plain text documents.
const char kTextPlain[] = "text/plain";

// JSON documents.
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";

// XML documents.
const char kTextXml[] = "text/xml";
const char kAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
34 bool MatchesSignature(StringPiece data,
35 const StringPiece signatures[],
36 size_t arr_size) {
37 size_t offset = data.find_first_not_of(" \t\r\n");
38 // There is no not-whitespace character in this document.
39 if (offset == base::StringPiece::npos)
40 return false;
42 data.remove_prefix(offset);
43 size_t length = data.length();
45 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
46 const StringPiece& signature = signatures[sig_index];
47 size_t signature_length = signature.length();
48 if (length < signature_length)
49 continue;
51 if (base::LowerCaseEqualsASCII(
52 data.begin(), data.begin() + signature_length, signature.data()))
53 return true;
55 return false;
58 } // namespace
60 CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
61 const std::string& mime_type) {
62 if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
63 return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;
66 if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
67 return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;
70 if (base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
71 base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
72 base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
73 return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;
76 if (base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
77 base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
78 base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
79 return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;
82 return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
85 bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
86 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
87 // header which our policy depends on, so we cannot protect any
88 // document from FTP servers.
89 return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
92 bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
93 const GURL& response_url) {
94 if (!frame_origin.is_valid() || !response_url.is_valid())
95 return false;
97 if (frame_origin.scheme() != response_url.scheme())
98 return false;
100 // SameDomainOrHost() extracts the effective domains (public suffix plus one)
101 // from the two URLs and compare them.
102 return net::registry_controlled_domains::SameDomainOrHost(
103 frame_origin, response_url,
104 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
107 // We don't use Webkit's existing CORS policy implementation since
108 // their policy works in terms of origins, not sites. For example,
109 // when frame is sub.a.com and it is not allowed to access a document
110 // with sub1.a.com. But under Site Isolation, it's allowed.
111 bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
112 const GURL& frame_origin,
113 const GURL& website_origin,
114 const std::string& access_control_origin) {
115 // Many websites are sending back "\"*\"" instead of "*". This is
116 // non-standard practice, and not supported by Chrome. Refer to
117 // CrossOriginAccessControl::passesAccessControlCheck().
119 // TODO(dsjang): * is not allowed for the response from a request
120 // with cookies. This allows for more than what the renderer will
121 // eventually be able to receive, so we won't see illegal cross-site
122 // documents allowed by this. We have to find a way to see if this
123 // response is from a cookie-tagged request or not in the future.
124 if (access_control_origin == "*")
125 return true;
127 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
128 // "*", but many websites are using just a domain for access_control_origin,
129 // and this is blocked by Webkit's CORS logic here :
130 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
131 // is_valid() to false when it is created from a URL containing * in the
132 // domain part.
134 GURL cors_origin(access_control_origin);
135 return IsSameSite(frame_origin, cors_origin);
138 // This function is a slight modification of |net::SniffForHTML|.
139 bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
140 // The content sniffer used by Chrome and Firefox are using "<!--"
141 // as one of the HTML signatures, but it also appears in valid
142 // JavaScript, considered as well-formed JS by the browser. Since
143 // we do not want to block any JS, we exclude it from our HTML
144 // signatures. This can weaken our document block policy, but we can
145 // break less websites.
146 // TODO(dsjang): parameterize |net::SniffForHTML| with an option
147 // that decides whether to include <!-- or not, so that we can
148 // remove this function.
149 // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
150 // process, we should do single-thread checking here for the static
151 // initializer.
152 static const StringPiece kHtmlSignatures[] = {
153 StringPiece("<!doctype html"), // HTML5 spec
154 StringPiece("<script"), // HTML5 spec, Mozilla
155 StringPiece("<html"), // HTML5 spec, Mozilla
156 StringPiece("<head"), // HTML5 spec, Mozilla
157 StringPiece("<iframe"), // Mozilla
158 StringPiece("<h1"), // Mozilla
159 StringPiece("<div"), // Mozilla
160 StringPiece("<font"), // Mozilla
161 StringPiece("<table"), // Mozilla
162 StringPiece("<a"), // Mozilla
163 StringPiece("<style"), // Mozilla
164 StringPiece("<title"), // Mozilla
165 StringPiece("<b"), // Mozilla
166 StringPiece("<body"), // Mozilla
167 StringPiece("<br"), // Mozilla
168 StringPiece("<p") // Mozilla
171 while (data.length() > 0) {
172 if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
173 return true;
175 // If we cannot find "<!--", we fail sniffing this as HTML.
176 static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
177 if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
178 break;
180 // Search for --> and do SniffForHTML after that. If we can find the
181 // comment's end, we start HTML sniffing from there again.
182 static const char kEndComment[] = "-->";
183 size_t offset = data.find(kEndComment);
184 if (offset == base::StringPiece::npos)
185 break;
187 // Proceed to the index next to the ending comment (-->).
188 data.remove_prefix(offset + strlen(kEndComment));
191 return false;
194 bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
195 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
196 // this signature. However, XML is case-sensitive. Don't we have to
197 // be more lenient only to block documents starting with the exact
198 // string <?xml rather than <?XML ?
199 // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
200 // process, we should do single-thread checking here for the static
201 // initializer.
202 static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
203 return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
206 bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
207 // TODO(dsjang): We have to come up with a better way to sniff
208 // JSON. However, even RE cannot help us that much due to the fact
209 // that we don't do full parsing. This DFA starts with state 0, and
210 // finds {, "/' and : in that order. We're avoiding adding a
211 // dependency on a regular expression library.
212 enum {
213 kStartState,
214 kLeftBraceState,
215 kLeftQuoteState,
216 kColonState,
217 kTerminalState,
218 } state = kStartState;
220 size_t length = data.length();
221 for (size_t i = 0; i < length && state < kColonState; ++i) {
222 const char c = data[i];
223 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
224 continue;
226 switch (state) {
227 case kStartState:
228 if (c == '{')
229 state = kLeftBraceState;
230 else
231 state = kTerminalState;
232 break;
233 case kLeftBraceState:
234 if (c == '\"' || c == '\'')
235 state = kLeftQuoteState;
236 else
237 state = kTerminalState;
238 break;
239 case kLeftQuoteState:
240 if (c == ':')
241 state = kColonState;
242 break;
243 case kColonState:
244 case kTerminalState:
245 NOTREACHED();
246 break;
249 return state == kColonState;
252 } // namespace content