net/base/mime_sniffer.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Detecting mime types is a tricky business because we need to balance
   6 // compatibility concerns with security issues.  Here is a survey of how other
   7 // browsers behave and then a description of how we intend to behave.
   8 //
   9 // HTML payload, no Content-Type header:
  10 // * IE 7: Render as HTML
  11 // * Firefox 2: Render as HTML
  12 // * Safari 3: Render as HTML
  13 // * Opera 9: Render as HTML
  14 //
  15 // Here the choice seems clear:
  16 // => Chrome: Render as HTML
  17 //
  18 // HTML payload, Content-Type: "text/plain":
  19 // * IE 7: Render as HTML
  20 // * Firefox 2: Render as text
  21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
  22 //                                   has an HTML extension)
  23 // * Opera 9: Render as text
  24 //
  25 // Here we choose to follow the majority (and break some compatibility with IE).
  26 // Many folks dislike IE's behavior here.
  27 // => Chrome: Render as text
  28 // We generalize this as follows.  If the Content-Type header is text/plain
  29 // we won't detect dangerous mime types (those that can execute script).
  30 //
  31 // HTML payload, Content-Type: "application/octet-stream":
  32 // * IE 7: Render as HTML
  33 // * Firefox 2: Download as application/octet-stream
  34 // * Safari 3: Render as HTML
  35 // * Opera 9: Render as HTML
  36 //
  37 // We follow Firefox.
  38 // => Chrome: Download as application/octet-stream
  39 // One factor in this decision is that IIS 4 and 5 will send
  40 // application/octet-stream for .xhtml files (because they don't recognize
  41 // the extension).  We did some experiments and it looks like this doesn't occur
  42 // very often on the web.  We choose the more secure option.
  43 //
  44 // GIF payload, no Content-Type header:
  45 // * IE 7: Render as GIF
  46 // * Firefox 2: Render as GIF
  47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
   48 //                                        URL has a GIF extension)
  49 // * Opera 9: Render as GIF
  50 //
  51 // The choice is clear.
  52 // => Chrome: Render as GIF
  53 // Once we decide to render HTML without a Content-Type header, there isn't much
  54 // reason not to render GIFs.
  55 //
  56 // GIF payload, Content-Type: "text/plain":
  57 // * IE 7: Render as GIF
   58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
   59 //                              Download as GIF if the URL has a GIF extension)
   60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
   61 //                                        URL has a GIF extension)
  62 // * Opera 9: Render as GIF
  63 //
  64 // Displaying as text/plain makes little sense as the content will look like
  65 // gibberish.  Here, we could change our minds and download.
  66 // => Chrome: Render as GIF
  67 //
  68 // GIF payload, Content-Type: "application/octet-stream":
  69 // * IE 7: Render as GIF
   70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
   71 //                              Download as GIF if the URL has a GIF extension)
   72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
   73 //                                        URL has a GIF extension)
  74 // * Opera 9: Render as GIF
  75 //
  76 // We used to render as GIF here, but the problem is that some sites want to
  77 // trigger downloads by sending application/octet-stream (even though they
  78 // should be sending Content-Disposition: attachment).  Although it is safe
  79 // to render as GIF from a security perspective, we actually get better
   80 // compatibility if we don't sniff from application/octet-stream at all.
  81 // => Chrome: Download as application/octet-stream
  82 //
  83 // XHTML payload, Content-Type: "text/xml":
  84 // * IE 7: Render as XML
  85 // * Firefox 2: Render as HTML
  86 // * Safari 3: Render as HTML
  87 // * Opera 9: Render as HTML
  88 // The layout tests rely on us rendering this as HTML.
  89 // But we're conservative in XHTML detection, as this runs afoul of the
  90 // "don't detect dangerous mime types" rule.
  91 //
  92 // Note that our definition of HTML payload is much stricter than IE's
  93 // definition and roughly the same as Firefox's definition.
  94
  95 #include <string>
  96
  97 #include "net/base/mime_sniffer.h"
  98
  99 #include "base/basictypes.h"
 100 #include "base/logging.h"
 101 #include "base/metrics/histogram.h"
 102 #include "base/string_util.h"
 103 #include "googleurl/src/gurl.h"
 104 #include "net/base/mime_util.h"
 105
 106 namespace net {
 107
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
// MatchMagicNumber() DCHECKs every table entry's length against this value;
// the longest entry at present is the 42-byte XHTML string in kMagicXML.
static const size_t kBytesRequiredForMagic = 42;
 111
// One entry in a sniffing table: a byte pattern and the mime type reported
// when that pattern matches the start of the content.
struct MagicNumber {
  // Mime type copied into the sniffing result when this entry matches.
  const char* mime_type;
  // Pattern compared against the beginning of the content.
  const char* magic;
  // Length of |magic| in bytes, not counting the literal's terminating NUL.
  size_t magic_len;
  // True: compared case-insensitively as a string.  False: compared byte by
  // byte with MagicCmp(), where '.' in |magic| matches any byte.
  bool is_string;
};
 118
// Expands to a MagicNumber initializer for a binary (byte-compared) pattern.
// |magic| must be a string literal so that sizeof yields its length; the
// trailing comma lets table entries be listed without separators.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, false },

// Magic strings are case insensitive and must not include '\0' characters
// Expands to a MagicNumber initializer with is_string set; otherwise identical
// to MAGIC_NUMBER.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, true },
 125
// Magic numbers that SniffForMagicNumbers() checks against the very start of
// the content.  Entries are tried in order and the first match wins; the index
// of the matching entry is what CheckForMagicNumbers() records in its
// histogram.  All entries are MAGIC_NUMBER patterns, so they are compared by
// MagicCmp(), where a '.' in the pattern matches any single byte.
static const MagicNumber kMagicNumbers[] = {
  // Source: HTML 5 specification
  MAGIC_NUMBER("application/pdf", "%PDF-")
  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
  MAGIC_NUMBER("image/gif", "GIF87a")
  MAGIC_NUMBER("image/gif", "GIF89a")
  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
  MAGIC_NUMBER("image/bmp", "BM")
  // Source: Mozilla
  MAGIC_NUMBER("text/plain", "#!")  // Script
  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
  MAGIC_NUMBER("text/plain", "From")
  MAGIC_NUMBER("text/plain", ">From")
  // Chrome specific
  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
  // NOTE(review): "\x2E" is '.', which MagicCmp() treats as a wildcard, so
  // this entry matches any first byte followed by "RMF" -- confirm whether
  // that is intended.
  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
  MAGIC_NUMBER("video/x-ms-asf",
      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
  MAGIC_NUMBER("image/tiff", "I I")
  MAGIC_NUMBER("image/tiff", "II*")
  MAGIC_NUMBER("image/tiff", "MM\x00*")
  MAGIC_NUMBER("audio/mpeg", "ID3")
  // The four '.' characters are MagicCmp() wildcards: bytes 4-7 of the content
  // may hold any value.
  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
  // TODO(abarth): we don't handle partial byte matches yet
  // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
  // MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
  MAGIC_NUMBER("application/zip", "PK\x03\x04")
  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
  // Sniffing for Flash:
  //
  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  //
  // Including these magic number for Flash is a trade off.
  //
  // Pros:
  //   * Flash is an important and popular file format
  //
  // Cons:
  //   * These patterns are fairly weak
  //   * If we mistakenly decide something is Flash, we will execute it
  //     in the origin of an unsuspecting site.  This could be a security
  //     vulnerability if the site allows users to upload content.
  //
  // On balance, we do not include these patterns.
};
 178
 179 // Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
 180 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
 181 // HTML, but we will not.
 182
// Expands to a case-insensitive "text/html" entry whose pattern is "<"
// followed by |tag|.  |tag| must be a string literal because it is joined to
// "<" by adjacent-literal concatenation.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
 185
// Patterns that SniffForHTML() tries once leading ASCII whitespace has been
// skipped.  Entries are tried in order, the first match wins, and the index of
// the matching entry is recorded in a histogram by CheckForMagicNumbers().
// Matching is prefix-only: nothing checks the character after the tag name.
// NOTE(review): as a consequence "<b" also matches content starting with
// "<body" or "<br", so those two later entries appear to be unreachable (the
// resulting mime type is the same either way) -- confirm.
static const MagicNumber kSniffableTags[] = {
  // XML processing directive.  Although this is not an HTML mime type, we sniff
  // for this in the HTML phase because text/xml is just as powerful as HTML and
  // we want to leverage our white space skipping technology.
  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
  // DOCTYPEs
  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
  // Sniffable tags, ordered by how often they occur in sniffable documents.
  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("!--")
  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("iframe")  // Mozilla
  MAGIC_HTML_TAG("h1")  // Mozilla
  MAGIC_HTML_TAG("div")  // Mozilla
  MAGIC_HTML_TAG("font")  // Mozilla
  MAGIC_HTML_TAG("table")  // Mozilla
  MAGIC_HTML_TAG("a")  // Mozilla
  MAGIC_HTML_TAG("style")  // Mozilla
  MAGIC_HTML_TAG("title")  // Mozilla
  MAGIC_HTML_TAG("b")  // Mozilla
  MAGIC_HTML_TAG("body")  // Mozilla
  MAGIC_HTML_TAG("br")
  MAGIC_HTML_TAG("p")  // Mozilla
};
 211
 212 static base::Histogram* UMASnifferHistogramGet(const char* name,
 213                                                int array_size) {
 214   base::Histogram* counter =
 215       base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
 216       base::Histogram::kUmaTargetedHistogramFlag);
 217   return counter;
 218 }
 219
 220 // Compare content header to a magic number where magic_entry can contain '.'
 221 // for single character of anything, allowing some bytes to be skipped.
 222 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
 223   while (len) {
 224     if ((*magic_entry != '.') && (*magic_entry != *content))
 225       return false;
 226     ++magic_entry;
 227     ++content;
 228     --len;
 229   }
 230   return true;
 231 }
 232
 233 static bool MatchMagicNumber(const char* content, size_t size,
 234                              const MagicNumber* magic_entry,
 235                              std::string* result) {
 236   const size_t len = magic_entry->magic_len;
 237
 238   // Keep kBytesRequiredForMagic honest.
 239   DCHECK_LE(len, kBytesRequiredForMagic);
 240
 241   // To compare with magic strings, we need to compute strlen(content), but
 242   // content might not actually have a null terminator.  In that case, we
 243   // pretend the length is content_size.
 244   const char* end =
 245       static_cast<const char*>(memchr(content, '\0', size));
 246   const size_t content_strlen =
 247       (end != NULL) ? static_cast<size_t>(end - content) : size;
 248
 249   bool match = false;
 250   if (magic_entry->is_string) {
 251     if (content_strlen >= len) {
 252       // String comparisons are case-insensitive
 253       match = (base::strncasecmp(magic_entry->magic, content, len) == 0);
 254     }
 255   } else {
 256     if (size >= len)
 257       match = MagicCmp(magic_entry->magic, content, len);
 258   }
 259
 260   if (match) {
 261     result->assign(magic_entry->mime_type);
 262     return true;
 263   }
 264   return false;
 265 }
 266
 267 static bool CheckForMagicNumbers(const char* content, size_t size,
 268                                  const MagicNumber* magic, size_t magic_len,
 269                                  base::Histogram* counter,
 270                                  std::string* result) {
 271   for (size_t i = 0; i < magic_len; ++i) {
 272     if (MatchMagicNumber(content, size, &(magic[i]), result)) {
 273       if (counter) counter->Add(static_cast<int>(i));
 274       return true;
 275     }
 276   }
 277   return false;
 278 }
 279
 280 // Truncates |size| to |max_size| and returns true if |size| is at least
 281 // |max_size|.
 282 static bool TruncateSize(const size_t max_size, size_t* size) {
 283   // Keep kMaxBytesToSniff honest.
 284   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
 285
 286   if (*size >= max_size) {
 287     *size = max_size;
 288     return true;
 289   }
 290   return false;
 291 }
 292
 293 // Returns true and sets result if the content appears to be HTML.
 294 // Clears have_enough_content if more data could possibly change the result.
 295 static bool SniffForHTML(const char* content,
 296                          size_t size,
 297                          bool* have_enough_content,
 298                          std::string* result) {
 299   // For HTML, we are willing to consider up to 512 bytes. This may be overly
 300   // conservative as IE only considers 256.
 301   *have_enough_content &= TruncateSize(512, &size);
 302
 303   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
 304   // but with some modifications to better match the HTML5 spec.
 305   const char* const end = content + size;
 306   const char* pos;
 307   for (pos = content; pos < end; ++pos) {
 308     if (!IsAsciiWhitespace(*pos))
 309       break;
 310   }
 311   static base::Histogram* counter(NULL);
 312   if (!counter)
 313     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
 314                                      arraysize(kSniffableTags));
 315   // |pos| now points to first non-whitespace character (or at end).
 316   return CheckForMagicNumbers(pos, end - pos,
 317                               kSniffableTags, arraysize(kSniffableTags),
 318                               counter, result);
 319 }
 320
 321 // Returns true and sets result if the content matches any of kMagicNumbers.
 322 // Clears have_enough_content if more data could possibly change the result.
 323 static bool SniffForMagicNumbers(const char* content,
 324                                  size_t size,
 325                                  bool* have_enough_content,
 326                                  std::string* result) {
 327   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 328
 329   // Check our big table of Magic Numbers
 330   static base::Histogram* counter(NULL);
 331   if (!counter)
 332     counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
 333                                      arraysize(kMagicNumbers));
 334   return CheckForMagicNumbers(content, size,
 335                               kMagicNumbers, arraysize(kMagicNumbers),
 336                               counter, result);
 337 }
 338
// Opening-tag patterns that SniffXML() uses to refine generic XML content into
// a more specific mime type.  Matched case-insensitively.
static const MagicNumber kMagicXML[] = {
  // We want to be very conservative in interpreting text/xml content as
  // XHTML -- we just want to sniff enough to make unit tests pass.
  // So we match explicitly on this, and don't match other ways of writing
  // it in semantically-equivalent ways.
  MAGIC_STRING("application/xhtml+xml",
               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
  MAGIC_STRING("application/atom+xml", "<feed")
  MAGIC_STRING("application/rss+xml", "<rss")
};
 350
 351 // Returns true and sets result if the content appears to contain XHTML or a
 352 // feed.
 353 // Clears have_enough_content if more data could possibly change the result.
 354 //
 355 // TODO(evanm): this is similar but more conservative than what Safari does,
 356 // while HTML5 has a different recommendation -- what should we do?
 357 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
 358 // of ASCII -- do we care?
 359 static bool SniffXML(const char* content,
 360                      size_t size,
 361                      bool* have_enough_content,
 362                      std::string* result) {
 363   // We allow at most 300 bytes of content before we expect the opening tag.
 364   *have_enough_content &= TruncateSize(300, &size);
 365   const char* pos = content;
 366   const char* const end = content + size;
 367
 368   // This loop iterates through tag-looking offsets in the file.
 369   // We want to skip XML processing instructions (of the form "<?xml ...")
 370   // and stop at the first "plain" tag, then make a decision on the mime-type
 371   // based on the name (or possibly attributes) of that tag.
 372   static base::Histogram* counter(NULL);
 373   if (!counter)
 374     counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
 375                                      arraysize(kMagicXML));
 376   const int kMaxTagIterations = 5;
 377   for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
 378     pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
 379     if (!pos)
 380       return false;
 381
 382     if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) {
 383       // Skip XML declarations.
 384       ++pos;
 385       continue;
 386     } else if (base::strncasecmp(pos, "<!DOCTYPE",
 387                                  sizeof("<!DOCTYPE")-1) == 0) {
 388       // Skip DOCTYPE declarations.
 389       ++pos;
 390       continue;
 391     }
 392
 393     if (CheckForMagicNumbers(pos, end - pos,
 394                              kMagicXML, arraysize(kMagicXML),
 395                              counter, result))
 396       return true;
 397
 398     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
 399     // to identify.
 400
 401     // If we get here, we've hit an initial tag that hasn't matched one of the
 402     // above tests.  Abort.
 403     return true;
 404   }
 405
 406   // We iterated too far without finding a start tag.
 407   // If we have more content to look at, we aren't going to change our mind by
 408   // seeing more bytes from the network.
 409   return pos < end;
 410 }
 411
// Byte order marks.  SniffBinary() checks these first: content that starts
// with any of them is reported as "text/plain" rather than binary.
static const MagicNumber kByteOrderMark[] = {
  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
};
 418
 419 // Whether a given byte looks like it might be part of binary content.
 420 // Source: HTML5 spec
 421 static char kByteLooksBinary[] = {
 422   1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
 423   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
 424   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
 425   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
 426   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
 427   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
 428   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
 429   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
 430   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
 431   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
 432   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
 433   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
 434   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
 435   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
 436   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
 437   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
 438 };
 439
 440 // Returns true and sets result to "application/octet-stream" if the content
 441 // appears to be binary data. Otherwise, returns false and sets "text/plain".
 442 // Clears have_enough_content if more data could possibly change the result.
 443 static bool SniffBinary(const char* content,
 444                         size_t size,
 445                         bool* have_enough_content,
 446                         std::string* result) {
 447   // There is no concensus about exactly how to sniff for binary content.
 448   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
 449   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
 450   // Here, we side with FF, but with a smaller buffer. This size was chosen
 451   // because it is small enough to comfortably fit into a single packet (after
 452   // allowing for headers) and yet large enough to account for binary formats
 453   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
 454   const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
 455
 456   // First, we look for a BOM.
 457   static base::Histogram* counter(NULL);
 458   if (!counter)
 459     counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
 460                                      arraysize(kByteOrderMark));
 461   std::string unused;
 462   if (CheckForMagicNumbers(content, size,
 463                            kByteOrderMark, arraysize(kByteOrderMark),
 464                            counter, &unused)) {
 465     // If there is BOM, we think the buffer is not binary.
 466     result->assign("text/plain");
 467     return false;
 468   }
 469
 470   // Next we look to see if any of the bytes "look binary."
 471   for (size_t i = 0; i < size; ++i) {
 472     // If we a see a binary-looking byte, we think the content is binary.
 473     if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
 474       result->assign("application/octet-stream");
 475       return true;
 476     }
 477   }
 478
 479   // No evidence either way. Default to non-binary and, if truncated, clear
 480   // have_enough_content because there could be a binary looking byte in the
 481   // truncated data.
 482   *have_enough_content &= is_truncated;
 483   result->assign("text/plain");
 484   return false;
 485 }
 486
 487 static bool IsUnknownMimeType(const std::string& mime_type) {
 488   // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
 489   // If we do, please be careful not to alter the semantics at all.
 490   static const char* kUnknownMimeTypes[] = {
 491     // Empty mime types are as unknown as they get.
 492     "",
 493     // The unknown/unknown type is popular and uninformative
 494     "unknown/unknown",
 495     // The second most popular unknown mime type is application/unknown
 496     "application/unknown",
 497     // Firefox rejects a mime type if it is exactly */*
 498     "*/*",
 499   };
 500   static base::Histogram* counter(NULL);
 501   if (!counter)
 502     counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
 503                                      arraysize(kUnknownMimeTypes) + 1);
 504   for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
 505     if (mime_type == kUnknownMimeTypes[i]) {
 506       counter->Add(i);
 507       return true;
 508     }
 509   }
 510   if (mime_type.find('/') == std::string::npos) {
 511     // Firefox rejects a mime type if it does not contain a slash
 512     counter->Add(arraysize(kUnknownMimeTypes));
 513     return true;
 514   }
 515   return false;
 516 }
 517
 518 // Returns true and sets result if the content appears to be a crx (chrome
 519 // extension) file.
 520 // Clears have_enough_content if more data could possibly change the result.
 521 static bool SniffCRX(const char* content,
 522                      size_t size,
 523                      const GURL& url,
 524                      const std::string& type_hint,
 525                      bool* have_enough_content,
 526                      std::string* result) {
 527   static base::Histogram* counter(NULL);
 528   if (!counter)
 529     counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
 530
 531   // Technically, the crx magic number is just Cr24, but the bytes after that
 532   // are a version number which changes infrequently. Including it in the
 533   // sniffing gives us less room for error. If the version number ever changes,
 534   // we can just add an entry to this list.
 535   //
 536   // TODO(aa): If we ever have another magic number, we'll want to pass a
 537   // histogram into CheckForMagicNumbers(), below, to see which one matched.
 538   static const struct MagicNumber kCRXMagicNumbers[] = {
 539     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
 540   };
 541
 542   // Only consider files that have the extension ".crx".
 543   static const char kCRXExtension[] = ".crx";
 544   // Ignore null by subtracting 1.
 545   static const int kExtensionLength = arraysize(kCRXExtension) - 1;
 546   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
 547       url.path().size() - kExtensionLength) {
 548     counter->Add(1);
 549   } else {
 550     return false;
 551   }
 552
 553   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 554   if (CheckForMagicNumbers(content, size,
 555                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
 556                            NULL, result)) {
 557     counter->Add(2);
 558   } else {
 559     return false;
 560   }
 561
 562   return true;
 563 }
 564
 565 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
 566   static base::Histogram* should_sniff_counter(NULL);
 567   if (!should_sniff_counter)
 568     should_sniff_counter =
 569         UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
 570   // We are willing to sniff the mime type for HTTP, HTTPS, and FTP
 571   bool sniffable_scheme = url.is_empty() ||
 572                           url.SchemeIs("http") ||
 573                           url.SchemeIs("https") ||
 574                           url.SchemeIs("ftp") ||
 575                           url.SchemeIsFile();
 576   if (!sniffable_scheme) {
 577     should_sniff_counter->Add(1);
 578     return false;
 579   }
 580
 581   static const char* kSniffableTypes[] = {
 582     // Many web servers are misconfigured to send text/plain for many
 583     // different types of content.
 584     "text/plain",
 585     // We want to sniff application/octet-stream for
 586     // application/x-chrome-extension, but nothing else.
 587     "application/octet-stream",
 588     // XHTML and Atom/RSS feeds are often served as plain xml instead of
 589     // their more specific mime types.
 590     "text/xml",
 591     "application/xml",
 592   };
 593   static base::Histogram* counter(NULL);
 594   if (!counter)
 595     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
 596                                      arraysize(kSniffableTypes) + 1);
 597   for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
 598     if (mime_type == kSniffableTypes[i]) {
 599       counter->Add(i);
 600       should_sniff_counter->Add(2);
 601       return true;
 602     }
 603   }
 604   if (IsUnknownMimeType(mime_type)) {
 605     // The web server didn't specify a content type or specified a mime
 606     // type that we ignore.
 607     counter->Add(arraysize(kSniffableTypes));
 608     should_sniff_counter->Add(2);
 609     return true;
 610   }
 611   should_sniff_counter->Add(1);
 612   return false;
 613 }
 614
 615 bool SniffMimeType(const char* content, size_t content_size,
 616                    const GURL& url, const std::string& type_hint,
 617                    std::string* result) {
 618   DCHECK_LT(content_size, 1000000U);  // sanity check
 619   DCHECK(content);
 620   DCHECK(result);
 621
 622   // By default, we assume we have enough content.
 623   // Each sniff routine may unset this if it wasn't provided enough content.
 624   bool have_enough_content = true;
 625
 626   // By default, we'll return the type hint.
 627   // Each sniff routine may modify this if it has a better guess..
 628   result->assign(type_hint);
 629
 630   // Cache information about the type_hint
 631   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
 632
 633   // First check for HTML
 634   if (hint_is_unknown_mime_type) {
 635     // We're only willing to sniff HTML if the server has not supplied a mime
 636     // type, or if the type it did supply indicates that it doesn't know what
 637     // the type should be.
 638     if (SniffForHTML(content, content_size, &have_enough_content, result))
 639       return true;  // We succeeded in sniffing HTML.  No more content needed.
 640   }
 641
 642   // We're only willing to sniff for binary in 3 cases:
 643   // 1. The server has not supplied a mime type.
 644   // 2. The type it did supply indicates that it doesn't know what the type
 645   //    should be.
 646   // 3. The type is "text/plain" which is the default on some web servers and
 647   //    could be indicative of a mis-configuration that we shield the user from.
 648   const bool hint_is_text_plain = (type_hint == "text/plain");
 649   if (hint_is_unknown_mime_type || hint_is_text_plain) {
 650     if (!SniffBinary(content, content_size, &have_enough_content, result)) {
 651       // If the server said the content was text/plain and it doesn't appear
 652       // to be binary, then we trust it.
 653       if (hint_is_text_plain) {
 654         return have_enough_content;
 655       }
 656     }
 657   }
 658
 659   // If we have plain XML, sniff XML subtypes.
 660   if (type_hint == "text/xml" || type_hint == "application/xml") {
 661     // We're not interested in sniffing these types for images and the like.
 662     // Instead, we're looking explicitly for a feed.  If we don't find one
 663     // we're done and return early.
 664     if (SniffXML(content, content_size, &have_enough_content, result))
 665       return true;
 666     return have_enough_content;
 667   }
 668
 669   // CRX files (chrome extensions) have a special sniffing algorithm. It is
 670   // tighter than the others because we don't have to match legacy behavior.
 671   if (SniffCRX(content, content_size, url, type_hint,
 672                &have_enough_content, result))
 673     return true;
 674
 675   // We're not interested in sniffing for magic numbers when the type_hint
 676   // is application/octet-stream.  Time to bail out.
 677   if (type_hint == "application/octet-stream")
 678     return have_enough_content;
 679
 680   // Now we look in our large table of magic numbers to see if we can find
 681   // anything that matches the content.
 682   if (SniffForMagicNumbers(content, content_size,
 683                            &have_enough_content, result))
 684     return true;  // We've matched a magic number.  No more content needed.
 685
 686   return have_enough_content;
 687 }
 688
 689 }  // namespace net