// Copyright 2012 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Detecting mime types is a tricky business because we need to balance // compatibility concerns with security issues. Here is a survey of how other // browsers behave and then a description of how we intend to behave. // // HTML payload, no Content-Type header: // * IE 7: Render as HTML // * Firefox 2: Render as HTML // * Safari 3: Render as HTML // * Opera 9: Render as HTML // // Here the choice seems clear: // => Chrome: Render as HTML // // HTML payload, Content-Type: "text/plain": // * IE 7: Render as HTML // * Firefox 2: Render as text // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL // has an HTML extension) // * Opera 9: Render as text // // Here we choose to follow the majority (and break some compatibility with IE). // Many folks dislike IE's behavior here. // => Chrome: Render as text // We generalize this as follows. If the Content-Type header is text/plain // we won't detect dangerous mime types (those that can execute script). // // HTML payload, Content-Type: "application/octet-stream": // * IE 7: Render as HTML // * Firefox 2: Download as application/octet-stream // * Safari 3: Render as HTML // * Opera 9: Render as HTML // // We follow Firefox. // => Chrome: Download as application/octet-stream // One factor in this decision is that IIS 4 and 5 will send // application/octet-stream for .xhtml files (because they don't recognize // the extension). We did some experiments and it looks like this doesn't occur // very often on the web. We choose the more secure option. // // GIF payload, no Content-Type header: // * IE 7: Render as GIF // * Firefox 2: Render as GIF // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the // URL has a GIF extension) // * Opera 9: Render as GIF // // The choice is clear. 
// => Chrome: Render as GIF // Once we decide to render HTML without a Content-Type header, there isn't much // reason not to render GIFs. // // GIF payload, Content-Type: "text/plain": // * IE 7: Render as GIF // * Firefox 2: Download as application/octet-stream (Note: Firefox will // Download as GIF if the URL has a GIF extension) // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the // URL has a GIF extension) // * Opera 9: Render as GIF // // Displaying as text/plain makes little sense as the content will look like // gibberish. Here, we could change our minds and download. // => Chrome: Render as GIF // // GIF payload, Content-Type: "application/octet-stream": // * IE 7: Render as GIF // * Firefox 2: Download as application/octet-stream (Note: Firefox will // Download as GIF if the URL has a GIF extension) // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the // URL has a GIF extension) // * Opera 9: Render as GIF // // We used to render as GIF here, but the problem is that some sites want to // trigger downloads by sending application/octet-stream (even though they // should be sending Content-Disposition: attachment). Although it is safe // to render as GIF from a security perspective, we actually get better // compatibility if we don't sniff from application/octet-stream at all. // => Chrome: Download as application/octet-stream // // Note that our definition of HTML payload is much stricter than IE's // definition and roughly the same as Firefox's definition. #include <stdint.h> #include <string> #include "net/base/mime_sniffer.h" #include "base/check_op.h" #include "base/containers/span.h" #include "base/notreached.h" #include "base/strings/string_util.h" #include "build/build_config.h" #include "url/gurl.h" namespace net { // The number of content bytes we need to use all our magic numbers. Feel free // to increase this number if you add a longer magic number. 
static const size_t kBytesRequiredForMagic = …; struct MagicNumber { … }; #define MAGIC_NUMBER(mime_type, magic) … template <int MagicSize, int MaskSize> class VerifySizes { … }; #define verified_sizeof(magic, mask) … #define MAGIC_MASK(mime_type, magic, mask) … // Magic strings are case insensitive and must not include '\0' characters #define MAGIC_STRING(mime_type, magic) … static const MagicNumber kMagicNumbers[] = …; // The number of content bytes we need to use all our Microsoft Office magic // numbers. static const size_t kBytesRequiredForOfficeMagic = …; static const MagicNumber kOfficeMagicNumbers[] = …; enum OfficeDocType { … }; struct OfficeExtensionType { … }; #define OFFICE_EXTENSION(type, extension) … static const OfficeExtensionType kOfficeExtensionTypes[] = …; static const MagicNumber kExtraMagicNumbers[] = …; // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is // HTML, but we will not. #define MAGIC_HTML_TAG(tag) … static const MagicNumber kSniffableTags[] = …; // Compare content header to a magic number where magic_entry can contain '.' // for single character of anything, allowing some bytes to be skipped. static bool MagicCmp(std::string_view content, std::string_view magic_entry) { … } // Like MagicCmp() except that it ANDs each byte with a mask before // the comparison, because there are some bits we don't care about. static bool MagicMaskCmp(std::string_view content, std::string_view magic_entry, std::string_view magic_mask) { … } static bool MatchMagicNumber(std::string_view content, const MagicNumber& magic_entry, std::string* result) { … } static bool CheckForMagicNumbers(std::string_view content, base::span<const MagicNumber> magic_numbers, std::string* result) { … } // Truncates |string_piece| to length |max_size| and returns true if // |string_piece| is now exactly |max_size|. 
static bool TruncateStringPiece(const size_t max_size, std::string_view* string_piece) { … } // Returns true and sets result if the content appears to be HTML. // Clears have_enough_content if more data could possibly change the result. static bool SniffForHTML(std::string_view content, bool* have_enough_content, std::string* result) { … } // Returns true and sets result if the content matches any of kMagicNumbers. // Clears have_enough_content if more data could possibly change the result. static bool SniffForMagicNumbers(std::string_view content, bool* have_enough_content, std::string* result) { … } // Returns true and sets result if the content matches any of // kOfficeMagicNumbers, and the URL has the proper extension. // Clears |have_enough_content| if more data could possibly change the result. static bool SniffForOfficeDocs(std::string_view content, const GURL& url, bool* have_enough_content, std::string* result) { … } static bool IsOfficeType(const std::string& type_hint) { … } // This function checks for files that have a Microsoft Office MIME type // set, but are not actually Office files. // // If this is not actually an Office file, |*result| is set to // "application/octet-stream", otherwise it is not modified. // // Returns false if additional data is required to determine the file type, or // true if there is enough data to make a decision. static bool SniffForInvalidOfficeDocs(std::string_view content, const GURL& url, std::string* result) { … } // Tags that indicate the content is likely XML. static const MagicNumber kMagicXML[] = …; // Returns true and sets result if the content appears to contain XHTML or a // feed. // Clears have_enough_content if more data could possibly change the result. // // TODO(evanm): this is similar but more conservative than what Safari does, // while HTML5 has a different recommendation -- what should we do? // TODO(evanm): this is incorrect for documents whose encoding isn't a superset // of ASCII -- do we care? 
static bool SniffXML(std::string_view content, bool* have_enough_content, std::string* result) { … } // Byte order marks static const MagicNumber kByteOrderMark[] = …; // Returns true and sets result to "application/octet-stream" if the content // appears to be binary data. Otherwise, returns false and sets "text/plain". // Clears have_enough_content if more data could possibly change the result. static bool SniffBinary(std::string_view content, bool* have_enough_content, std::string* result) { … } static bool IsUnknownMimeType(std::string_view mime_type) { … } // Returns true and sets result if the content appears to be a crx (Chrome // extension) file. // Clears have_enough_content if more data could possibly change the result. static bool SniffCRX(std::string_view content, const GURL& url, bool* have_enough_content, std::string* result) { … } bool ShouldSniffMimeType(const GURL& url, std::string_view mime_type) { … } bool SniffMimeType(std::string_view content, const GURL& url, const std::string& type_hint, ForceSniffFileUrlsForHtml force_sniff_file_url_for_html, std::string* result) { … } bool SniffMimeTypeFromLocalData(std::string_view content, std::string* result) { … } bool LooksLikeBinary(std::string_view content) { … } } // namespace net