1 年之前 · 2babc08c17
--- a/Tests/LibWeb/TestMimeSniff.cpp
+++ b/Tests/LibWeb/TestMimeSniff.cpp
@@ -30,3 +30,62 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset)
 
				 
			
 
				     EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized()));
			
 
				 }
			
 
				+
			
 
				+TEST_CASE(compute_unknown_mime_type)
			
 
				+{
			
 
				+    HashMap<StringView, Vector<StringView>> mime_type_to_headers_map;
			
 
				+
			
 
				+    mime_type_to_headers_map.set("application/octet-stream"sv, { "\x00"sv });
			
 
				+    mime_type_to_headers_map.set("text/html"sv, {
			
 
				+                                                    "\x09\x09<!DOCTYPE HTML\x20"sv,
			
 
				+                                                    "\x0A<HTML\x3E"sv,
			
 
				+                                                    "\x0C<HEAD\x20"sv,
			
 
				+                                                    "\x0D<SCRIPT>"sv,
			
 
				+                                                    "\x20<IFRAME>"sv,
			
 
				+                                                    "<H1>"sv,
			
 
				+                                                    "<DIV>"sv,
			
 
				+                                                    "<FONT>"sv,
			
 
				+                                                    "<TABLE>"sv,
			
 
				+                                                    "<A>"sv,
			
 
				+                                                    "<STYLE>"sv,
			
 
				+                                                    "<TITLE>"sv,
			
 
				+                                                    "<B>"sv,
			
 
				+                                                    "<BODY>"sv,
			
 
				+                                                    "<BR>"sv,
			
 
				+                                                    "<P>"sv,
			
 
				+                                                    "<!-->"sv,
			
 
				+                                                });
			
 
				+    mime_type_to_headers_map.set("text/xml"sv, { "<?xml"sv });
			
 
				+    mime_type_to_headers_map.set("application/pdf"sv, { "%PDF-"sv });
			
 
				+    mime_type_to_headers_map.set("application/postscript"sv, { "%!PS-Adobe-"sv });
			
 
				+    mime_type_to_headers_map.set("text/plain"sv, {
			
 
				+                                                     "\xFE\xFF\x00\x00"sv,
			
 
				+                                                     "\xFF\xFE\x00\x00"sv,
			
 
				+                                                     "\xEF\xBB\xBF\x00"sv,
			
 
				+                                                     "Hello world!"sv,
			
 
				+                                                 });
			
 
				+
			
 
				+    for (auto const& mime_type_to_headers : mime_type_to_headers_map) {
			
 
				+        auto mime_type = mime_type_to_headers.key;
			
 
				+
			
 
				+        for (auto const& header : mime_type_to_headers.value) {
			
 
				+            auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff(header.bytes()));
			
 
				+            EXPECT_EQ(mime_type, computed_mime_type.essence());
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+TEST_CASE(compute_mime_type_given_unknown_supplied_type)
			
 
				+{
			
 
				+    Array<Web::MimeSniff::MimeType, 3> unknown_supplied_types = {
			
 
				+        MUST(Web::MimeSniff::MimeType::create("unknown"_string, "unknown"_string)),
			
 
				+        MUST(Web::MimeSniff::MimeType::create("application"_string, "unknown"_string)),
			
 
				+        MUST(Web::MimeSniff::MimeType::create("*"_string, "*"_string))
			
 
				+    };
			
 
				+    auto header_bytes = "<HTML>"sv.bytes();
			
 
				+
			
 
				+    for (auto const& unknown_supplied_type : unknown_supplied_types) {
			
 
				+        auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff(header_bytes, Web::MimeSniff::SniffingConfiguration { .supplied_type = unknown_supplied_type }));
			
 
				+        EXPECT_EQ("text/html"sv, computed_mime_type.essence());
			
 
				+    }
			
 
				+}
			
--- a/Userland/Libraries/LibWeb/MimeSniff/Resource.cpp
+++ b/Userland/Libraries/LibWeb/MimeSniff/Resource.cpp
@@ -7,6 +7,236 @@
 
				 #include <LibWeb/Fetch/Infrastructure/URL.h>
			
 
				 #include <LibWeb/MimeSniff/Resource.h>
			
 
				 
			
 
				+namespace {
			
 
				+
			
 
				+using namespace Web::MimeSniff;
			
 
				+
			
 
				+struct BytePatternTableRow {
			
 
				+    StringView byte_pattern;
			
 
				+    StringView pattern_mask;
			
 
				+    ReadonlyBytes ignored_leading_bytes;
			
 
				+    StringView mime_type;
			
 
				+
			
 
				+    // NOTE: If the byte pattern has a tag-terminating byte, add a byte where this byte should be. The value itself is ignored in
			
 
				+    //       the pattern_matching_algorithm() (see the NOTE in this algorithm for more details).
			
 
				+    bool is_tag_terminated { false };
			
 
				+};
			
 
				+
			
 
				+// https://mimesniff.spec.whatwg.org/#tag-terminating-byte
			
 
				+bool is_tag_terminating_byte(u8 byte)
			
 
				+{
			
 
				+    // A tag-terminating byte (abbreviated 0xTT) is any one of the following bytes: 0x20 (SP), 0x3E (">").
			
 
				+    return byte == 0x20 || byte == 0x3E;
			
 
				+}
			
 
				+
			
 
				+// https://mimesniff.spec.whatwg.org/#binary-data-byte
			
 
				+bool is_binary_data_byte(u8 byte)
			
 
				+{
			
 
				+    //  A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), the byte 0x0B (VT), a byte in
			
 
				+    //  the range 0x0E to 0x1A (SO to SUB), or a byte in the range 0x1C to 0x1F (FS to US).
			
 
				+    return (byte <= 0x08) || byte == 0x0B || (byte >= 0x0E && byte <= 0x1A) || (byte >= 0x1C && byte <= 0x1F);
			
 
				+}
			
 
				+
			
 
				+// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
			
 
				+bool pattern_matching_algorithm(ReadonlyBytes input, ReadonlyBytes pattern, ReadonlyBytes mask, ReadonlyBytes ignored, bool is_tag_terminated = false)
			
 
				+{
			
 
				+    // 1. Assert: pattern’s length is equal to mask’s length.
			
 
				+    VERIFY(pattern.size() == mask.size());
			
 
				+
			
 
				+    // 2. If input’s length is less than pattern’s length, return false.
			
 
				+    if (input.size() < pattern.size())
			
 
				+        return false;
			
 
				+
			
 
				+    // 3. Let s be 0.
			
 
				+    size_t s = 0;
			
 
				+
			
 
				+    // 4. While s < input’s length:
			
 
				+    while (s < input.size()) {
			
 
				+        // 1. If ignored does not contain input[s], break.
			
 
				+        if (!ignored.contains_slow(input[s]))
			
 
				+            break;
			
 
				+
			
 
				+        // 2. Set s to s + 1.
			
 
				+        s++;
			
 
				+    }
			
 
				+
			
 
				+    // 5. Let p be 0.
			
 
				+    size_t p = 0;
			
 
				+
			
 
				+    // 6. While p < pattern’s length:
			
 
				+    while (p < pattern.size()) {
			
 
				+        // 1. Let maskedData be the result of applying the bitwise AND operator to input[s] and mask[p].
			
 
				+        u8 masked_data = input[s] & mask[p];
			
 
				+
			
 
				+        // NOTE: This non-standard branch exists to avoid having to create 2 byte patterns just so that
			
 
				+        //       they can only differ by their tag-terminating byte (which could be a 0x20 or 0x3E byte).
			
 
				+        if (is_tag_terminated && p + 1 == pattern.size())
			
 
				+            return is_tag_terminating_byte(masked_data);
			
 
				+
			
 
				+        // 2. If maskedData is not equal to pattern[p], return false.
			
 
				+        if (masked_data != pattern[p])
			
 
				+            return false;
			
 
				+
			
 
				+        // 3. Set s to s + 1.
			
 
				+        s++;
			
 
				+
			
 
				+        // 4. Set p to p + 1.
			
 
				+        p++;
			
 
				+    }
			
 
				+
			
 
				+    // 7. Return true.
			
 
				+    return true;
			
 
				+}
			
 
				+
			
 
				+ReadonlyBytes constexpr no_ignored_bytes;
			
 
				+
			
 
				+// https://mimesniff.spec.whatwg.org/#rules-for-identifying-an-unknown-mime-type
			
 
				+ErrorOr<MimeType> rules_for_identifying_an_unknown_mime_type(Resource const& resource, bool sniff_scriptable = false)
			
 
				+{
			
 
				+    // 1. If the sniff-scriptable flag is set, execute the following steps for each row row in the following table:
			
 
				+    if (sniff_scriptable) {
			
 
				+        static auto constexpr text_html_mime_type = "text/html"sv;
			
 
				+
			
 
				+        // https://mimesniff.spec.whatwg.org/#whitespace-byte
			
 
				+        // A whitespace byte (abbreviated 0xWS) is any one of the following bytes: 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).
			
 
				+        static Array<u8, 5> constexpr ignored_whitespace_bytes { 0x09, 0x0A, 0x0C, 0x0D, 0x20 };
			
 
				+        static Array<BytePatternTableRow, 19> constexpr pattern_table {
			
 
				+            // The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C\x00"sv,
			
 
				+                "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<HTML" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x48\x54\x4D\x4C\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<HEAD" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x48\x45\x41\x44\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<SCRIPT" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x53\x43\x52\x49\x50\x54\x00"sv,
			
 
				+                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<IFRAME" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x49\x46\x52\x41\x4D\x45\x00"sv,
			
 
				+                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<H1" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x48\x31\x00"sv, "\xFF\xDF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<DIV" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x44\x49\x56\x00"sv, "\xFF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<FONT" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x46\x4F\x4E\x54\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<TABLE" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x54\x41\x42\x4C\x45\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<A" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x41\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<STYLE" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x53\x54\x59\x4C\x45\x00"sv,
			
 
				+                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<TITLE" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x54\x49\x54\x4C\x45\x00"sv,
			
 
				+                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<B" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x42\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<BODY" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x42\x4F\x44\x59\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<BR" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x42\x52\x00"sv, "\xFF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The case-insensitive string "<P" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x50\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The string "<!--" followed by a tag-terminating byte.
			
 
				+            BytePatternTableRow { "\x3C\x21\x2D\x2D\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
			
 
				+
			
 
				+            // The string "<?xml".
			
 
				+            BytePatternTableRow { "\x3C\x3F\x78\x6D\x6C"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, "text/xml"sv },
			
 
				+
			
 
				+            // The string "%PDF-", the PDF signature.
			
 
				+            BytePatternTableRow { "\x25\x50\x44\x46\x2D"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/pdf"sv },
			
 
				+        };
			
 
				+
			
 
				+        for (auto const& row : pattern_table) {
			
 
				+            // 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
			
 
				+            //    the value in the first column of row, the value in the second column of row, and the value in the
			
 
				+            //    third column of row.
			
 
				+            auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes, row.is_tag_terminated);
			
 
				+
			
 
				+            // 2. If patternMatched is true, return the value in the fourth column of row.
			
 
				+            if (pattern_matched) {
			
 
				+                if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
			
 
				+                    return maybe_type.release_value();
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // 2. Execute the following steps for each row row in the following table:
			
 
				+    static auto constexpr text_plain_mime_type = "text/plain"sv;
			
 
				+    static Array<BytePatternTableRow, 4> constexpr pattern_table {
			
 
				+        // The string "%!PS-Adobe-", the PostScript signature.
			
 
				+        BytePatternTableRow { "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D"sv,
			
 
				+            "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/postscript"sv },
			
 
				+
			
 
				+        // UTF-16BE BOM
			
 
				+        BytePatternTableRow { "\xFE\xFF\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
			
 
				+
			
 
				+        // UTF-16LE BOM
			
 
				+        BytePatternTableRow { "\xFF\xFE\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
			
 
				+
			
 
				+        // UTF-8 BOM
			
 
				+        BytePatternTableRow { "\xEF\xBB\xBF\x00"sv, "\xFF\xFF\xFF\x00"sv, no_ignored_bytes, text_plain_mime_type },
			
 
				+    };
			
 
				+
			
 
				+    for (auto const& row : pattern_table) {
			
 
				+        // 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
			
 
				+        //    the value in the first column of row, the value in the second column of row, and the value in the
			
 
				+        //    third column of row.
			
 
				+        auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
			
 
				+
			
 
				+        // 2. If patternMatched is true, return the value in the fourth column of row.
			
 
				+        if (pattern_matched) {
			
 
				+            if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
			
 
				+                return maybe_type.release_value();
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // FIXME: 3. Let matchedType be the result of executing the image type pattern matching algorithm given resource’s resource header.
			
 
				+    Optional<MimeType> matched_type;
			
 
				+
			
 
				+    // 4. If matchedType is not undefined, return matchedType.
			
 
				+    if (matched_type.has_value())
			
 
				+        return matched_type.release_value();
			
 
				+
			
 
				+    // FIXME: 5. Set matchedType to the result of executing the audio or video type pattern matching algorithm given resource’s resource header.
			
 
				+
			
 
				+    // 6. If matchedType is not undefined, return matchedType.
			
 
				+    if (matched_type.has_value())
			
 
				+        return matched_type.release_value();
			
 
				+
			
 
				+    // FIXME: 7. Set matchedType to the result of executing the archive type pattern matching algorithm given resource’s resource header.
			
 
				+
			
 
				+    // 8. If matchedType is not undefined, return matchedType.
			
 
				+    if (matched_type.has_value())
			
 
				+        return matched_type.release_value();
			
 
				+
			
 
				+    // 9. If resource’s resource header contains no binary data bytes, return "text/plain".
			
 
				+    if (!any_of(resource.resource_header(), is_binary_data_byte))
			
 
				+        return MimeType::create("text"_string, "plain"_string);
			
 
				+
			
 
				+    // 10. Return "application/octet-stream".
			
 
				+    return MimeType::create("application"_string, "octet-stream"_string);
			
 
				+}
			
 
				+
			
 
				+}
			
 
				+
			
 
				 namespace Web::MimeSniff {
			
 
				 
			
 
				 ErrorOr<Resource> Resource::create(ReadonlyBytes data, SniffingConfiguration configuration)
			
@@ -105,10 +335,7 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
 
				     //    identifying an unknown MIME type with the sniff-scriptable flag equal to the
			
 
				     //    inverse of the no-sniff flag and abort these steps.
			
 
				     if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) {
			
 
				-
			
 
				-        // FIXME: Execute the rules for identifying an unknown MIME type with the
			
 
				-        // sniff-scriptable flag equal to the inverse of the no-sniff flag and abort
			
 
				-        // these steps.
			
 
				+        m_computed_mime_type = TRY(rules_for_identifying_an_unknown_mime_type(*this, !m_no_sniff));
			
 
				         return {};
			
 
				     }