浏览代码

LibWeb/MimeSniff: Add rules for identifying an unknown mime type

This also implements the pattern matching algorithm since it's
needed.
Kemal Zebari 1 年之前
父节点
当前提交
2babc08c17
共有 2 个文件被更改，包括 290 次插入和 4 次删除
  1. 59 0
      Tests/LibWeb/TestMimeSniff.cpp
  2. 231 4
      Userland/Libraries/LibWeb/MimeSniff/Resource.cpp

+ 59 - 0
Tests/LibWeb/TestMimeSniff.cpp

@@ -30,3 +30,62 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset)
 
     EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized()));
 }
+
+TEST_CASE(compute_unknown_mime_type)
+{
+    // Maps the MIME type essence we expect the sniffer to compute to the resource headers that must produce it.
+    HashMap<StringView, Vector<StringView>> headers_by_essence;
+
+    // A lone binary data byte.
+    headers_by_essence.set("application/octet-stream"sv, { "\x00"sv });
+    // Scriptable HTML signatures, some preceded by whitespace bytes.
+    headers_by_essence.set("text/html"sv, {
+                                              "\x09\x09<!DOCTYPE HTML\x20"sv,
+                                              "\x0A<HTML\x3E"sv,
+                                              "\x0C<HEAD\x20"sv,
+                                              "\x0D<SCRIPT>"sv,
+                                              "\x20<IFRAME>"sv,
+                                              "<H1>"sv,
+                                              "<DIV>"sv,
+                                              "<FONT>"sv,
+                                              "<TABLE>"sv,
+                                              "<A>"sv,
+                                              "<STYLE>"sv,
+                                              "<TITLE>"sv,
+                                              "<B>"sv,
+                                              "<BODY>"sv,
+                                              "<BR>"sv,
+                                              "<P>"sv,
+                                              "<!-->"sv,
+                                          });
+    headers_by_essence.set("text/xml"sv, { "<?xml"sv });
+    headers_by_essence.set("application/pdf"sv, { "%PDF-"sv });
+    headers_by_essence.set("application/postscript"sv, { "%!PS-Adobe-"sv });
+    // Byte order marks, plus plain text without any binary data bytes.
+    headers_by_essence.set("text/plain"sv, {
+                                               "\xFE\xFF\x00\x00"sv,
+                                               "\xFF\xFE\x00\x00"sv,
+                                               "\xEF\xBB\xBF\x00"sv,
+                                               "Hello world!"sv,
+                                           });
+
+    for (auto const& entry : headers_by_essence) {
+        for (auto const& resource_header : entry.value) {
+            auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(resource_header.bytes()));
+            EXPECT_EQ(entry.key, sniffed_type.essence());
+        }
+    }
+}
+
+TEST_CASE(compute_mime_type_given_unknown_supplied_type)
+{
+    // Supplied types with these essences are expected to be sniffed from the header bytes instead of trusted.
+    Array<Web::MimeSniff::MimeType, 3> supplied_types_treated_as_unknown = {
+        MUST(Web::MimeSniff::MimeType::create("unknown"_string, "unknown"_string)),
+        MUST(Web::MimeSniff::MimeType::create("application"_string, "unknown"_string)),
+        MUST(Web::MimeSniff::MimeType::create("*"_string, "*"_string))
+    };
+    auto html_header = "<HTML>"sv.bytes();
+    auto expected_essence = "text/html"sv;
+
+    for (auto const& supplied_type : supplied_types_treated_as_unknown) {
+        Web::MimeSniff::SniffingConfiguration configuration { .supplied_type = supplied_type };
+        auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(html_header, configuration));
+        EXPECT_EQ(expected_essence, sniffed_type.essence());
+    }
+}

+ 231 - 4
Userland/Libraries/LibWeb/MimeSniff/Resource.cpp

@@ -7,6 +7,236 @@
 #include <LibWeb/Fetch/Infrastructure/URL.h>
 #include <LibWeb/MimeSniff/Resource.h>
 
+namespace {
+
+using namespace Web::MimeSniff;
+
+// One row of a MIME sniffing byte-pattern table: what to look for in a resource header, how to mask and
+// pre-skip the header before comparing, and which MIME type to report when the row matches.
+struct BytePatternTableRow {
+    // Bytes the masked resource header is compared against.
+    StringView byte_pattern;
+    // Per-byte mask that is ANDed with the resource header before comparing; same length as byte_pattern.
+    StringView pattern_mask;
+    // Leading resource header bytes that are skipped before the comparison starts.
+    ReadonlyBytes ignored_leading_bytes;
+    // Serialized MIME type reported when this row matches.
+    StringView mime_type;
+
+    // NOTE: When the byte pattern ends with a tag-terminating byte, byte_pattern must still contain a placeholder
+    //       byte at that position. Its value is ignored by pattern_matching_algorithm(), which only checks that the
+    //       corresponding input byte is a tag-terminating byte (see the NOTE in that algorithm).
+    bool is_tag_terminated { false };
+};
+
+// https://mimesniff.spec.whatwg.org/#tag-terminating-byte
+// Returns true if `byte` is a tag-terminating byte.
+bool is_tag_terminating_byte(u8 byte)
+{
+    // A tag-terminating byte (abbreviated 0xTT) is any one of the following bytes: 0x20 (SP), 0x3E (">").
+    switch (byte) {
+    case 0x20:
+    case 0x3E:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// https://mimesniff.spec.whatwg.org/#binary-data-byte
+// Returns true if `byte` is a binary data byte.
+bool is_binary_data_byte(u8 byte)
+{
+    //  A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), the byte 0x0B (VT), a byte in
+    //  the range 0x0E to 0x1A (SO to SUB), or a byte in the range 0x1C to 0x1F (FS to US).
+    if (byte <= 0x08)
+        return true;
+    if (byte == 0x0B)
+        return true;
+    if (byte >= 0x0E && byte <= 0x1A)
+        return true;
+    return byte >= 0x1C && byte <= 0x1F;
+}
+
+// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
+// Returns true if `input`, after skipping every leading byte that is contained in `ignored`, starts with a
+// byte sequence matching `pattern`. Each input byte is ANDed with the corresponding `mask` byte before it is
+// compared with the pattern byte. `pattern` and `mask` must have the same length (verified).
+// When `is_tag_terminated` is true, the last byte of `pattern` is only a placeholder: the input byte at that
+// position matches if it is a tag-terminating byte, regardless of the placeholder's value.
+bool pattern_matching_algorithm(ReadonlyBytes input, ReadonlyBytes pattern, ReadonlyBytes mask, ReadonlyBytes ignored, bool is_tag_terminated = false)
+{
+    // 1. Assert: pattern’s length is equal to mask’s length.
+    VERIFY(pattern.size() == mask.size());
+
+    // 2. If input’s length is less than pattern’s length, return false.
+    if (input.size() < pattern.size())
+        return false;
+
+    // 3. Let s be 0.
+    size_t s = 0;
+
+    // 4. While s < input’s length:
+    while (s < input.size()) {
+        // 1. If ignored does not contain input[s], break.
+        if (!ignored.contains_slow(input[s]))
+            break;
+
+        // 2. Set s to s + 1.
+        s++;
+    }
+
+    // NOTE: This non-standard check prevents step 6 from reading past the end of input. Step 2 only compares
+    //       the total input length with the pattern length, so once the ignored leading bytes have been skipped
+    //       there may be fewer bytes left than the pattern needs (e.g. an input made up entirely of ignored
+    //       bytes). s never exceeds input's length here, so the subtraction cannot underflow.
+    if (input.size() - s < pattern.size())
+        return false;
+
+    // 5. Let p be 0.
+    size_t p = 0;
+
+    // 6. While p < pattern’s length:
+    while (p < pattern.size()) {
+        // 1. Let maskedData be the result of applying the bitwise AND operator to input[s] and mask[p].
+        u8 masked_data = input[s] & mask[p];
+
+        // NOTE: This non-standard branch exists to avoid having to create 2 byte patterns just so that
+        //       they can only differ by their tag-terminating byte (which could be a 0x20 or 0x3E byte).
+        if (is_tag_terminated && p + 1 == pattern.size())
+            return is_tag_terminating_byte(masked_data);
+
+        // 2. If maskedData is not equal to pattern[p], return false.
+        if (masked_data != pattern[p])
+            return false;
+
+        // 3. Set s to s + 1.
+        s++;
+
+        // 4. Set p to p + 1.
+        p++;
+    }
+
+    // 7. Return true.
+    return true;
+}
+
+// Empty set of ignored leading bytes, for byte patterns that must match at the very start of the resource header.
+ReadonlyBytes constexpr no_ignored_bytes {};
+
+// https://mimesniff.spec.whatwg.org/#rules-for-identifying-an-unknown-mime-type
+// Computes a MIME type for `resource` from its resource header alone. The scriptable byte patterns (HTML, XML
+// and PDF) are only consulted when `sniff_scriptable` is true. If no byte pattern matches, the result is
+// "text/plain" when the header contains no binary data bytes and "application/octet-stream" otherwise.
+// Errors from parsing or creating the resulting MimeType are propagated to the caller.
+ErrorOr<MimeType> rules_for_identifying_an_unknown_mime_type(Resource const& resource, bool sniff_scriptable = false)
+{
+    // Walks the rows of `table` in order and yields the parsed MIME type of the first row whose byte pattern
+    // matches the resource header, or an empty Optional if no row yields a type.
+    auto first_matching_type = [&resource](auto const& table) -> ErrorOr<Optional<MimeType>> {
+        for (auto const& row : table) {
+            // 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
+            //    the value in the first column of row, the value in the second column of row, and the value in the
+            //    third column of row.
+            auto row_matches = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes, row.is_tag_terminated);
+            if (!row_matches)
+                continue;
+
+            // 2. If patternMatched is true, return the value in the fourth column of row.
+            if (auto parsed_type = TRY(MimeType::parse(row.mime_type)); parsed_type.has_value())
+                return parsed_type;
+        }
+
+        return Optional<MimeType> {};
+    };
+
+    // 1. If the sniff-scriptable flag is set, execute the following steps for each row row in the following table:
+    if (sniff_scriptable) {
+        static auto constexpr text_html_mime_type = "text/html"sv;
+
+        // https://mimesniff.spec.whatwg.org/#whitespace-byte
+        // A whitespace byte (abbreviated 0xWS) is any one of the following bytes: 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).
+        static Array<u8, 5> constexpr ignored_whitespace_bytes { 0x09, 0x0A, 0x0C, 0x0D, 0x20 };
+        static Array<BytePatternTableRow, 19> constexpr scriptable_pattern_table {
+            // The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C\x00"sv,
+                "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<HTML" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x48\x54\x4D\x4C\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<HEAD" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x48\x45\x41\x44\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<SCRIPT" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x53\x43\x52\x49\x50\x54\x00"sv,
+                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<IFRAME" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x49\x46\x52\x41\x4D\x45\x00"sv,
+                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<H1" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x48\x31\x00"sv, "\xFF\xDF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<DIV" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x44\x49\x56\x00"sv, "\xFF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<FONT" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x46\x4F\x4E\x54\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<TABLE" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x54\x41\x42\x4C\x45\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<A" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x41\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<STYLE" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x53\x54\x59\x4C\x45\x00"sv,
+                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<TITLE" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x54\x49\x54\x4C\x45\x00"sv,
+                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<B" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x42\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<BODY" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x42\x4F\x44\x59\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<BR" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x42\x52\x00"sv, "\xFF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The case-insensitive string "<P" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x50\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The string "<!--" followed by a tag-terminating byte.
+            BytePatternTableRow { "\x3C\x21\x2D\x2D\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
+
+            // The string "<?xml".
+            BytePatternTableRow { "\x3C\x3F\x78\x6D\x6C"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, "text/xml"sv },
+
+            // The string "%PDF-", the PDF signature.
+            BytePatternTableRow { "\x25\x50\x44\x46\x2D"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/pdf"sv },
+        };
+
+        if (auto scriptable_type = TRY(first_matching_type(scriptable_pattern_table)); scriptable_type.has_value())
+            return scriptable_type.release_value();
+    }
+
+    // 2. Execute the following steps for each row row in the following table:
+    static auto constexpr text_plain_mime_type = "text/plain"sv;
+    static Array<BytePatternTableRow, 4> constexpr non_scriptable_pattern_table {
+        // The string "%!PS-Adobe-", the PostScript signature.
+        BytePatternTableRow { "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D"sv,
+            "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/postscript"sv },
+
+        // UTF-16BE BOM
+        BytePatternTableRow { "\xFE\xFF\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
+
+        // UTF-16LE BOM
+        BytePatternTableRow { "\xFF\xFE\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
+
+        // UTF-8 BOM
+        BytePatternTableRow { "\xEF\xBB\xBF\x00"sv, "\xFF\xFF\xFF\x00"sv, no_ignored_bytes, text_plain_mime_type },
+    };
+
+    if (auto non_scriptable_type = TRY(first_matching_type(non_scriptable_pattern_table)); non_scriptable_type.has_value())
+        return non_scriptable_type.release_value();
+
+    // FIXME: 3. Let matchedType be the result of executing the image type pattern matching algorithm given resource’s resource header.
+    Optional<MimeType> matched_type;
+
+    // 4. If matchedType is not undefined, return matchedType.
+    if (matched_type.has_value())
+        return matched_type.release_value();
+
+    // FIXME: 5. Set matchedType to the result of executing the audio or video type pattern matching algorithm given resource’s resource header.
+
+    // 6. If matchedType is not undefined, return matchedType.
+    if (matched_type.has_value())
+        return matched_type.release_value();
+
+    // FIXME: 7. Set matchedType to the result of executing the archive type pattern matching algorithm given resource’s resource header.
+
+    // 8. If matchedType is not undefined, return matchedType.
+    if (matched_type.has_value())
+        return matched_type.release_value();
+
+    // 9. If resource’s resource header contains no binary data bytes, return "text/plain".
+    auto const header_has_binary_data = any_of(resource.resource_header(), is_binary_data_byte);
+    if (!header_has_binary_data)
+        return MimeType::create("text"_string, "plain"_string);
+
+    // 10. Return "application/octet-stream".
+    return MimeType::create("application"_string, "octet-stream"_string);
+}
+
+}
+
 namespace Web::MimeSniff {
 
 ErrorOr<Resource> Resource::create(ReadonlyBytes data, SniffingConfiguration configuration)
@@ -105,10 +335,7 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
     //    identifying an unknown MIME type with the sniff-scriptable flag equal to the
     //    inverse of the no-sniff flag and abort these steps.
     if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) {
-
-        // FIXME: Execute the rules for identifying an unknown MIME type with the
-        // sniff-scriptable flag equal to the inverse of the no-sniff flag and abort
-        // these steps.
+        m_computed_mime_type = TRY(rules_for_identifying_an_unknown_mime_type(*this, !m_no_sniff));
         return {};
     }