4 سال پیش · f808279769
--- a/Userland/Libraries/LibWeb/CMakeLists.txt
+++ b/Userland/Libraries/LibWeb/CMakeLists.txt
@@ -147,6 +147,7 @@ set(SOURCES
 
				     HTML/ImageData.cpp
			
 
				     HTML/Parser/Entities.cpp
			
 
				     HTML/Parser/HTMLDocumentParser.cpp
			
 
				+    HTML/Parser/HTMLEncodingDetection.cpp
			
 
				     HTML/Parser/HTMLToken.cpp
			
 
				     HTML/Parser/HTMLTokenizer.cpp
			
 
				     HTML/Parser/ListOfActiveFormattingElements.cpp
			
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
@@ -22,6 +22,7 @@
 
				 #include <LibWeb/HTML/HTMLTableElement.h>
			
 
				 #include <LibWeb/HTML/HTMLTemplateElement.h>
			
 
				 #include <LibWeb/HTML/Parser/HTMLDocumentParser.h>
			
 
				+#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
			
 
				 #include <LibWeb/HTML/Parser/HTMLToken.h>
			
 
				 #include <LibWeb/Namespace.h>
			
 
				 #include <LibWeb/SVG/TagNames.h>
			
@@ -3039,4 +3040,14 @@ NonnullRefPtrVector<DOM::Node> HTMLDocumentParser::parse_html_fragment(DOM::Elem
 
				     }
			
 
				     return children;
			
 
				 }
			
 
				+
			
 
				+NonnullOwnPtr<HTMLDocumentParser> HTMLDocumentParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input)
			
 
				+{
			
 
				+    if (document.has_encoding())
			
 
				+        return make<HTMLDocumentParser>(document, input, document.encoding().value());
			
 
				+    auto encoding = run_encoding_sniffing_algorithm(input);
			
 
				+    dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding);
			
 
				+    return make<HTMLDocumentParser>(document, input, encoding);
			
 
				+}
			
 
				+
			
 
				 }
			
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h
@@ -46,6 +46,8 @@ public:
 
				     HTMLDocumentParser(DOM::Document&, const StringView& input, const String& encoding);
			
 
				     ~HTMLDocumentParser();
			
 
				 
			
 
				+    static NonnullOwnPtr<HTMLDocumentParser> create_with_uncertain_encoding(DOM::Document&, const ByteBuffer& input);
			
 
				+
			
 
				     void run(const URL&);
			
 
				 
			
 
				     DOM::Document& document();
			
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@@ -0,0 +1,223 @@
 
				+/*
			
 
				+ * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: BSD-2-Clause
			
 
				+ */
			
 
				+
			
 
				+#include <AK/StringView.h>
			
 
				+#include <AK/Utf8View.h>
			
 
				+#include <LibTextCodec/Decoder.h>
			
 
				+#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
			
 
				+#include <ctype.h>
			
 
				+
			
 
				+namespace Web::HTML {
			
 
				+
			
 
				+bool prescan_should_abort(const ByteBuffer& input, const size_t& position)
			
 
				+{
			
 
				+    return position >= input.size() || position >= 1024;
			
 
				+}
			
 
				+
			
 
				+bool prescan_is_whitespace_or_slash(const u8& byte)
			
 
				+{
			
 
				+    return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
			
 
				+}
			
 
				+
			
 
				+bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
			
 
				+{
			
 
				+    while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
			
 
				+        ++position;
			
 
				+    return !prescan_should_abort(input, position);
			
 
				+}
			
 
				+
			
 
				+Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
			
 
				+{
			
 
				+    if (!prescan_skip_whitespace_and_slashes(input, position))
			
 
				+        return {};
			
 
				+    if (input[position] == '>')
			
 
				+        return {};
			
 
				+
			
 
				+    StringBuilder attribute_name;
			
 
				+    while (true) {
			
 
				+        if (input[position] == '=' && !attribute_name.is_empty()) {
			
 
				+            ++position;
			
 
				+            goto value;
			
 
				+        } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
			
 
				+            goto spaces;
			
 
				+        else if (input[position] == '/' || input[position] == '>')
			
 
				+            return Attribute(attribute_name.to_string(), "");
			
 
				+        else
			
 
				+            attribute_name.append_as_lowercase(input[position]);
			
 
				+        ++position;
			
 
				+        if (prescan_should_abort(input, position))
			
 
				+            return {};
			
 
				+    }
			
 
				+
			
 
				+spaces:
			
 
				+    if (!prescan_skip_whitespace_and_slashes(input, position))
			
 
				+        return {};
			
 
				+    if (input[position] != '=')
			
 
				+        return Attribute(attribute_name.to_string(), "");
			
 
				+    ++position;
			
 
				+
			
 
				+value:
			
 
				+    if (!prescan_skip_whitespace_and_slashes(input, position))
			
 
				+        return {};
			
 
				+
			
 
				+    StringBuilder attribute_value;
			
 
				+    if (input[position] == '"' || input[position] == '\'') {
			
 
				+        u8 quote_character = input[position];
			
 
				+        ++position;
			
 
				+        for (; !prescan_should_abort(input, position); ++position) {
			
 
				+            if (input[position] == quote_character)
			
 
				+                return Attribute(attribute_name.to_string(), attribute_value.to_string());
			
 
				+            else
			
 
				+                attribute_value.append_as_lowercase(input[position]);
			
 
				+        }
			
 
				+        return {};
			
 
				+    } else if (input[position] == '>')
			
 
				+        return Attribute(attribute_name.to_string(), "");
			
 
				+    else
			
 
				+        attribute_value.append_as_lowercase(input[position]);
			
 
				+
			
 
				+    ++position;
			
 
				+    if (prescan_should_abort(input, position))
			
 
				+        return {};
			
 
				+
			
 
				+    for (; !prescan_should_abort(input, position); ++position) {
			
 
				+        if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
			
 
				+            return Attribute(attribute_name.to_string(), attribute_value.to_string());
			
 
				+        else
			
 
				+            attribute_value.append_as_lowercase(input[position]);
			
 
				+    }
			
 
				+    return {};
			
 
				+}
			
 
				+
			
 
				+// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
			
 
				+Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
			
 
				+{
			
 
				+    // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
			
 
				+
			
 
				+    // Detects '<?x'
			
 
				+    if (!prescan_should_abort(input, 6)) {
			
 
				+        if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
			
 
				+            return "utf-16le";
			
 
				+        if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
			
 
				+            return "utf-16be";
			
 
				+    }
			
 
				+
			
 
				+    for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
			
 
				+        if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
			
 
				+            && input[position + 2] == '-' && input[position + 3] == '-') {
			
 
				+            position += 2;
			
 
				+            for (; !prescan_should_abort(input, position + 3); ++position) {
			
 
				+                if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
			
 
				+                    position += 2;
			
 
				+                    break;
			
 
				+                }
			
 
				+            }
			
 
				+        } else if (!prescan_should_abort(input, position + 6)
			
 
				+            && input[position] == '<'
			
 
				+            && (input[position + 1] == 'M' || input[position + 1] == 'm')
			
 
				+            && (input[position + 2] == 'E' || input[position + 2] == 'e')
			
 
				+            && (input[position + 3] == 'T' || input[position + 3] == 't')
			
 
				+            && (input[position + 4] == 'A' || input[position + 4] == 'a')
			
 
				+            && prescan_is_whitespace_or_slash(input[position + 5])) {
			
 
				+            position += 6;
			
 
				+            Vector<String> attribute_list {};
			
 
				+            bool got_pragma = false;
			
 
				+            Optional<bool> need_pragma {};
			
 
				+            Optional<String> charset {};
			
 
				+
			
 
				+            while (true) {
			
 
				+                auto attribute = prescan_get_attribute(input, position);
			
 
				+                if (!attribute.has_value())
			
 
				+                    break;
			
 
				+                if (attribute_list.contains_slow(attribute.value().name()))
			
 
				+                    continue;
			
 
				+                auto& attribute_name = attribute.value().name();
			
 
				+                attribute_list.append(attribute.value().name());
			
 
				+
			
 
				+                if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
			
 
				+                    got_pragma = true;
			
 
				+                else if (attribute_name == "charset") {
			
 
				+                    auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
			
 
				+                    if (maybe_charset.has_value()) {
			
 
				+                        charset = Optional<String> { maybe_charset };
			
 
				+                        need_pragma = { false };
			
 
				+                    }
			
 
				+                }
			
 
				+
			
 
				+                // FIXME: For attribute name "content", do this:
			
 
				+                //        Apply the "algorithm for extracting a character encoding from a meta
			
 
				+                //        element", giving the attribute's value as the string to parse. If a
			
 
				+                //        character encoding is returned, and if charset is still set to null,
			
 
				+                //        let charset be the encoding returned, and set need pragma to true.
			
 
				+            }
			
 
				+
			
 
				+            if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
			
 
				+                continue;
			
 
				+            if (charset.value() == "UTF-16BE/LE")
			
 
				+                return "UTF-8";
			
 
				+            else if (charset.value() == "x-user-defined")
			
 
				+                return "windows-1252";
			
 
				+            else
			
 
				+                return charset.value();
			
 
				+        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
			
 
				+            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
			
 
				+            position += 2;
			
 
				+            prescan_skip_whitespace_and_slashes(input, position);
			
 
				+            while (prescan_get_attribute(input, position).has_value()) { };
			
 
				+        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
			
 
				+            position += 2;
			
 
				+            while (input[position] != '>') {
			
 
				+                ++position;
			
 
				+                if (prescan_should_abort(input, position))
			
 
				+                    return {};
			
 
				+            }
			
 
				+        } else {
			
 
				+            // Do nothing.
			
 
				+        }
			
 
				+    }
			
 
				+    return {};
			
 
				+}
			
 
				+
			
 
				+// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
			
 
				+String run_encoding_sniffing_algorithm(const ByteBuffer& input)
			
 
				+{
			
 
				+    if (input.size() >= 2) {
			
 
				+        if (input[0] == 0xFE && input[1] == 0xFF) {
			
 
				+            return "UTF-16BE";
			
 
				+        } else if (input[0] == 0xFF && input[1] == 0xFE) {
			
 
				+            return "UTF-16LE";
			
 
				+        } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
			
 
				+            return "UTF-8";
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // FIXME: If the user has explicitly instructed the user agent to override the document's character
			
 
				+    //        encoding with a specific encoding.
			
 
				+    // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
			
 
				+    //        at any later step in this algorithm.
			
 
				+    // FIXME: If the transport layer specifies a character encoding, and it is supported.
			
 
				+
			
 
				+    auto optional_encoding = run_prescan_byte_stream_algorithm(input);
			
 
				+    if (optional_encoding.has_value()) {
			
 
				+        return optional_encoding.value();
			
 
				+    }
			
 
				+
			
 
				+    // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
			
 
				+    //        is non-null and a child browsing context.
			
 
				+    // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
			
 
				+    //        when it was last visited.
			
 
				+
			
 
				+    if (!Utf8View(StringView(input)).validate()) {
			
 
				+        // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
			
 
				+        return "windows-1252";
			
 
				+    }
			
 
				+
			
 
				+    // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
			
 
				+    //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
			
 
				+    return "UTF-8";
			
 
				+}
			
 
				+
			
 
				+}
			
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@@ -0,0 +1,22 @@
 
				+/*
			
 
				+ * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: BSD-2-Clause
			
 
				+ */
			
 
				+
			
 
				+#pragma once
			
 
				+
			
 
				+#include <AK/Optional.h>
			
 
				+#include <AK/String.h>
			
 
				+#include <LibWeb/DOM/Attribute.h>
			
 
				+
			
 
				+namespace Web::HTML {
			
 
				+
			
 
				// Returns true when `position` is at or past the end of `input`, or at or past byte 1024.
				+bool prescan_should_abort(const ByteBuffer& input, const size_t& position);
			
 
				// Returns true when `byte` is a tab, line feed, form feed, carriage return, space or '/'.
				+bool prescan_is_whitespace_or_slash(const u8& byte);
			
 
				// Advances `position` past whitespace and '/' bytes; returns false if the abort condition was reached.
				+bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position);
			
 
				// Parses one attribute starting at `position`, with name and value lowercased.
				// Returns an empty Optional at a '>' or when the abort condition is reached first.
				+Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position);
			
 
				// Prescans `input` for a UTF-16 "<?x" prefix or a <meta> charset declaration.
				// Returns the encoding name, or an empty Optional if nothing was found.
				+Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input);
			
 
				// Picks an encoding for `input`: byte order mark first, then the prescan result,
				// then "windows-1252" if the bytes are not valid UTF-8, otherwise "UTF-8".
				+String run_encoding_sniffing_algorithm(const ByteBuffer& input);
			
 
				+
			
 
				+}
			
--- a/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp
+++ b/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp
@@ -113,8 +113,8 @@ bool FrameLoader::parse_document(DOM::Document& document, const ByteBuffer& data
 
				 {
			
 
				     auto& mime_type = document.content_type();
			
 
				     if (mime_type == "text/html" || mime_type == "image/svg+xml") {
			
 
				-        HTML::HTMLDocumentParser parser(document, data, document.encoding_or_default());
			
 
				-        parser.run(document.url());
			
 
				+        auto parser = HTML::HTMLDocumentParser::create_with_uncertain_encoding(document, data);
			
 
				+        parser->run(document.url());
			
 
				         return true;
			
 
				     }
			
 
				     if (mime_type.starts_with("image/"))