소스 검색

LibWeb: Implement HTML fragment serialisation and use it in innerHTML

The previous implementation was only about half complete and was
tied to Element::innerHTML. This separates it out and moves it into
HTMLDocumentParser, as the algorithm lives in the parsing section of the spec.

This provides a nearly complete HTML fragment serialisation algorithm;
the only missing parts are namespaced attributes and the `is` value.
Luke Wilde 3 년 전
부모
커밋
f62477c093

+ 14 - 54
Userland/Libraries/LibWeb/DOM/Element.cpp

@@ -256,62 +256,10 @@ void Element::set_inner_html(StringView markup)
     document().invalidate_layout();
 }
 
+// https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
 String Element::inner_html() const
 {
-    auto escape_string = [](const StringView& string, bool attribute_mode) -> String {
-        // https://html.spec.whatwg.org/multipage/parsing.html#escapingString
-        StringBuilder builder;
-        for (auto& ch : string) {
-            if (ch == '&')
-                builder.append("&amp;");
-            // FIXME: also replace U+00A0 NO-BREAK SPACE with &nbsp;
-            else if (ch == '"' && attribute_mode)
-                builder.append("&quot;");
-            else if (ch == '<' && !attribute_mode)
-                builder.append("&lt;");
-            else if (ch == '>' && !attribute_mode)
-                builder.append("&gt;");
-            else
-                builder.append(ch);
-        }
-        return builder.to_string();
-    };
-
-    StringBuilder builder;
-
-    Function<void(const Node&)> recurse = [&](auto& node) {
-        for (auto* child = node.first_child(); child; child = child->next_sibling()) {
-            if (child->is_element()) {
-                auto& element = verify_cast<Element>(*child);
-                builder.append('<');
-                builder.append(element.local_name());
-                element.for_each_attribute([&](auto& name, auto& value) {
-                    builder.append(' ');
-                    builder.append(name);
-                    builder.append('=');
-                    builder.append('"');
-                    builder.append(escape_string(value, true));
-                    builder.append('"');
-                });
-                builder.append('>');
-
-                recurse(*child);
-
-                // FIXME: This should be skipped for void elements
-                builder.append("</");
-                builder.append(element.local_name());
-                builder.append('>');
-            }
-            if (child->is_text()) {
-                auto& text = verify_cast<Text>(*child);
-                builder.append(escape_string(text.data(), false));
-            }
-            // FIXME: Also handle Comment, ProcessingInstruction, DocumentType
-        }
-    };
-    recurse(*this);
-
-    return builder.to_string();
+    return serialize_fragment(/* FIXME: Providing true for the require well-formed flag (which may throw) */);
 }
 
 bool Element::is_focused() const
@@ -376,4 +324,16 @@ void Element::queue_an_element_task(HTML::Task::Source source, Function<void()>
     HTML::main_thread_event_loop().task_queue().add(move(task));
 }
 
+// Reports whether this element's local name is one of the HTML void elements.
+// https://html.spec.whatwg.org/multipage/syntax.html#void-elements
+bool Element::is_void_element() const
+{
+    auto const& tag = local_name();
+    return tag.is_one_of(
+        HTML::TagNames::area, HTML::TagNames::base, HTML::TagNames::br, HTML::TagNames::col,
+        HTML::TagNames::embed, HTML::TagNames::hr, HTML::TagNames::img, HTML::TagNames::input,
+        HTML::TagNames::link, HTML::TagNames::meta, HTML::TagNames::param, HTML::TagNames::source,
+        HTML::TagNames::track, HTML::TagNames::wbr);
+}
+
+// Reports whether this element "serializes as void": either it is a void element,
+// or its local name is one of basefont, bgsound, frame or keygen.
+// https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void
+bool Element::serializes_as_void() const
+{
+    if (is_void_element())
+        return true;
+
+    return local_name().is_one_of(HTML::TagNames::basefont, HTML::TagNames::bgsound, HTML::TagNames::frame, HTML::TagNames::keygen);
+}
+
 }

+ 3 - 0
Userland/Libraries/LibWeb/DOM/Element.h

@@ -108,6 +108,9 @@ public:
 
     void queue_an_element_task(HTML::Task::Source, Function<void()>);
 
+    bool is_void_element() const;
+    bool serializes_as_void() const;
+
 protected:
     RefPtr<Layout::Node> create_layout_node() override;
 

+ 13 - 0
Userland/Libraries/LibWeb/DOM/Node.cpp

@@ -24,6 +24,7 @@
 #include <LibWeb/DOM/ProcessingInstruction.h>
 #include <LibWeb/DOM/ShadowRoot.h>
 #include <LibWeb/HTML/HTMLAnchorElement.h>
+#include <LibWeb/HTML/Parser/HTMLDocumentParser.h>
 #include <LibWeb/Layout/InitialContainingBlock.h>
 #include <LibWeb/Layout/Node.h>
 #include <LibWeb/Layout/TextNode.h>
@@ -768,6 +769,18 @@ void Node::string_replace_all(String const& string)
     replace_all(node);
 }
 
+// https://w3c.github.io/DOM-Parsing/#dfn-fragment-serializing-algorithm
+// Returns the serialization of this node's children. Only the HTML serialization path is implemented.
+String Node::serialize_fragment(/* FIXME: Requires well-formed flag */) const
+{
+    // FIXME: Let context document be the value of node's node document, then choose the serialization from it:
+    //        - If context document is an HTML document, return an HTML serialization of node.
+    //          (We currently always do this)
+    //        - Otherwise, context document is an XML document; return an XML serialization of node
+    //          passing the flag require well-formed.
+    auto const& node = *this;
+    return HTML::HTMLDocumentParser::serialize_html_fragment(node);
+}
+
 // https://dom.spec.whatwg.org/#dom-node-issamenode
 bool Node::is_same_node(Node const* other_node) const
 {

+ 2 - 0
Userland/Libraries/LibWeb/DOM/Node.h

@@ -183,6 +183,8 @@ public:
     i32 id() const { return m_id; }
     static Node* from_id(i32 node_id);
 
+    String serialize_fragment() const;
+
     void replace_all(RefPtr<Node>);
     void string_replace_all(String const&);
 

+ 200 - 0
Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * Copyright (c) 2021, Luke Wilde <lukew@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -13,6 +14,7 @@
 #include <LibWeb/DOM/DocumentType.h>
 #include <LibWeb/DOM/ElementFactory.h>
 #include <LibWeb/DOM/Event.h>
+#include <LibWeb/DOM/ProcessingInstruction.h>
 #include <LibWeb/DOM/Text.h>
 #include <LibWeb/DOM/Window.h>
 #include <LibWeb/HTML/EventLoop/EventLoop.h>
@@ -3054,4 +3056,202 @@ NonnullOwnPtr<HTMLDocumentParser> HTMLDocumentParser::create_with_uncertain_enco
     return make<HTMLDocumentParser>(document, input, encoding);
 }
 
+// https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-serialisation-algorithm
+// Serializes the children of `node` (which must be an Element, Document or DocumentFragment) into an HTML string.
+// `node` itself is not emitted; child elements are serialized by recursing into this function.
+// Returns the empty string when `node` is an element that serializes as void.
+String HTMLDocumentParser::serialize_html_fragment(DOM::Node const& node)
+{
+    // The algorithm takes as input a DOM Element, Document, or DocumentFragment referred to as the node.
+    VERIFY(node.is_element() || node.is_document() || node.is_document_fragment());
+    NonnullRefPtr<DOM::Node> actual_node = node;
+
+    if (is<DOM::Element>(node)) {
+        auto& element = verify_cast<DOM::Element>(node);
+
+        // 1. If the node serializes as void, then return the empty string.
+        //    (NOTE: serializes as void is defined only on elements in the spec)
+        if (element.serializes_as_void())
+            return String::empty();
+
+        // 3. If the node is a template element, then let the node instead be the template element's template contents (a DocumentFragment node).
+        //    (NOTE: This is out of order of the spec to avoid another dynamic cast. The second step just creates a string builder, so it shouldn't matter)
+        if (is<HTML::HTMLTemplateElement>(element))
+            actual_node = verify_cast<HTML::HTMLTemplateElement>(element).content();
+    }
+
+    enum class AttributeMode {
+        No,
+        Yes,
+    };
+
+    auto escape_string = [](StringView const& string, AttributeMode attribute_mode) -> String {
+        // https://html.spec.whatwg.org/multipage/parsing.html#escapingString
+        // NOTE(review): The string is scanned byte by byte and is assumed to hold UTF-8 - confirm for all callers.
+        //               Every character escaped here other than U+00A0 is ASCII, and ASCII bytes never occur inside a
+        //               multi-byte UTF-8 sequence, so comparing single bytes is safe for those.
+        StringBuilder builder;
+        for (size_t index = 0; index < string.length(); ++index) {
+            auto ch = string[index];
+            // 1. Replace any occurrence of the "&" character by the string "&amp;".
+            if (ch == '&')
+                builder.append("&amp;");
+            // 2. Replace any occurrences of the U+00A0 NO-BREAK SPACE character by the string "&nbsp;".
+            //    U+00A0 is encoded in UTF-8 as the two bytes 0xC2 0xA0. Matching a lone 0xA0 byte would leave the 0xC2 lead
+            //    byte behind in the output and would also corrupt every other code point whose encoding contains a 0xA0
+            //    continuation byte (e.g. U+0120, which is 0xC4 0xA0), so the whole two-byte sequence is matched instead.
+            else if (ch == '\xC2' && index + 1 < string.length() && string[index + 1] == '\xA0') {
+                builder.append("&nbsp;");
+                // Consume the 0xA0 continuation byte as well.
+                ++index;
+            }
+            // 3. If the algorithm was invoked in the attribute mode, replace any occurrences of the """ character by the string "&quot;".
+            else if (ch == '"' && attribute_mode == AttributeMode::Yes)
+                builder.append("&quot;");
+            // 4. If the algorithm was not invoked in the attribute mode, replace any occurrences of the "<" character by the string "&lt;", and any occurrences of the ">" character by the string "&gt;".
+            else if (ch == '<' && attribute_mode == AttributeMode::No)
+                builder.append("&lt;");
+            else if (ch == '>' && attribute_mode == AttributeMode::No)
+                builder.append("&gt;");
+            else
+                builder.append(ch);
+        }
+        return builder.to_string();
+    };
+
+    // 2. Let s be a string, and initialize it to the empty string.
+    StringBuilder builder;
+
+    // 4. For each child node of the node, in tree order, run the following steps:
+    actual_node->for_each_child([&](DOM::Node& current_node) {
+        // 1. Let current node be the child node being processed.
+
+        // 2. Append the appropriate string from the following list to s:
+
+        if (is<DOM::Element>(current_node)) {
+            // -> If current node is an Element
+            auto& element = verify_cast<DOM::Element>(current_node);
+
+            // 1. If current node is an element in the HTML namespace, the MathML namespace, or the SVG namespace, then let tagname be current node's local name.
+            //    Otherwise, let tagname be current node's qualified name.
+            String tag_name;
+
+            if (element.namespace_().is_one_of(Namespace::HTML, Namespace::MathML, Namespace::SVG))
+                tag_name = element.local_name();
+            else
+                tag_name = element.qualified_name();
+
+            // 2. Append a U+003C LESS-THAN SIGN character (<), followed by tagname.
+            builder.append('<');
+            builder.append(tag_name);
+
+            // FIXME: 3. If current node's is value is not null, and the element does not have an is attribute in its attribute list,
+            //           then append the string " is="", followed by current node's is value escaped as described below in attribute mode,
+            //           followed by a U+0022 QUOTATION MARK character (").
+
+            // 4. For each attribute that the element has, append a U+0020 SPACE character, the attribute's serialized name as described below, a U+003D EQUALS SIGN character (=),
+            //    a U+0022 QUOTATION MARK character ("), the attribute's value, escaped as described below in attribute mode, and a second U+0022 QUOTATION MARK character (").
+            //    NOTE: The order of attributes is implementation-defined. The only constraint is that the order must be stable.
+            element.for_each_attribute([&](auto& name, auto& value) {
+                builder.append(' ');
+
+                // An attribute's serialized name for the purposes of the previous paragraph must be determined as follows:
+
+                // FIXME: -> If the attribute has no namespace:
+                //              The attribute's serialized name is the attribute's local name.
+                //           (We currently always do this)
+                builder.append(name);
+
+                // FIXME: -> If the attribute is in the XML namespace:
+                //             The attribute's serialized name is the string "xml:" followed by the attribute's local name.
+
+                // FIXME: -> If the attribute is in the XMLNS namespace and the attribute's local name is xmlns:
+                //             The attribute's serialized name is the string "xmlns".
+
+                // FIXME: -> If the attribute is in the XMLNS namespace and the attribute's local name is not xmlns:
+                //             The attribute's serialized name is the string "xmlns:" followed by the attribute's local name.
+
+                // FIXME: -> If the attribute is in the XLink namespace:
+                //             The attribute's serialized name is the string "xlink:" followed by the attribute's local name.
+
+                // FIXME: -> If the attribute is in some other namespace:
+                //             The attribute's serialized name is the attribute's qualified name.
+
+                builder.append("=\"");
+                builder.append(escape_string(value, AttributeMode::Yes));
+                builder.append('"');
+            });
+
+            // 5. Append a U+003E GREATER-THAN SIGN character (>).
+            builder.append('>');
+
+            // 6. If current node serializes as void, then continue on to the next child node at this point.
+            if (element.serializes_as_void())
+                return IterationDecision::Continue;
+
+            // 7. Append the value of running the HTML fragment serialization algorithm on the current node element (thus recursing into this algorithm for that element),
+            //    followed by a U+003C LESS-THAN SIGN character (<), a U+002F SOLIDUS character (/), tagname again, and finally a U+003E GREATER-THAN SIGN character (>).
+            builder.append(serialize_html_fragment(element));
+            builder.append("</");
+            builder.append(tag_name);
+            builder.append('>');
+
+            return IterationDecision::Continue;
+        }
+
+        if (is<DOM::Text>(current_node)) {
+            // -> If current node is a Text node
+            auto& text_node = verify_cast<DOM::Text>(current_node);
+            auto* parent = current_node.parent();
+
+            if (is<DOM::Element>(parent)) {
+                auto& parent_element = verify_cast<DOM::Element>(*parent);
+
+                // 1. If the parent of current node is a style, script, xmp, iframe, noembed, noframes, or plaintext element,
+                //    or if the parent of current node is a noscript element and scripting is enabled for the node, then append the value of current node's data IDL attribute literally.
+                if (parent_element.local_name().is_one_of(HTML::TagNames::style, HTML::TagNames::script, HTML::TagNames::xmp, HTML::TagNames::iframe, HTML::TagNames::noembed, HTML::TagNames::noframes, HTML::TagNames::plaintext)
+                    || (parent_element.local_name() == HTML::TagNames::noscript && !parent_element.is_scripting_disabled())) {
+                    builder.append(text_node.data());
+                    return IterationDecision::Continue;
+                }
+            }
+
+            // 2. Otherwise, append the value of current node's data IDL attribute, escaped as described below.
+            builder.append(escape_string(text_node.data(), AttributeMode::No));
+            return IterationDecision::Continue;
+        }
+
+        if (is<DOM::Comment>(current_node)) {
+            // -> If current node is a Comment
+            auto& comment_node = verify_cast<DOM::Comment>(current_node);
+
+            // 1. Append the literal string "<!--" (U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS),
+            //    followed by the value of current node's data IDL attribute, followed by the literal string "-->" (U+002D HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN).
+            builder.append("<!--");
+            builder.append(comment_node.data());
+            builder.append("-->");
+            return IterationDecision::Continue;
+        }
+
+        if (is<DOM::ProcessingInstruction>(current_node)) {
+            // -> If current node is a ProcessingInstruction
+            auto& processing_instruction_node = verify_cast<DOM::ProcessingInstruction>(current_node);
+
+            // 1. Append the literal string "<?" (U+003C LESS-THAN SIGN, U+003F QUESTION MARK), followed by the value of current node's target IDL attribute,
+            //    followed by a single U+0020 SPACE character, followed by the value of current node's data IDL attribute, followed by a single U+003E GREATER-THAN SIGN character (>).
+            builder.append("<?");
+            builder.append(processing_instruction_node.target());
+            builder.append(' ');
+            builder.append(processing_instruction_node.data());
+            builder.append('>');
+            return IterationDecision::Continue;
+        }
+
+        if (is<DOM::DocumentType>(current_node)) {
+            // -> If current node is a DocumentType
+            auto& document_type_node = verify_cast<DOM::DocumentType>(current_node);
+
+            // 1. Append the literal string "<!DOCTYPE" (U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+0044 LATIN CAPITAL LETTER D, U+004F LATIN CAPITAL LETTER O,
+            //    U+0043 LATIN CAPITAL LETTER C, U+0054 LATIN CAPITAL LETTER T, U+0059 LATIN CAPITAL LETTER Y, U+0050 LATIN CAPITAL LETTER P, U+0045 LATIN CAPITAL LETTER E),
+            //    followed by a space (U+0020 SPACE), followed by the value of current node's name IDL attribute, followed by the literal string ">" (U+003E GREATER-THAN SIGN).
+            builder.append("<!DOCTYPE ");
+            builder.append(document_type_node.name());
+            builder.append('>');
+            return IterationDecision::Continue;
+        }
+
+        // Any other node type contributes nothing to the output.
+        return IterationDecision::Continue;
+    });
+
+    // 5. Return s.
+    return builder.to_string();
+}
+
 }

+ 1 - 0
Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h

@@ -53,6 +53,7 @@ public:
     DOM::Document& document();
 
     static NonnullRefPtrVector<DOM::Node> parse_html_fragment(DOM::Element& context_element, const StringView&);
+    static String serialize_html_fragment(DOM::Node const& node);
 
     enum class InsertionMode {
 #define __ENUMERATE_INSERTION_MODE(mode) mode,