Преглед на файлове

LibWeb: Set the correct end position of HTML attribute names

We were previously setting the end position of attribute names in self-
closing HTML tags to the end of the attribute value. To illustrate the
previous behavior, consider this tag and its attribute's start and end
positions (shown inclusively below):

    <meta charset="UTF-8" />
          ^ name start
                  ^ value start
                        ^ value end
                        ^ name end

Rather than setting the end position of the attribute name when we parse
the closing slash, ensure the end position is already set while we are
in the AttributeName state. We now have:

    <meta charset="UTF-8" />
          ^ name start
                ^ name end
                  ^ value start
                        ^ value end

The tokenizer unit test has been extended to test these positions.
Timothy Flynn преди 1 година
родител
ревизия
5a2bf7fdd1
променени са 3 файла, в които са добавени 46 реда и са изтрити 19 реда
  1. 33 15
      Tests/LibWeb/TestHTMLTokenizer.cpp
  2. 8 1
      Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
  3. 5 3
      Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp

+ 33 - 15
Tests/LibWeb/TestHTMLTokenizer.cpp

@@ -59,9 +59,15 @@ using Token = Web::HTML::HTMLToken;
     EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
     NEXT_TOKEN();
 
-#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \
-    VERIFY(last_token);                         \
-    EXPECT_EQ(last_token->attribute(#name), value);
+#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, attribute_value, name_start_column, name_end_column, value_start_column, value_end_column) \
+    VERIFY(last_token);                                                                                                             \
+    auto name##_attr = last_token->raw_attribute(#name);                                                                            \
+    VERIFY(name##_attr.has_value());                                                                                                \
+    EXPECT_EQ(name##_attr->value, attribute_value);                                                                                 \
+    EXPECT_EQ(name##_attr->name_start_position.column, name_start_column);                                                          \
+    EXPECT_EQ(name##_attr->name_end_position.column, name_end_column);                                                              \
+    EXPECT_EQ(name##_attr->value_start_position.column, value_start_column);                                                        \
+    EXPECT_EQ(name##_attr->value_end_position.column, value_end_column);
 
 #define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \
     VERIFY(last_token);                         \
@@ -128,7 +134,7 @@ TEST_CASE(unquoted_attributes)
     BEGIN_ENUMERATION(tokens);
     EXPECT_START_TAG_TOKEN(p, 1u, 10u);
     EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 10u);
     EXPECT_END_OF_FILE_TOKEN();
     END_ENUMERATION();
 }
@@ -139,7 +145,7 @@ TEST_CASE(single_quoted_attributes)
     BEGIN_ENUMERATION(tokens);
     EXPECT_START_TAG_TOKEN(p, 1u, 12u);
     EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u);
     EXPECT_END_OF_FILE_TOKEN();
     END_ENUMERATION();
 }
@@ -150,20 +156,32 @@ TEST_CASE(double_quoted_attributes)
     BEGIN_ENUMERATION(tokens);
     EXPECT_START_TAG_TOKEN(p, 1u, 12u);
     EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u);
+    EXPECT_END_OF_FILE_TOKEN();
+    END_ENUMERATION();
+}
+
+TEST_CASE(valueless_attribute)
+{
+    auto tokens = run_tokenizer("<p foo>"sv);
+    BEGIN_ENUMERATION(tokens);
+    EXPECT_START_TAG_TOKEN(p, 1u, 6u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "", 3u, 6u, 0u, 0u);
     EXPECT_END_OF_FILE_TOKEN();
     END_ENUMERATION();
 }
 
 TEST_CASE(multiple_attributes)
 {
-    auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv);
+    auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar biz foo2=\"bar2\">"sv);
     BEGIN_ENUMERATION(tokens);
-    EXPECT_START_TAG_TOKEN(p, 1u, 35u);
-    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
-    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
+    EXPECT_START_TAG_TOKEN(p, 1u, 39u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(4);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar", 13u, 16u, 17u, 23u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(biz, "", 24u, 27u, 0u, 0u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2", 28u, 32u, 33u, 39u);
     EXPECT_END_OF_FILE_TOKEN();
     END_ENUMERATION();
 }
@@ -174,9 +192,9 @@ TEST_CASE(character_reference_in_attribute)
     BEGIN_ENUMERATION(tokens);
     EXPECT_START_TAG_TOKEN(p, 1u, 43u);
     EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
-    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
-    EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
-    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
+    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b", 3u, 6u, 7u, 14u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b", 15u, 18u, 19u, 28u);
+    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b", 29u, 32u, 33u, 43u);
     EXPECT_END_OF_FILE_TOKEN();
     END_ENUMERATION();
 }

+ 8 - 1
Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h

@@ -246,6 +246,13 @@ public:
     }
 
     StringView attribute(DeprecatedFlyString const& attribute_name) const
+    {
+        if (auto result = raw_attribute(attribute_name); result.has_value())
+            return result->value;
+        return {};
+    }
+
+    Optional<Attribute const&> raw_attribute(DeprecatedFlyString const& attribute_name) const
     {
         VERIFY(is_start_tag() || is_end_tag());
 
@@ -254,7 +261,7 @@ public:
             return {};
         for (auto& attribute : *ptr) {
             if (attribute_name == attribute.local_name)
-                return attribute.value;
+                return attribute;
         }
         return {};
     }

+ 5 - 3
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp

@@ -1051,8 +1051,6 @@ _StartOfFunction:
                 }
                 ON('/')
                 {
-                    if (m_current_token.has_attributes())
-                        m_current_token.last_attribute().name_end_position = nth_last_position(1);
                     RECONSUME_IN(AfterAttributeName);
                 }
                 ON('>')
@@ -1108,21 +1106,25 @@ _StartOfFunction:
             {
                 ON_WHITESPACE
                 {
+                    m_current_token.last_attribute().name_end_position = nth_last_position(1);
                     m_current_token.last_attribute().local_name = consume_current_builder();
                     RECONSUME_IN(AfterAttributeName);
                 }
                 ON('/')
                 {
+                    m_current_token.last_attribute().name_end_position = nth_last_position(1);
                     m_current_token.last_attribute().local_name = consume_current_builder();
                     RECONSUME_IN(AfterAttributeName);
                 }
                 ON('>')
                 {
+                    m_current_token.last_attribute().name_end_position = nth_last_position(1);
                     m_current_token.last_attribute().local_name = consume_current_builder();
                     RECONSUME_IN(AfterAttributeName);
                 }
                 ON_EOF
                 {
+                    m_current_token.last_attribute().name_end_position = nth_last_position(1);
                     m_current_token.last_attribute().local_name = consume_current_builder();
                     RECONSUME_IN(AfterAttributeName);
                 }
@@ -1196,7 +1198,7 @@ _StartOfFunction:
                 {
                     m_current_token.add_attribute({});
                     if (!m_source_positions.is_empty())
-                        m_current_token.last_attribute().name_start_position = m_source_positions.last();
+                        m_current_token.last_attribute().name_start_position = nth_last_position(1);
                     RECONSUME_IN(AttributeName);
                 }
             }