LibXML: Read code points when parsing names

Author: https://github.com/Gingeh · Commit: https://github.com/LadybirdBrowser/ladybird/commit/453e0348010 · Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/2192 · Reviewed-by: https://github.com/alimpfard
2024-11-21 23:20:20 +00:00 · 2024-11-06 18:40:14 +11:00 · 2024-11-06 18:40:14 +11:00 · 453e034801 · 2024-11-06 09:09:03 +00:00
commit 453e034801
parent 42b31820a6
2 changed files with 26 additions and 6 deletions
--- a/Tests/LibXML/TestParser.cpp
+++ b/Tests/LibXML/TestParser.cpp
@@ -41,3 +41,9 @@ TEST_CASE(predefined_character_reference)
    auto const& content = node.children[0]->content.get<XML::Node::Text>();
    EXPECT_EQ(content.builder.string_view(), "Well hello &, <, >, ', and \"!");
 }
+
+TEST_CASE(unicode_name)
+{
+    XML::Parser parser("<div 中文=\"\"></div>"sv);
+    TRY_OR_FAIL(parser.parse());
+}
--- a/Userland/Libraries/LibXML/Parser/Parser.cpp
+++ b/Userland/Libraries/LibXML/Parser/Parser.cpp
@@ -545,16 +545,30 @@ ErrorOr<Name, ParseError> Parser::parse_name()
    auto rule = enter_rule();

    // Name ::= NameStartChar (NameChar)*
-    auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
+
+    // FIXME: This is a hacky workaround to read code points instead of bytes.
+    // Replace this once we have a unicode-aware lexer.
+    auto start = m_lexer.tell();
+    StringView remaining = m_lexer.input().substring_view(start);
+    Utf8View view { remaining };
+    auto code_points = view.begin();
+    if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
+        if (m_options.treat_errors_as_fatal)
+            return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
+    }
+
+    m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+    ++code_points;
+
    auto accept = accept_rule();

-    auto rest = m_lexer.consume_while(s_name_characters);
-    StringBuilder builder;
-    builder.append(start);
-    builder.append(rest);
+    while (!code_points.done() && s_name_characters.contains(*code_points)) {
+        m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+        ++code_points;
+    }

    rollback.disarm();
-    return builder.to_byte_string();
+    return remaining.substring_view(0, m_lexer.tell() - start);
 }

 // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl