diff --git a/Tests/LibXML/TestParser.cpp b/Tests/LibXML/TestParser.cpp
index abf416c9867..62cf81b2122 100644
--- a/Tests/LibXML/TestParser.cpp
+++ b/Tests/LibXML/TestParser.cpp
@@ -41,3 +41,11 @@ TEST_CASE(predefined_character_reference)
     auto const& content = node.children[0]->content.get();
     EXPECT_EQ(content.builder.string_view(), "Well hello &, <, >, ', and \"!");
 }
+
+// Names made of multi-byte UTF-8 code points must be accepted by the parser.
+TEST_CASE(unicode_name)
+{
+    // The attribute name is non-ASCII, so byte-wise Name matching would reject it.
+    XML::Parser parser("<div 中文=\"\"></div>"sv);
+    TRY_OR_FAIL(parser.parse());
+}
diff --git a/Userland/Libraries/LibXML/Parser/Parser.cpp b/Userland/Libraries/LibXML/Parser/Parser.cpp
index 23870c27e86..aa0503cf14b 100644
--- a/Userland/Libraries/LibXML/Parser/Parser.cpp
+++ b/Userland/Libraries/LibXML/Parser/Parser.cpp
@@ -545,16 +545,37 @@ ErrorOr Parser::parse_name()
     auto rule = enter_rule();
 
     // Name ::= NameStartChar (NameChar)*
-    auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
+
+    // FIXME: This is a hacky workaround to read code points instead of bytes.
+    // Replace this once we have a unicode-aware lexer.
+    auto start = m_lexer.tell();
+    StringView remaining = m_lexer.input().substring_view(start);
+    Utf8View view { remaining };
+    auto code_points = view.begin();
+    if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
+        if (m_options.treat_errors_as_fatal)
+            return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
+    }
+
+    // When errors are not fatal we reach this point even without a valid
+    // NameStartChar; the offending code point is still consumed. At end of
+    // input there is nothing to consume, and an exhausted iterator must not be
+    // queried or advanced.
+    if (!code_points.done()) {
+        m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+        ++code_points;
+    }
+
     auto accept = accept_rule();
 
-    auto rest = m_lexer.consume_while(s_name_characters);
-    StringBuilder builder;
-    builder.append(start);
-    builder.append(rest);
+    // Consume the following NameChars one whole code point at a time.
+    while (!code_points.done() && s_name_characters.contains(*code_points)) {
+        m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
+        ++code_points;
+    }
 
     rollback.disarm();
-    return builder.to_byte_string();
+    // The name is the byte range of the input consumed since `start`.
+    return remaining.substring_view(0, m_lexer.tell() - start);
 }
 
 // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl