Quellcode durchsuchen

LibXML+Tests: Consume `>` in the character data ending `]]>` and test it

For example, with this input:
```xml
<C>]]>
```
After seeing `<C>`, the parser will start parsing the content of the
element. The content parser will then parse any character data it sees.

The character parser would see the two `]` characters and consume them.
Then, it would see the `>` and set the state machine to say we have
seen this, but it did _not_ consume it and would instead tell
GenericLexer that it should stop consuming characters. Therefore,
we only consumed 2 characters.

Then, it would see that we are in the state where we've seen the
full `]]>` and try to take off three characters from the end of the
consumed input when we only have 2 characters, causing an assertion
failure as we are asking to take off more characters than there really
are.
Luke Wilde vor 3 Jahren
Ursprung
Commit
adb5f7e485

+ 1 - 0
Tests/CMakeLists.txt

@@ -23,6 +23,7 @@ add_subdirectory(LibTimeZone)
 add_subdirectory(LibUnicode)
 add_subdirectory(LibWasm)
 add_subdirectory(LibWeb)
+add_subdirectory(LibXML)
 if (${SERENITY_ARCH} STREQUAL "i686")
     add_subdirectory(UserspaceEmulator)
 endif()

+ 7 - 0
Tests/LibXML/CMakeLists.txt

@@ -0,0 +1,7 @@
+set(TEST_SOURCES
+    TestParser.cpp
+)
+
+foreach(source IN LISTS TEST_SOURCES)
+    serenity_test("${source}" LibXML LIBS LibXML)
+endforeach()

+ 22 - 0
Tests/LibXML/TestParser.cpp

@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+#include <LibXML/Parser/Parser.h>
+
+TEST_CASE(char_data_ending)
+{
+    EXPECT_NO_CRASH("parsing character data ending by itself should not crash", [] {
+        // Regression test for the input `<C>]]>`. After seeing `<C>`, the parser starts parsing the element's content, which includes any character data.
+        // The character data parser used to consume the two `]` characters, then see the `>` and record in its state machine that the full `]]>` was seen,
+        // but it did _not_ consume the `>`; instead it told GenericLexer to stop consuming characters. Therefore, only 2 characters were consumed.
+        // It would then notice that it is in the "seen the full `]]>`" state and try to take three characters off the end of the consumed input,
+        // which only holds 2 characters, causing an assertion failure because it asks to remove more characters than there are.
+        XML::Parser parser("<C>]]>");
+        (void)parser.parse();
+        return Test::Crash::Failure::DidNotCrash;
+    });
+}

+ 2 - 2
Userland/Libraries/LibXML/Parser/Parser.cpp

@@ -891,7 +891,7 @@ ErrorOr<StringView, ParseError> Parser::parse_char_data()
     // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
     auto cend_state = 0; // 1: ], 2: ], 3: >
     auto text = m_lexer.consume_while([&](auto ch) {
-        if (ch == '<' || ch == '&')
+        if (ch == '<' || ch == '&' || cend_state == 3)
             return false;
         switch (cend_state) {
         case 0:
@@ -904,7 +904,7 @@ ErrorOr<StringView, ParseError> Parser::parse_char_data()
         case 2:
             if (ch == '>') {
                 cend_state++;
-                return false;
+                return true;
             }
             cend_state = 0;
             return true;