AK: Stop using U+0000 as end of file code point in URL parser

This changes the URL parser to use the constant 0xFFFFFFFF instead of 0 to indicate end of file. This fixes a bug where inputs containing null bytes would terminate parsing early, because the null bytes were interpreted as end of file.
Author: https://github.com/MaxWipfli Commit: https://github.com/SerenityOS/serenity/commit/2164d8aae8a Pull-request: https://github.com/SerenityOS/serenity/pull/7791 Reviewed-by: https://github.com/awesomekling
2024-11-21 23:20:20 +00:00 · 2021-06-03 12:43:08 +02:00 · 2021-06-03 12:43:08 +02:00 · 2164d8aae8 · 2024-07-18 16:52:00 +09:00
commit 2164d8aae8
parent 97425c7dfb
1 changed files with 18 additions and 16 deletions
--- a/AK/URLParser.cpp
+++ b/AK/URLParser.cpp
@ -16,6 +16,9 @@

 namespace AK {

+// NOTE: Sentinel meaning "no more input"; similar to the LibC macro EOF = -1. 0xFFFFFFFF lies above U+10FFFF, the largest Unicode code point, so unlike U+0000 it cannot be confused with a code point present in the input.
+constexpr u32 end_of_file = 0xFFFFFFFF;
+
 constexpr bool is_url_code_point(u32 code_point)
 {
    // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
@ -221,14 +224,13 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
    //       ++iterator : "increase pointer by 1"
    //       continue   : "decrease pointer by 1"
-    // NOTE: The NULL code point is used as the "EOF code point".
    for (;;) {
-        u32 code_point = 0;
+        u32 code_point = end_of_file;
        if (!iterator.done())
            code_point = *iterator;

        if constexpr (URL_PARSER_DEBUG) {
-            if (!code_point)
+            if (code_point == end_of_file)
                dbgln("URLParser::parse: {} state with EOF.", state_name(state));
            else if (is_ascii_printable(code_point))
                dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
@ -335,7 +337,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                } else if (code_point == '#') {
                    url.m_fragment = "";
                    state = State::Fragment;
-                } else if (code_point != 0) {
+                } else if (code_point != end_of_file) {
                    url.m_query = {};
                    if (url.m_paths.size())
                        url.m_paths.remove(url.m_paths.size() - 1);
@ -408,7 +410,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                    }
                }
                buffer.clear();
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                if (at_sign_seen && buffer.is_empty()) {
                    report_validation_error();
                    return {};
@ -434,7 +436,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                url.m_host = host.release_value();
                buffer.clear();
                state = State::Port;
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                if (url.is_special() && buffer.is_empty()) {
                    report_validation_error();
                    return {};
@ -457,7 +459,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
        case State::Port:
            if (is_ascii_digit(code_point)) {
                buffer.append_code_point(code_point);
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                if (!buffer.is_empty()) {
                    auto port = buffer.to_string().to_uint();
                    if (!port.has_value() || port.value() > 65535) {
@ -494,7 +496,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                } else if (code_point == '#') {
                    url.m_fragment = "";
                    state = State::Fragment;
-                } else if (code_point != 0) {
+                } else if (code_point != end_of_file) {
                    url.m_query = {};
                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
@ -524,7 +526,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
            }
            break;
        case State::FileHost:
-            if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
+            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
                if (is_windows_drive_letter(buffer.to_string())) {
                    report_validation_error();
                    state = State::Path;
@ -559,14 +561,14 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
            } else if (code_point == '#') {
                url.m_fragment = "";
                state = State::Fragment;
-            } else if (code_point != 0) {
+            } else if (code_point != end_of_file) {
                state = State::Path;
                if (code_point != '/')
                    continue;
            }
            break;
        case State::Path:
-            if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
+            if (code_point == end_of_file || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
                if (url.is_special() && code_point == '\\')
                    report_validation_error();
                if (is_double_dot_path_segment(buffer.to_string())) {
@ -616,10 +618,10 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                url.m_fragment = "";
                state = State::Fragment;
            } else {
-                if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%')
+                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
-                if (code_point != 0) {
+                if (code_point != end_of_file) {
                    URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
                } else {
                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
@ -628,7 +630,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
            }
            break;
        case State::Query:
-            if (code_point == '#' || code_point == 0) {
+            if (code_point == end_of_file || code_point == '#') {
                VERIFY(url.m_query == "");
                auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
                // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
@ -638,7 +640,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                    url.m_fragment = "";
                    state = State::Fragment;
                }
-            } else if (code_point != 0) {
+            } else if (code_point != end_of_file) {
                if (!is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
@ -647,7 +649,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
            break;
        case State::Fragment:
            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
-            if (code_point) {
+            if (code_point != end_of_file) {
                if (!is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.