From 79c77debb029af258353f5b188fd4f07d2c6b73a Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Sun, 10 Apr 2022 00:48:15 +0200 Subject: [PATCH] AK: Don't destructively re-encode query strings in the URL parser We were decoding and then re-encoding the query string in URLs. This round-trip caused us to lose information about plus ('+') ASCII characters encoded as "%2B". --- AK/URL.cpp | 2 +- AK/URL.h | 2 ++ AK/URLParser.cpp | 39 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/AK/URL.cpp b/AK/URL.cpp index 7d4e1ee19e1..e83eac4c7ac 100644 --- a/AK/URL.cpp +++ b/AK/URL.cpp @@ -367,7 +367,7 @@ void URL::append_percent_encoded(StringBuilder& builder, u32 code_point) } // https://url.spec.whatwg.org/#c0-control-percent-encode-set -constexpr bool code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set) +bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set) { switch (set) { case URL::PercentEncodeSet::C0Control: diff --git a/AK/URL.h b/AK/URL.h index 880dadae581..f60c8e1c93a 100644 --- a/AK/URL.h +++ b/AK/URL.h @@ -113,6 +113,8 @@ public: bool operator==(URL const& other) const { return equals(other, ExcludeFragment::No); } + static bool code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet); + private: URL(String&& data_mime_type, String&& data_payload, bool payload_is_base64) : m_valid(true) diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp index d508aa96dc7..cd50ffbd7d0 100644 --- a/AK/URLParser.cpp +++ b/AK/URLParser.cpp @@ -115,6 +115,41 @@ constexpr bool is_double_dot_path_segment(StringView input) return input == ".."sv || input.equals_ignoring_case(".%2e"sv) || input.equals_ignoring_case("%2e."sv) || input.equals_ignoring_case("%2e%2e"sv); } +// https://url.spec.whatwg.org/#string-percent-encode-after-encoding +static String percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false) +{ + // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec. + + StringBuilder output; + + // 3. For each byte of encodeOutput converted to a byte sequence: + for (auto byte : input) { + // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue. + if (space_as_plus && byte == ' ') { + output.append('+'); + continue; + } + + // 2. Let isomorph be a code point whose value is byte’s value. + u32 isomorph = byte; + + // 3. Assert: percentEncodeSet includes all non-ASCII code points. + + // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output. + if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) { + output.append_code_point(isomorph); + } + + // 5. Otherwise, percent-encode byte and append the result to output. + else { + output.appendff("%{:02X}", byte); + } + } + + // 6. Return output. + return output.to_string(); +} + // https://fetch.spec.whatwg.org/#data-urls // FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec. Optional URLParser::parse_data_url(StringView raw_input) @@ -635,11 +670,11 @@ URL URLParser::parse(StringView raw_input, URL const* base_url, Optional ur } break; case State::Query: + // https://url.spec.whatwg.org/#query-state if (code_point == end_of_file || code_point == '#') { VERIFY(url->m_query == ""); auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query; - // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. - url->m_query = URL::percent_decode(URL::percent_encode(buffer.string_view(), query_percent_encode_set)); + url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set); buffer.clear(); if (code_point == '#') { url->m_fragment = "";