Bläddra i källkod

AK: Don't store parts of URLs percent decoded

As noted in serval comments doing this goes against the WC3 spec,
and breaks parsing then re-serializing URLs that contain percent
encoded data, that was not encoded using the same character set as
the serializer.

For example, previously if you had a URL like:

https:://foo.com/what%2F%2F (the path is what + '//' percent encoded)

Creating URL("https:://foo.com/what%2F%2F").serialize() would return:

https://foo.com/what//

Which is incorrect and not the same as the URL we passed. This is
because the re-serializing uses the PercentEncodeSet::Path which
does not include '/'.

Only doing the percent encoding in the setters fixes this, which
is required to navigate to Google Street View (which includes a
percent encoded URL in its URL).

Seems to fix #13477 too
MacDue 2 år sedan
förälder
incheckning
8283e8b88c
5 ändrade filer med 85 tillägg och 51 borttagningar
  1. 49 20
      AK/URL.cpp
  2. 15 8
      AK/URL.h
  3. 11 20
      AK/URLParser.cpp
  4. 9 2
      Tests/AK/TestURL.cpp
  5. 1 1
      Userland/Libraries/LibWeb/URL/URL.h

+ 49 - 20
AK/URL.cpp

@@ -47,20 +47,32 @@ URL URL::complete_url(StringView relative_url) const
     return URLParser::parse(relative_url, *this);
 }
 
+// NOTE: This only exists for compatibility with the existing URL tests which check for both .is_null() and .is_empty().
+static DeprecatedString deprecated_string_percent_encode(DeprecatedString const& input, URL::PercentEncodeSet set = URL::PercentEncodeSet::Userinfo, URL::SpaceAsPlus space_as_plus = URL::SpaceAsPlus::No)
+{
+    if (input.is_null() || input.is_empty())
+        return input;
+    return URL::percent_encode(input.view(), set, space_as_plus);
+}
+
 void URL::set_scheme(DeprecatedString scheme)
 {
     m_scheme = move(scheme);
     m_valid = compute_validity();
 }
 
-void URL::set_username(DeprecatedString username)
+void URL::set_username(DeprecatedString username, ApplyPercentEncoding apply_percent_encoding)
 {
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes)
+        username = deprecated_string_percent_encode(username, PercentEncodeSet::Userinfo);
     m_username = move(username);
     m_valid = compute_validity();
 }
 
-void URL::set_password(DeprecatedString password)
+void URL::set_password(DeprecatedString password, ApplyPercentEncoding apply_percent_encoding)
 {
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes)
+        password = deprecated_string_percent_encode(password, PercentEncodeSet::Userinfo);
     m_password = move(password);
     m_valid = compute_validity();
 }
@@ -81,19 +93,38 @@ void URL::set_port(Optional<u16> port)
     m_valid = compute_validity();
 }
 
-void URL::set_paths(Vector<DeprecatedString> paths)
+void URL::set_paths(Vector<DeprecatedString> paths, ApplyPercentEncoding apply_percent_encoding)
 {
-    m_paths = move(paths);
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes) {
+        Vector<DeprecatedString> encoded_paths;
+        encoded_paths.ensure_capacity(paths.size());
+        for (auto& segment : paths)
+            encoded_paths.unchecked_append(deprecated_string_percent_encode(segment, PercentEncodeSet::Path));
+        m_paths = move(encoded_paths);
+    } else {
+        m_paths = move(paths);
+    }
     m_valid = compute_validity();
 }
 
-void URL::set_query(DeprecatedString query)
+void URL::append_path(DeprecatedString path, ApplyPercentEncoding apply_percent_encoding)
+{
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes)
+        path = deprecated_string_percent_encode(path, PercentEncodeSet::Path);
+    m_paths.append(path);
+}
+
+void URL::set_query(DeprecatedString query, ApplyPercentEncoding apply_percent_encoding)
 {
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes)
+        query = deprecated_string_percent_encode(query, is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query);
     m_query = move(query);
 }
 
-void URL::set_fragment(DeprecatedString fragment)
+void URL::set_fragment(DeprecatedString fragment, ApplyPercentEncoding apply_percent_encoding)
 {
+    if (apply_percent_encoding == ApplyPercentEncoding::Yes)
+        fragment = deprecated_string_percent_encode(fragment, PercentEncodeSet::Fragment);
     m_fragment = move(fragment);
 }
 
@@ -171,9 +202,8 @@ URL URL::create_with_file_scheme(DeprecatedString const& path, DeprecatedString
     //       This is because a file URL always needs a non-null hostname.
     url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname);
     url.set_paths(lexical_path.parts());
-    // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
     if (path.ends_with('/'))
-        url.append_path("");
+        url.append_slash();
     url.set_fragment(fragment);
     return url;
 }
@@ -188,9 +218,8 @@ URL URL::create_with_help_scheme(DeprecatedString const& path, DeprecatedString
     //       This is because a file URL always needs a non-null hostname.
     url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname);
     url.set_paths(lexical_path.parts());
-    // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
     if (path.ends_with('/'))
-        url.append_path("");
+        url.append_slash();
     url.set_fragment(fragment);
     return url;
 }
@@ -242,10 +271,10 @@ DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const
         builder.append("//"sv);
 
         if (includes_credentials()) {
-            builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo));
+            builder.append(m_username);
             if (!m_password.is_empty()) {
                 builder.append(':');
-                builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo));
+                builder.append(m_password);
             }
             builder.append('@');
         }
@@ -256,24 +285,24 @@ DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const
     }
 
     if (cannot_be_a_base_url()) {
-        builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
+        builder.append(m_paths[0]);
     } else {
         if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
             builder.append("/."sv);
         for (auto& segment : m_paths) {
             builder.append('/');
-            builder.append(percent_encode(segment, PercentEncodeSet::Path));
+            builder.append(segment);
         }
     }
 
     if (!m_query.is_null()) {
         builder.append('?');
-        builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
+        builder.append(m_query);
     }
 
     if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) {
         builder.append('#');
-        builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
+        builder.append(m_fragment);
     }
 
     return builder.to_deprecated_string();
@@ -300,24 +329,24 @@ DeprecatedString URL::serialize_for_display() const
     }
 
     if (cannot_be_a_base_url()) {
-        builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
+        builder.append(m_paths[0]);
     } else {
         if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
             builder.append("/."sv);
         for (auto& segment : m_paths) {
             builder.append('/');
-            builder.append(percent_encode(segment, PercentEncodeSet::Path));
+            builder.append(segment);
         }
     }
 
     if (!m_query.is_null()) {
         builder.append('?');
-        builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
+        builder.append(m_query);
     }
 
     if (!m_fragment.is_null()) {
         builder.append('#');
-        builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
+        builder.append(m_fragment);
     }
 
     return builder.to_deprecated_string();

+ 15 - 8
AK/URL.h

@@ -19,8 +19,6 @@
 
 namespace AK {
 
-// NOTE: The member variables cannot contain any percent encoded sequences.
-//       The URL parser automatically decodes those sequences and the serialize() method will re-encode them as necessary.
 class URL {
     friend class URLParser;
 
@@ -70,16 +68,25 @@ public:
     bool includes_credentials() const { return !m_username.is_empty() || !m_password.is_empty(); }
     bool is_special() const { return is_special_scheme(m_scheme); }
 
+    enum class ApplyPercentEncoding {
+        Yes,
+        No
+    };
     void set_scheme(DeprecatedString);
-    void set_username(DeprecatedString);
-    void set_password(DeprecatedString);
+    void set_username(DeprecatedString username, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
+    void set_password(DeprecatedString password, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
     void set_host(DeprecatedString);
     void set_port(Optional<u16>);
-    void set_paths(Vector<DeprecatedString>);
-    void set_query(DeprecatedString);
-    void set_fragment(DeprecatedString);
+    void set_paths(Vector<DeprecatedString> password, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
+    void set_query(DeprecatedString query, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
+    void set_fragment(DeprecatedString fragment, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
     void set_cannot_be_a_base_url(bool value) { m_cannot_be_a_base_url = value; }
-    void append_path(DeprecatedString path) { m_paths.append(move(path)); }
+    void append_path(DeprecatedString path, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes);
+    void append_slash()
+    {
+        // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
+        append_path("", ApplyPercentEncoding::No);
+    }
 
     DeprecatedString path() const;
     DeprecatedString basename() const;

+ 11 - 20
AK/URLParser.cpp

@@ -194,9 +194,6 @@ Optional<URL> URLParser::parse_data_url(StringView raw_input)
 //       future for validation of URLs, which would then lead to infinite recursion.
 //       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
 //       variables' values here, not what the URL class presents to its users.
-// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
-//       some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
-//       everything before setting the member variables.
 URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Optional<URL> url, Optional<State> state_override)
 {
     dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
@@ -310,7 +307,7 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                     ++iterator;
                 } else {
                     url->m_cannot_be_a_base_url = true;
-                    url->append_path("");
+                    url->append_slash();
                     state = State::CannotBeABaseUrlPath;
                 }
             } else {
@@ -441,13 +438,11 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                     if (password_token_seen) {
                         builder.append(url->password());
                         URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
-                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
-                        url->m_password = URL::percent_decode(builder.string_view());
+                        url->m_password = builder.string_view();
                     } else {
                         builder.append(url->username());
                         URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
-                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
-                        url->m_username = URL::percent_decode(builder.string_view());
+                        url->m_username = builder.string_view();
                     }
                 }
                 buffer.clear();
@@ -561,7 +556,7 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                 url->m_host = base_url->m_host;
                 auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
                 if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
-                    url->append_path(base_url->m_paths[0]);
+                    url->append_path(base_url->m_paths[0], URL::ApplyPercentEncoding::No);
                 state = State::Path;
                 continue;
             }
@@ -616,9 +611,9 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                     if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
                         url->m_paths.remove(url->m_paths.size() - 1);
                     if (code_point != '/' && !(url->is_special() && code_point == '\\'))
-                        url->append_path("");
+                        url->append_slash();
                 } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
-                    url->append_path("");
+                    url->append_slash();
                 } else if (!is_single_dot_path_segment(buffer.string_view())) {
                     if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
                         auto drive_letter = buffer.string_view()[0];
@@ -626,8 +621,7 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                         buffer.append(drive_letter);
                         buffer.append(':');
                     }
-                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-                    url->append_path(URL::percent_decode(buffer.string_view()));
+                    url->append_path(buffer.string_view(), URL::ApplyPercentEncoding::No);
                 }
                 buffer.clear();
                 if (code_point == '?') {
@@ -649,13 +643,12 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
             // NOTE: Verify that the assumptions required for this simplification are correct.
             VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
             if (code_point == '?') {
-                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-                url->m_paths[0] = URL::percent_decode(buffer.string_view());
+                url->m_paths[0] = buffer.string_view();
                 url->m_query = "";
                 state = State::Query;
             } else if (code_point == '#') {
                 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-                url->m_paths[0] = URL::percent_decode(buffer.string_view());
+                url->m_paths[0] = buffer.string_view();
                 url->m_fragment = "";
                 state = State::Fragment;
             } else {
@@ -665,8 +658,7 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                 if (code_point != end_of_file) {
                     URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
                 } else {
-                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-                    url->m_paths[0] = URL::percent_decode(buffer.string_view());
+                    url->m_paths[0] = buffer.string_view();
                 }
             }
             break;
@@ -696,8 +688,7 @@ URL URLParser::parse(StringView raw_input, Optional<URL> const& base_url, Option
                 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
                 buffer.append_code_point(code_point);
             } else {
-                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-                url->m_fragment = URL::percent_decode(buffer.string_view());
+                url->m_fragment = buffer.string_view();
                 buffer.clear();
             }
             break;

+ 9 - 2
Tests/AK/TestURL.cpp

@@ -151,7 +151,7 @@ TEST_CASE(file_url_with_encoded_characters)
     URL url("file:///my/file/test%23file.txt"sv);
     EXPECT(url.is_valid());
     EXPECT_EQ(url.scheme(), "file");
-    EXPECT_EQ(url.path(), "/my/file/test#file.txt");
+    EXPECT_EQ(url.path(), "/my/file/test%23file.txt");
     EXPECT(url.query().is_null());
     EXPECT(url.fragment().is_null());
 }
@@ -389,7 +389,7 @@ TEST_CASE(unicode)
 {
     URL url { "http://example.com/_ünicöde_téxt_©"sv };
     EXPECT(url.is_valid());
-    EXPECT_EQ(url.path(), "/_ünicöde_téxt_©");
+    EXPECT_EQ(url.path(), "/_%C3%BCnic%C3%B6de_t%C3%A9xt_%C2%A9");
     EXPECT(url.query().is_null());
     EXPECT(url.fragment().is_null());
 }
@@ -415,3 +415,10 @@ TEST_CASE(empty_url_with_base_url)
     EXPECT_EQ(parsed_url.is_valid(), true);
     EXPECT(base_url.equals(parsed_url));
 }
+
+TEST_CASE(google_street_view)
+{
+    constexpr auto streetview_url = "https://www.google.co.uk/maps/@53.3354159,-1.9573545,3a,75y,121.1h,75.67t/data=!3m7!1e1!3m5!1sSY8xCv17jAX4S7SRdV38hg!2e0!6shttps:%2F%2Fstreetviewpixels-pa.googleapis.com%2Fv1%2Fthumbnail%3Fpanoid%3DSY8xCv17jAX4S7SRdV38hg%26cb_client%3Dmaps_sv.tactile.gps%26w%3D203%26h%3D100%26yaw%3D188.13148%26pitch%3D0%26thumbfov%3D100!7i13312!8i6656";
+    URL url(streetview_url);
+    EXPECT_EQ(url.serialize(), streetview_url);
+}

+ 1 - 1
Userland/Libraries/LibWeb/URL/URL.h

@@ -62,7 +62,7 @@ public:
 
     WebIDL::ExceptionOr<String> to_json() const;
 
-    void set_query(Badge<URLSearchParams>, String query) { m_url.set_query(query.to_deprecated_string()); }
+    void set_query(Badge<URLSearchParams>, String query) { m_url.set_query(query.to_deprecated_string(), AK::URL::ApplyPercentEncoding::Yes); }
 
 private:
     URL(JS::Realm&, AK::URL, JS::NonnullGCPtr<URLSearchParams> query);