2020-01-18 08:38:21 +00:00
/*
* Copyright ( c ) 2018 - 2020 , Andreas Kling < kling @ serenityos . org >
2021-05-23 21:31:16 +00:00
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2020-01-18 08:38:21 +00:00
*
2021-04-22 08:24:48 +00:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-01-18 08:38:21 +00:00
*/
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 17:11:58 +00:00
# include <AK/Base64.h>
2021-06-01 19:18:08 +00:00
# include <AK/CharacterTypes.h>
2021-05-27 19:05:07 +00:00
# include <AK/Debug.h>
2020-05-26 11:52:44 +00:00
# include <AK/LexicalPath.h>
2019-08-10 15:27:56 +00:00
# include <AK/StringBuilder.h>
2021-05-25 11:50:03 +00:00
# include <AK/Utf8View.h>
2024-03-18 03:22:27 +00:00
# include <LibURL/Parser.h>
# include <LibURL/URL.h>
2019-08-10 15:27:56 +00:00
2024-03-18 03:22:27 +00:00
namespace URL {
2019-08-10 15:27:56 +00:00
2024-03-18 03:22:27 +00:00
// FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
2021-11-10 23:55:02 +00:00
// Constructs a URL from `string` by delegating to another URL constructor with the
// result of Parser::basic_parse(string).
URL : : URL ( StringView string )
2024-03-18 03:22:27 +00:00
: URL ( Parser : : basic_parse ( string ) )
2019-08-10 15:27:56 +00:00
{
2021-05-27 19:05:07 +00:00
// Debug tracing only: this branch is compiled in solely when URL_PARSER_DEBUG is true.
if constexpr ( URL_PARSER_DEBUG ) {
2024-08-02 13:23:49 +00:00
// serialize() is only invoked for a URL that parsed successfully.
if ( m_data - > valid )
2021-05-27 19:05:07 +00:00
dbgln ( " URL constructor: Parsed URL to be '{}'. " , serialize ( ) ) ;
else
dbgln ( " URL constructor: Parsed URL to be invalid. " ) ;
}
2019-08-10 15:27:56 +00:00
}
2023-02-13 17:42:27 +00:00
// Parses `relative_url` with Parser::basic_parse(), handing this URL over as the second argument.
// When this URL is not valid, a default-constructed URL is returned without parsing anything.
URL URL::complete_url(StringView relative_url) const
{
    if (is_valid())
        return Parser::basic_parse(relative_url, *this);

    return {};
}
2023-12-16 14:19:34 +00:00
// Returns the path segment stored at `index`, percent-decoded.
// `index` must be smaller than path_segment_count(); this is enforced with VERIFY.
ByteString URL::path_segment_at_index(size_t index) const
{
    VERIFY(index < path_segment_count());

    auto const& encoded_segment = m_data->paths[index];
    return percent_decode(encoded_segment);
}
2023-12-16 14:19:34 +00:00
// Returns the last path segment, percent-decoded.
// An invalid URL, or one without any path segment, yields a default-constructed ByteString.
ByteString URL::basename() const
{
    if (!m_data->valid || m_data->paths.is_empty())
        return {};

    return percent_decode(m_data->paths.last());
}
2023-08-12 04:52:41 +00:00
void URL : : set_scheme ( String scheme )
2020-04-11 21:07:23 +00:00
{
2024-08-02 13:23:49 +00:00
m_data - > scheme = move ( scheme ) ;
m_data - > valid = compute_validity ( ) ;
2020-04-11 21:07:23 +00:00
}
2023-08-06 04:32:44 +00:00
// https://url.spec.whatwg.org/#set-the-username
2023-08-12 04:52:38 +00:00
// Stores `username` percent-encoded with the userinfo percent-encode set, then refreshes the
// cached validity flag. Fails only if building the String from the encoded bytes fails.
ErrorOr<void> URL::set_username(StringView username)
{
    // To set the username given a url and username, set url’s username to the result of running
    // UTF-8 percent-encode on username using the userinfo percent-encode set.
    auto encoded_username = percent_encode(username, PercentEncodeSet::Userinfo);
    m_data->username = TRY(String::from_byte_string(encoded_username));

    m_data->valid = compute_validity();
    return {};
}
2023-08-06 04:32:44 +00:00
// https://url.spec.whatwg.org/#set-the-password
2023-08-12 04:52:38 +00:00
// Stores `password` percent-encoded with the userinfo percent-encode set, then refreshes the
// cached validity flag. Fails only if building the String from the encoded bytes fails.
ErrorOr<void> URL::set_password(StringView password)
{
    // To set the password given a url and password, set url’s password to the result of running
    // UTF-8 percent-encode on password using the userinfo percent-encode set.
    auto encoded_password = percent_encode(password, PercentEncodeSet::Userinfo);
    m_data->password = TRY(String::from_byte_string(encoded_password));

    m_data->valid = compute_validity();
    return {};
}
2023-07-27 09:40:41 +00:00
void URL : : set_host ( Host host )
2020-04-11 21:07:23 +00:00
{
2024-08-02 13:23:49 +00:00
m_data - > host = move ( host ) ;
m_data - > valid = compute_validity ( ) ;
2020-04-11 21:07:23 +00:00
}
2023-07-27 09:40:41 +00:00
// https://url.spec.whatwg.org/#concept-host-serializer
// Returns the serialized form of this URL's host by forwarding it to Parser::serialize_host().
ErrorOr<String> URL::serialized_host() const
{
    auto const& host = m_data->host;
    return Parser::serialize_host(host);
}
2021-09-13 20:12:16 +00:00
void URL : : set_port ( Optional < u16 > port )
2020-11-04 06:20:20 +00:00
{
2024-08-02 13:23:49 +00:00
if ( port = = default_port_for_scheme ( m_data - > scheme ) ) {
m_data - > port = { } ;
2021-05-25 19:32:20 +00:00
return ;
}
2024-08-02 13:23:49 +00:00
m_data - > port = move ( port ) ;
m_data - > valid = compute_validity ( ) ;
2020-11-04 06:20:20 +00:00
}
2023-12-16 14:19:34 +00:00
void URL : : set_paths ( Vector < ByteString > const & paths )
2021-05-25 19:32:20 +00:00
{
2024-08-02 13:23:49 +00:00
m_data - > paths . clear_with_capacity ( ) ;
m_data - > paths . ensure_capacity ( paths . size ( ) ) ;
2023-08-06 04:32:44 +00:00
for ( auto const & segment : paths )
2024-08-02 13:23:49 +00:00
m_data - > paths . unchecked_append ( String : : from_byte_string ( percent_encode ( segment , PercentEncodeSet : : Path ) ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
m_data - > valid = compute_validity ( ) ;
2021-05-25 19:32:20 +00:00
}
2023-08-06 04:32:44 +00:00
void URL : : append_path ( StringView path )
2023-04-09 13:21:00 +00:00
{
2024-08-02 13:23:49 +00:00
m_data - > paths . append ( String : : from_byte_string ( percent_encode ( path , PercentEncodeSet : : Path ) ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2023-04-09 13:21:00 +00:00
}
2023-07-26 08:54:36 +00:00
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
bool URL : : cannot_have_a_username_or_password_or_port ( ) const
{
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
2024-08-02 13:23:49 +00:00
// NOTE: The three clauses below map, in order, to: null host, empty-string host, "file" scheme.
return m_data - > host . has < Empty > ( ) | | m_data - > host = = String { } | | m_data - > scheme = = " file " sv ;
2023-07-26 08:54:36 +00:00
}
2021-05-29 18:46:49 +00:00
// FIXME: This is by no means complete.
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
2020-04-11 21:07:23 +00:00
// Computes whether the current field values form a valid URL. Returns false when:
// - the scheme is empty,
// - the URL cannot be a base URL and its path is not exactly one non-empty segment,
// - the URL can be a base URL and its scheme is about/mailto, or it has no path segment,
// - the scheme is file and the host is null.
// Everything else is considered valid.
bool URL : : compute_validity ( ) const
{
2024-08-02 13:23:49 +00:00
// A URL without a scheme is never valid.
if ( m_data - > scheme . is_empty ( ) )
2020-04-11 21:07:23 +00:00
return false ;
2020-11-04 06:20:20 +00:00
2024-08-02 13:23:49 +00:00
if ( m_data - > cannot_be_a_base_url ) {
// A cannot-be-a-base URL must hold exactly one path segment, and that segment must be non-empty.
if ( m_data - > paths . size ( ) ! = 1 )
2021-05-29 18:46:49 +00:00
return false ;
2024-08-02 13:23:49 +00:00
if ( m_data - > paths [ 0 ] . is_empty ( ) )
2021-05-29 18:46:49 +00:00
return false ;
} else {
2024-08-02 13:23:49 +00:00
// These two schemes are rejected whenever the URL is not flagged as cannot-be-a-base.
if ( m_data - > scheme . is_one_of ( " about " , " mailto " ) )
2021-05-29 18:46:49 +00:00
return false ;
// NOTE: Maybe it is allowed to have a zero-segment path.
2024-08-02 13:23:49 +00:00
if ( m_data - > paths . size ( ) = = 0 )
2021-05-29 18:46:49 +00:00
return false ;
2020-04-11 21:07:23 +00:00
}
2020-11-04 06:20:20 +00:00
2021-05-29 18:46:49 +00:00
// NOTE: A file URL's host should be the empty string for localhost, not null.
2024-08-02 13:23:49 +00:00
if ( m_data - > scheme = = " file " & & m_data - > host . has < Empty > ( ) )
2020-11-04 06:20:20 +00:00
return false ;
2020-04-11 21:07:23 +00:00
return true ;
}
2023-07-31 08:23:53 +00:00
// https://url.spec.whatwg.org/#default-port
2024-03-18 03:22:27 +00:00
// Returns the default port associated with `scheme`, or an empty Optional when the scheme
// is not one of the names listed below.
Optional < u16 > default_port_for_scheme ( StringView scheme )
2020-11-04 06:20:20 +00:00
{
2023-07-31 08:23:53 +00:00
// Spec defined mappings with port:
if ( scheme = = " ftp " )
return 21 ;
2021-05-23 21:31:16 +00:00
if ( scheme = = " http " )
2020-11-04 06:20:20 +00:00
return 80 ;
2021-05-23 21:31:16 +00:00
if ( scheme = = " https " )
2020-11-04 06:20:20 +00:00
return 443 ;
2023-07-31 08:23:53 +00:00
if ( scheme = = " ws " )
return 80 ;
if ( scheme = = " wss " )
return 443 ;
// NOTE: not in spec, but we support these too
2021-05-23 21:31:16 +00:00
if ( scheme = = " irc " )
2020-11-04 06:20:20 +00:00
return 6667 ;
2021-05-23 21:31:16 +00:00
if ( scheme = = " ircs " )
2020-11-04 06:20:20 +00:00
return 6697 ;
2023-07-31 08:23:53 +00:00
2023-08-12 23:00:56 +00:00
// No default port is known for any other scheme.
return { } ;
2020-11-04 06:20:20 +00:00
}
2024-03-18 03:22:27 +00:00
// Builds a file-scheme URL from a filesystem `path`, with an optional `fragment` and `hostname`.
// Returns a default-constructed URL when `path` is not absolute.
URL create_with_file_scheme ( ByteString const & path , ByteString const & fragment , ByteString const & hostname )
2020-04-18 20:02:04 +00:00
{
2021-05-27 19:40:02 +00:00
LexicalPath lexical_path ( path ) ;
2021-06-29 11:11:03 +00:00
// Only absolute paths can be turned into a file URL.
if ( ! lexical_path . is_absolute ( ) )
2021-05-27 19:40:02 +00:00
return { } ;
2021-05-29 19:57:20 +00:00
2020-04-18 20:02:04 +00:00
URL url ;
2023-08-12 04:52:41 +00:00
url . set_scheme ( " file " _string ) ;
2023-12-16 14:19:34 +00:00
// A hostname of localhost is stored as the empty string; any other hostname is stored as given.
url . set_host ( hostname = = " localhost " ? String { } : String : : from_byte_string ( hostname ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2022-03-24 02:46:52 +00:00
url . set_paths ( lexical_path . parts ( ) ) ;
// The trailing-slash check is made on the original `path` string, not on the parsed parts.
if ( path . ends_with ( ' / ' ) )
2023-04-09 13:21:00 +00:00
url . append_slash ( ) ;
2023-10-10 11:30:58 +00:00
// An empty `fragment` leaves the URL's fragment unset.
if ( ! fragment . is_empty ( ) )
2023-12-16 14:19:34 +00:00
url . set_fragment ( String : : from_byte_string ( fragment ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2022-03-24 02:46:52 +00:00
return url ;
}
2024-03-18 03:22:27 +00:00
// Builds a help-scheme URL from `path`, with an optional `fragment` and `hostname`.
// NOTE: No absolute-path check is performed on `path` in this function.
URL create_with_help_scheme ( ByteString const & path , ByteString const & fragment , ByteString const & hostname )
2022-03-24 02:46:52 +00:00
{
LexicalPath lexical_path ( path ) ;
URL url ;
2023-08-12 04:52:41 +00:00
url . set_scheme ( " help " _string ) ;
2023-12-16 14:19:34 +00:00
// A hostname of localhost is stored as the empty string; any other hostname is stored as given.
url . set_host ( hostname = = " localhost " ? String { } : String : : from_byte_string ( hostname ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2023-07-27 09:40:41 +00:00
2021-05-27 19:40:02 +00:00
url . set_paths ( lexical_path . parts ( ) ) ;
// The trailing-slash check is made on the original `path` string, not on the parsed parts.
if ( path . ends_with ( ' / ' ) )
2023-04-09 13:21:00 +00:00
url . append_slash ( ) ;
2023-10-10 11:30:58 +00:00
// An empty `fragment` leaves the URL's fragment unset.
if ( ! fragment . is_empty ( ) )
2023-12-16 14:19:34 +00:00
url . set_fragment ( String : : from_byte_string ( fragment ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2020-04-18 20:02:04 +00:00
return url ;
}
2024-03-18 03:22:27 +00:00
// Interprets `url_or_path` as a URL first. If that does not produce a valid URL, the input is
// treated as a filesystem path instead: it is canonicalized and passed to create_with_file_scheme().
URL create_with_url_or_path(ByteString const& url_or_path)
{
    URL url = url_or_path;
    if (!url.is_valid()) {
        ByteString path = LexicalPath::canonicalized_path(url_or_path);
        return create_with_file_scheme(path);
    }

    return url;
}
2024-03-18 03:22:27 +00:00
// Builds a data-scheme URL that is flagged as cannot-be-a-base. Its single path segment is
// `mime_type`, followed by the base64 marker when `is_base64` is set, a comma, and `payload`.
URL create_with_data ( StringView mime_type , StringView payload , bool is_base64 )
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 17:11:58 +00:00
{
URL url ;
url . set_cannot_be_a_base_url ( true ) ;
2023-08-12 04:52:41 +00:00
url . set_scheme ( " data " _string ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 17:11:58 +00:00
// Assemble the single path segment: MIME type, optional base64 marker, comma, payload.
StringBuilder builder ;
builder . append ( mime_type ) ;
if ( is_base64 )
builder . append ( " ;base64 " sv ) ;
builder . append ( ' , ' ) ;
builder . append ( payload ) ;
2023-12-16 14:19:34 +00:00
// The assembled string becomes the only path segment of the URL.
url . set_paths ( { builder . to_byte_string ( ) } ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 17:11:58 +00:00
return url ;
}
2021-05-25 20:05:01 +00:00
// https://url.spec.whatwg.org/#special-scheme
2024-03-18 03:22:27 +00:00
// Returns whether `scheme` is one of the six scheme names listed below.
bool is_special_scheme ( StringView scheme )
2021-05-25 20:05:01 +00:00
{
return scheme . is_one_of ( " ftp " , " file " , " http " , " https " , " ws " , " wss " ) ;
}
2023-09-12 15:50:15 +00:00
// https://url.spec.whatwg.org/#url-path-serializer
2023-12-16 14:19:34 +00:00
// Serializes the path of this URL. For a cannot-be-a-base URL the first path entry is returned
// as stored; otherwise every segment is emitted with a leading slash, percent-decoded when
// `apply_percent_decoding` is ApplyPercentDecoding::Yes.
ByteString URL : : serialize_path ( ApplyPercentDecoding apply_percent_decoding ) const
2023-04-14 19:12:03 +00:00
{
2023-09-19 16:45:12 +00:00
// 1. If url has an opaque path, then return url’s path.
2023-09-12 15:50:15 +00:00
// FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
2023-04-14 19:12:03 +00:00
// NOTE: `apply_percent_decoding` is not consulted on this branch.
if ( cannot_be_a_base_url ( ) )
2024-08-02 13:23:49 +00:00
return m_data - > paths [ 0 ] . to_byte_string ( ) ;
2023-09-12 15:50:15 +00:00
2023-09-19 16:45:12 +00:00
// 2. Let output be the empty string.
2023-09-12 15:50:15 +00:00
StringBuilder output ;
2023-09-19 16:45:12 +00:00
// 3. For each segment of url’s path: append U+002F (/) followed by segment to output.
2024-08-02 13:23:49 +00:00
for ( auto const & segment : m_data - > paths ) {
2023-09-12 15:50:15 +00:00
output . append ( ' / ' ) ;
2023-12-16 14:19:34 +00:00
output . append ( apply_percent_decoding = = ApplyPercentDecoding : : Yes ? percent_decode ( segment ) : segment . to_byte_string ( ) ) ;
2023-04-14 19:12:03 +00:00
}
2023-09-12 15:50:15 +00:00
2023-09-19 16:45:12 +00:00
// 4. Return output.
2023-12-16 14:19:34 +00:00
return output . to_byte_string ( ) ;
2023-04-14 19:12:03 +00:00
}
2021-05-25 20:32:39 +00:00
// https://url.spec.whatwg.org/#concept-url-serializer
2023-12-16 14:19:34 +00:00
// Serializes this URL to a ByteString: scheme, then (when the host is non-null) credentials,
// host and port, then path, query, and finally the fragment unless `exclude_fragment` is
// something other than ExcludeFragment::No.
ByteString URL : : serialize ( ExcludeFragment exclude_fragment ) const
2021-05-25 20:32:39 +00:00
{
2023-07-25 08:04:09 +00:00
// 1. Let output be url’s scheme and U+003A (:) concatenated.
StringBuilder output ;
2024-08-02 13:23:49 +00:00
output . append ( m_data - > scheme ) ;
2023-07-25 08:04:09 +00:00
output . append ( ' : ' ) ;
// 2. If url’s host is non-null:
2024-08-02 13:23:49 +00:00
if ( ! m_data - > host . has < Empty > ( ) ) {
2023-07-25 08:04:09 +00:00
// 1. Append "//" to output.
output . append ( " // " sv ) ;
2021-05-25 20:32:39 +00:00
2023-07-25 08:04:09 +00:00
// 2. If url includes credentials, then:
2021-05-25 20:32:39 +00:00
if ( includes_credentials ( ) ) {
2023-07-25 08:04:09 +00:00
// 1. Append url’s username to output.
2024-08-02 13:23:49 +00:00
output . append ( m_data - > username ) ;
2023-07-25 08:04:09 +00:00
// 2. If url’s password is not the empty string, then append U+003A (:), followed by url’s password, to output.
2024-08-02 13:23:49 +00:00
if ( ! m_data - > password . is_empty ( ) ) {
2023-07-25 08:04:09 +00:00
output . append ( ' : ' ) ;
2024-08-02 13:23:49 +00:00
output . append ( m_data - > password ) ;
2021-05-25 20:32:39 +00:00
}
2023-07-25 08:04:09 +00:00
// 3. Append U+0040 (@) to output.
output . append ( ' @ ' ) ;
2021-05-25 20:32:39 +00:00
}
2023-07-25 08:04:09 +00:00
// 3. Append url’s host, serialized, to output.
2023-07-27 09:40:41 +00:00
output . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2023-07-25 08:04:09 +00:00
// 4. If url’s port is non-null, append U+003A (:) followed by url’s port, serialized, to output.
2024-08-02 13:23:49 +00:00
if ( m_data - > port . has_value ( ) )
output . appendff ( " :{} " , * m_data - > port ) ;
2021-05-25 20:32:39 +00:00
}
2023-07-25 08:04:09 +00:00
// 3. If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
// 4. Append the result of URL path serializing url to output.
// FIXME: Implement this closer to spec steps.
2021-05-25 20:32:39 +00:00
// NOTE: Path segments are appended exactly as stored; no percent-decoding happens here.
if ( cannot_be_a_base_url ( ) ) {
2024-08-02 13:23:49 +00:00
output . append ( m_data - > paths [ 0 ] ) ;
2021-05-25 20:32:39 +00:00
} else {
2024-08-02 13:23:49 +00:00
if ( m_data - > host . has < Empty > ( ) & & m_data - > paths . size ( ) > 1 & & m_data - > paths [ 0 ] . is_empty ( ) )
2023-07-25 08:04:09 +00:00
output . append ( " /. " sv ) ;
2024-08-02 13:23:49 +00:00
for ( auto & segment : m_data - > paths ) {
2023-07-25 08:04:09 +00:00
output . append ( ' / ' ) ;
output . append ( segment ) ;
2021-05-25 20:32:39 +00:00
}
}
2023-07-25 08:04:09 +00:00
// 5. If url’s query is non-null, append U+003F (?), followed by url’s query, to output.
2024-08-02 13:23:49 +00:00
if ( m_data - > query . has_value ( ) ) {
2023-07-25 08:04:09 +00:00
output . append ( ' ? ' ) ;
2024-08-02 13:23:49 +00:00
output . append ( * m_data - > query ) ;
2021-05-25 20:32:39 +00:00
}
2023-07-25 08:04:09 +00:00
// 6. If exclude fragment is false and url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output.
2024-08-02 13:23:49 +00:00
if ( exclude_fragment = = ExcludeFragment : : No & & m_data - > fragment . has_value ( ) ) {
2023-07-25 08:04:09 +00:00
output . append ( ' # ' ) ;
2024-08-02 13:23:49 +00:00
output . append ( * m_data - > fragment ) ;
2021-05-25 20:32:39 +00:00
}
2023-07-25 08:04:09 +00:00
// 7. Return output.
2023-12-16 14:19:34 +00:00
return output . to_byte_string ( ) ;
2021-05-25 20:32:39 +00:00
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
2023-12-16 14:19:34 +00:00
// Serializes this URL for presentation: scheme, host and port (when the host is non-null),
// path, query and fragment. Username and password are never written to the output.
// The URL must be valid; this is enforced with VERIFY.
ByteString URL : : serialize_for_display ( ) const
2021-05-25 20:32:39 +00:00
{
2024-08-02 13:23:49 +00:00
VERIFY ( m_data - > valid ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 17:11:58 +00:00
2021-05-25 20:32:39 +00:00
StringBuilder builder ;
2024-08-02 13:23:49 +00:00
builder . append ( m_data - > scheme ) ;
2021-05-25 20:32:39 +00:00
builder . append ( ' : ' ) ;
2024-08-02 13:23:49 +00:00
// Authority part: only emitted for a non-null host, and without any credentials.
if ( ! m_data - > host . has < Empty > ( ) ) {
2022-07-11 17:32:29 +00:00
builder . append ( " // " sv ) ;
2023-07-27 09:40:41 +00:00
builder . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2024-08-02 13:23:49 +00:00
if ( m_data - > port . has_value ( ) )
builder . appendff ( " :{} " , * m_data - > port ) ;
2021-05-25 20:32:39 +00:00
}
// Path part: a cannot-be-a-base URL contributes its first path entry as stored.
if ( cannot_be_a_base_url ( ) ) {
2024-08-02 13:23:49 +00:00
builder . append ( m_data - > paths [ 0 ] ) ;
2021-05-25 20:32:39 +00:00
} else {
2024-08-02 13:23:49 +00:00
if ( m_data - > host . has < Empty > ( ) & & m_data - > paths . size ( ) > 1 & & m_data - > paths [ 0 ] . is_empty ( ) )
2022-07-11 17:32:29 +00:00
builder . append ( " /. " sv ) ;
2024-08-02 13:23:49 +00:00
for ( auto & segment : m_data - > paths ) {
2021-05-27 19:40:02 +00:00
builder . append ( ' / ' ) ;
2023-04-09 13:21:00 +00:00
builder . append ( segment ) ;
2021-05-25 20:32:39 +00:00
}
}
2024-08-02 13:23:49 +00:00
if ( m_data - > query . has_value ( ) ) {
2021-05-25 20:32:39 +00:00
builder . append ( ' ? ' ) ;
2024-08-02 13:23:49 +00:00
builder . append ( * m_data - > query ) ;
2021-05-25 20:32:39 +00:00
}
2024-08-02 13:23:49 +00:00
// The fragment is always included when present; there is no exclude option here.
if ( m_data - > fragment . has_value ( ) ) {
2021-05-25 20:32:39 +00:00
builder . append ( ' # ' ) ;
2024-08-02 13:23:49 +00:00
builder . append ( * m_data - > fragment ) ;
2021-05-25 20:32:39 +00:00
}
2023-12-16 14:19:34 +00:00
return builder . to_byte_string ( ) ;
2021-05-25 20:32:39 +00:00
}
2023-06-17 07:15:40 +00:00
// Returns serialize() converted to a String; fails if that conversion fails.
ErrorOr<String> URL::to_string() const
{
    auto serialized = serialize();
    return String::from_byte_string(serialized);
}
2021-09-13 19:18:14 +00:00
// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
// https://url.spec.whatwg.org/#concept-url-origin
2023-12-16 14:19:34 +00:00
// Returns the serialized origin of this URL as scheme, host and optional port, or the
// opaque-origin marker string for schemes that are not handled below.
// The URL must be valid; this is enforced with VERIFY.
ByteString URL : : serialize_origin ( ) const
2021-09-13 19:18:14 +00:00
{
2024-08-02 13:23:49 +00:00
VERIFY ( m_data - > valid ) ;
2021-09-13 19:18:14 +00:00
2024-08-02 13:23:49 +00:00
if ( m_data - > scheme = = " blob " sv ) {
2021-09-13 19:18:14 +00:00
// TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin.
// 2. Let url be the result of parsing URL’s path[0].
2024-08-02 13:23:49 +00:00
VERIFY ( ! m_data - > paths . is_empty ( ) ) ;
URL url = m_data - > paths [ 0 ] ;
2021-09-13 19:18:14 +00:00
// 3. Return a new opaque origin, if url is failure, and url’s origin otherwise.
if ( ! url . is_valid ( ) )
return " null " ;
// NOTE: This recurses into the URL parsed from the first path entry.
return url . serialize_origin ( ) ;
2024-08-02 13:23:49 +00:00
} else if ( ! m_data - > scheme . is_one_of ( " ftp " sv , " http " sv , " https " sv , " ws " sv , " wss " sv ) ) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
2021-09-13 19:18:14 +00:00
return " null " ;
}
// Remaining schemes: emit scheme, host, and the port only when one is stored.
StringBuilder builder ;
2024-08-02 13:23:49 +00:00
builder . append ( m_data - > scheme ) ;
2021-09-13 19:18:14 +00:00
builder . append ( " :// " sv ) ;
2023-07-27 09:40:41 +00:00
builder . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2024-08-02 13:23:49 +00:00
if ( m_data - > port . has_value ( ) )
builder . appendff ( " :{} " , * m_data - > port ) ;
2023-12-16 14:19:34 +00:00
return builder . to_byte_string ( ) ;
2021-09-13 19:18:14 +00:00
}
2021-06-01 08:58:27 +00:00
bool URL : : equals ( URL const & other , ExcludeFragment exclude_fragments ) const
2021-05-27 19:38:16 +00:00
{
2021-06-01 09:14:30 +00:00
if ( this = = & other )
return true ;
2024-08-02 13:23:49 +00:00
if ( ! m_data - > valid | | ! other . m_data - > valid )
2021-05-27 19:38:16 +00:00
return false ;
return serialize ( exclude_fragments ) = = other . serialize ( exclude_fragments ) ;
}
2024-03-18 03:22:27 +00:00
// Appends `code_point` to `builder` in percent-encoded form: the code point is split into its
// one to four UTF-8 bytes, and each byte is written as a percent sign followed by two
// uppercase hex digits. Code points above 0x10ffff hit VERIFY_NOT_REACHED.
void append_percent_encoded ( StringBuilder & builder , u32 code_point )
2021-05-25 11:50:03 +00:00
{
// One byte: the code point itself.
if ( code_point < = 0x7f )
builder . appendff ( " %{:02X} " , code_point ) ;
// Two bytes: lead byte 0xc0 plus one continuation byte (0x80 | 6 payload bits).
else if ( code_point < = 0x07ff )
builder . appendff ( " %{:02X}%{:02X} " , ( ( code_point > > 6 ) & 0x1f ) | 0xc0 , ( code_point & 0x3f ) | 0x80 ) ;
// Three bytes: lead byte 0xe0 plus two continuation bytes.
else if ( code_point < = 0xffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X} " , ( ( code_point > > 12 ) & 0x0f ) | 0xe0 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
// Four bytes: lead byte 0xf0 plus three continuation bytes.
else if ( code_point < = 0x10ffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X}%{:02X} " , ( ( code_point > > 18 ) & 0x07 ) | 0xf0 , ( ( code_point > > 12 ) & 0x3f ) | 0x80 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
else
VERIFY_NOT_REACHED ( ) ;
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
2024-03-18 03:22:27 +00:00
// Returns whether `code_point` belongs to the percent-encode set `set`.
// Most sets are defined cumulatively: a case first defers to a smaller set through a
// recursive call and then checks a handful of additional ASCII characters.
// An unknown `set` value hits VERIFY_NOT_REACHED.
bool code_point_is_in_percent_encode_set ( u32 code_point , PercentEncodeSet set )
2021-05-25 11:50:03 +00:00
{
2023-12-29 16:15:11 +00:00
// NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
// a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
2021-05-25 11:50:03 +00:00
switch ( set ) {
2024-03-18 03:22:27 +00:00
case PercentEncodeSet : : C0Control :
2021-05-25 11:50:03 +00:00
// Base set: everything outside the range 0x20..0x7E.
return code_point < 0x20 | | code_point > 0x7E ;
2024-03-18 03:22:27 +00:00
case PercentEncodeSet : : Fragment :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : C0Control ) | | " \" <>` " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : Query :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : C0Control ) | | " \" #<> " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : SpecialQuery :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Query ) | | code_point = = ' \' ' ;
case PercentEncodeSet : : Path :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Query ) | | " ?` { } " sv.contains(static_cast<char>(code_point));
case PercentEncodeSet : : Userinfo :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Path ) | | " /: ; = @ [ \ \ ] ^ | " sv.contains(static_cast<char>(code_point));
case PercentEncodeSet : : Component :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Userinfo ) | | " $%&+, " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : ApplicationXWWWFormUrlencoded :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Component ) | | " !'()~ " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : EncodeURI :
2021-05-25 11:50:03 +00:00
// NOTE: This is the same percent encode set that JS encodeURI() uses.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
2022-12-25 19:25:34 +00:00
// Unlike the cases above, this set is not cumulative: the listed characters and ASCII alphanumerics are the ones left unencoded.
return code_point > 0x7E | | ( ! is_ascii_alphanumeric ( code_point ) & & ! " ;,/?:@&=+$-_.!~*'()# " sv . contains ( static_cast < char > ( code_point ) ) ) ;
2021-05-25 11:50:03 +00:00
default :
VERIFY_NOT_REACHED ( ) ;
}
}
2024-03-18 03:22:27 +00:00
// Appends `code_point` to `builder`: verbatim when it lies outside the percent-encode set
// `set`, percent-encoded otherwise.
void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
{
    if (!code_point_is_in_percent_encode_set(code_point, set)) {
        builder.append_code_point(code_point);
        return;
    }

    append_percent_encoded(builder, code_point);
}
2024-03-18 03:22:27 +00:00
// Percent-encodes `input` code point by code point (iterating it as a Utf8View) using the
// percent-encode set `set`. When `space_as_plus` is SpaceAsPlus::Yes, a space is written as
// a plus sign instead of going through the percent-encode set.
ByteString percent_encode ( StringView input , PercentEncodeSet set , SpaceAsPlus space_as_plus )
2021-05-25 11:50:03 +00:00
{
StringBuilder builder ;
for ( auto code_point : Utf8View ( input ) ) {
2022-04-09 16:34:49 +00:00
// The space-to-plus substitution takes precedence over the percent-encode set.
if ( space_as_plus = = SpaceAsPlus : : Yes & & code_point = = ' ' )
builder . append ( ' + ' ) ;
else
append_percent_encoded_if_necessary ( builder , code_point , set ) ;
2021-05-25 11:50:03 +00:00
}
2023-12-16 14:19:34 +00:00
return builder . to_byte_string ( ) ;
2021-05-25 11:50:03 +00:00
}
2024-03-18 03:22:27 +00:00
// Percent-decodes `input`: every percent sign that is followed by two ASCII hex digits is
// replaced by the byte those digits denote; everything else is copied through.
ByteString percent_decode ( StringView input )
2021-05-25 11:50:03 +00:00
{
// Fast path: nothing to decode when the input holds no percent sign at all.
if ( ! input . contains ( ' % ' ) )
return input ;
StringBuilder builder ;
Utf8View utf8_view ( input ) ;
for ( auto it = utf8_view . begin ( ) ; ! it . done ( ) ; + + it ) {
if ( * it ! = ' % ' ) {
builder . append_code_point ( * it ) ;
// A percent sign that is not followed by two hex digits is kept literally.
} else if ( ! is_ascii_hex_digit ( it . peek ( 1 ) . value_or ( 0 ) ) | | ! is_ascii_hex_digit ( it . peek ( 2 ) . value_or ( 0 ) ) ) {
builder . append_code_point ( * it ) ;
} else {
// Consume both hex digits: the first is the high nibble, the second the low nibble.
+ + it ;
2021-06-01 19:18:08 +00:00
u8 byte = parse_ascii_hex_digit ( * it ) < < 4 ;
2021-05-25 11:50:03 +00:00
+ + it ;
2021-06-01 19:18:08 +00:00
byte + = parse_ascii_hex_digit ( * it ) ;
2021-05-25 11:50:03 +00:00
builder . append ( byte ) ;
}
}
2023-12-16 14:19:34 +00:00
return builder . to_byte_string ( ) ;
2021-05-25 11:50:03 +00:00
}
2019-08-10 15:27:56 +00:00
}