
This is a fetching AO and is only used by LibWeb in the context of fetch tasks. Move it to LibWeb with other fetch methods. The main reason for this is that it requires the use of other LibWeb AOs such as the forgiving Base64 decoder and MIME sniffing. These AOs aren't available within LibURL.
509 lines
17 KiB
C++
509 lines
17 KiB
C++
/*
|
||
* Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
|
||
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
|
||
*
|
||
* SPDX-License-Identifier: BSD-2-Clause
|
||
*/
|
||
|
||
#include <AK/Base64.h>
|
||
#include <AK/CharacterTypes.h>
|
||
#include <AK/Debug.h>
|
||
#include <AK/LexicalPath.h>
|
||
#include <AK/StringBuilder.h>
|
||
#include <AK/Utf8View.h>
|
||
#include <LibURL/Parser.h>
|
||
#include <LibURL/URL.h>
|
||
|
||
namespace URL {
|
||
|
||
// FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
|
||
URL::URL(StringView string)
|
||
: URL(Parser::basic_parse(string))
|
||
{
|
||
if constexpr (URL_PARSER_DEBUG) {
|
||
if (m_valid)
|
||
dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
|
||
else
|
||
dbgln("URL constructor: Parsed URL to be invalid.");
|
||
}
|
||
}
|
||
|
||
URL URL::complete_url(StringView relative_url) const
|
||
{
|
||
if (!is_valid())
|
||
return {};
|
||
|
||
return Parser::basic_parse(relative_url, *this);
|
||
}
|
||
|
||
ErrorOr<String> URL::username() const
|
||
{
|
||
return String::from_byte_string(percent_decode(m_username));
|
||
}
|
||
|
||
ErrorOr<String> URL::password() const
|
||
{
|
||
return String::from_byte_string(percent_decode(m_password));
|
||
}
|
||
|
||
ByteString URL::path_segment_at_index(size_t index) const
|
||
{
|
||
VERIFY(index < path_segment_count());
|
||
return percent_decode(m_paths[index]);
|
||
}
|
||
|
||
ByteString URL::basename() const
|
||
{
|
||
if (!m_valid)
|
||
return {};
|
||
if (m_paths.is_empty())
|
||
return {};
|
||
auto& last_segment = m_paths.last();
|
||
return percent_decode(last_segment);
|
||
}
|
||
|
||
void URL::set_scheme(String scheme)
|
||
{
|
||
m_scheme = move(scheme);
|
||
m_valid = compute_validity();
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#set-the-username
|
||
ErrorOr<void> URL::set_username(StringView username)
|
||
{
|
||
// To set the username given a url and username, set url’s username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
|
||
m_username = TRY(String::from_byte_string(percent_encode(username, PercentEncodeSet::Userinfo)));
|
||
m_valid = compute_validity();
|
||
return {};
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#set-the-password
|
||
ErrorOr<void> URL::set_password(StringView password)
|
||
{
|
||
// To set the password given a url and password, set url’s password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
|
||
m_password = TRY(String::from_byte_string(percent_encode(password, PercentEncodeSet::Userinfo)));
|
||
m_valid = compute_validity();
|
||
return {};
|
||
}
|
||
|
||
void URL::set_host(Host host)
|
||
{
|
||
m_host = move(host);
|
||
m_valid = compute_validity();
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#concept-host-serializer
|
||
ErrorOr<String> URL::serialized_host() const
|
||
{
|
||
return Parser::serialize_host(m_host);
|
||
}
|
||
|
||
void URL::set_port(Optional<u16> port)
|
||
{
|
||
if (port == default_port_for_scheme(m_scheme)) {
|
||
m_port = {};
|
||
return;
|
||
}
|
||
m_port = move(port);
|
||
m_valid = compute_validity();
|
||
}
|
||
|
||
void URL::set_paths(Vector<ByteString> const& paths)
|
||
{
|
||
m_paths.clear_with_capacity();
|
||
m_paths.ensure_capacity(paths.size());
|
||
for (auto const& segment : paths)
|
||
m_paths.unchecked_append(String::from_byte_string(percent_encode(segment, PercentEncodeSet::Path)).release_value_but_fixme_should_propagate_errors());
|
||
m_valid = compute_validity();
|
||
}
|
||
|
||
void URL::append_path(StringView path)
|
||
{
|
||
m_paths.append(String::from_byte_string(percent_encode(path, PercentEncodeSet::Path)).release_value_but_fixme_should_propagate_errors());
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
|
||
bool URL::cannot_have_a_username_or_password_or_port() const
|
||
{
|
||
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
|
||
return m_host.has<Empty>() || m_host == String {} || m_scheme == "file"sv;
|
||
}
|
||
|
||
// FIXME: This is by no means complete.
|
||
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
|
||
bool URL::compute_validity() const
|
||
{
|
||
if (m_scheme.is_empty())
|
||
return false;
|
||
|
||
if (m_cannot_be_a_base_url) {
|
||
if (m_paths.size() != 1)
|
||
return false;
|
||
if (m_paths[0].is_empty())
|
||
return false;
|
||
} else {
|
||
if (m_scheme.is_one_of("about", "mailto"))
|
||
return false;
|
||
// NOTE: Maybe it is allowed to have a zero-segment path.
|
||
if (m_paths.size() == 0)
|
||
return false;
|
||
}
|
||
|
||
// NOTE: A file URL's host should be the empty string for localhost, not null.
|
||
if (m_scheme == "file" && m_host.has<Empty>())
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#default-port
|
||
Optional<u16> default_port_for_scheme(StringView scheme)
|
||
{
|
||
// Spec defined mappings with port:
|
||
if (scheme == "ftp")
|
||
return 21;
|
||
if (scheme == "http")
|
||
return 80;
|
||
if (scheme == "https")
|
||
return 443;
|
||
if (scheme == "ws")
|
||
return 80;
|
||
if (scheme == "wss")
|
||
return 443;
|
||
|
||
// NOTE: not in spec, but we support these too
|
||
if (scheme == "gemini")
|
||
return 1965;
|
||
if (scheme == "irc")
|
||
return 6667;
|
||
if (scheme == "ircs")
|
||
return 6697;
|
||
|
||
return {};
|
||
}
|
||
|
||
URL create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
|
||
{
|
||
LexicalPath lexical_path(path);
|
||
if (!lexical_path.is_absolute())
|
||
return {};
|
||
|
||
URL url;
|
||
url.set_scheme("file"_string);
|
||
url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
|
||
url.set_paths(lexical_path.parts());
|
||
if (path.ends_with('/'))
|
||
url.append_slash();
|
||
if (!fragment.is_empty())
|
||
url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
|
||
return url;
|
||
}
|
||
|
||
URL create_with_help_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
|
||
{
|
||
LexicalPath lexical_path(path);
|
||
|
||
URL url;
|
||
url.set_scheme("help"_string);
|
||
url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
|
||
|
||
url.set_paths(lexical_path.parts());
|
||
if (path.ends_with('/'))
|
||
url.append_slash();
|
||
if (!fragment.is_empty())
|
||
url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
|
||
return url;
|
||
}
|
||
|
||
URL create_with_url_or_path(ByteString const& url_or_path)
|
||
{
|
||
URL url = url_or_path;
|
||
if (url.is_valid())
|
||
return url;
|
||
|
||
ByteString path = LexicalPath::canonicalized_path(url_or_path);
|
||
return create_with_file_scheme(path);
|
||
}
|
||
|
||
URL create_with_data(StringView mime_type, StringView payload, bool is_base64)
|
||
{
|
||
URL url;
|
||
url.set_cannot_be_a_base_url(true);
|
||
url.set_scheme("data"_string);
|
||
|
||
StringBuilder builder;
|
||
builder.append(mime_type);
|
||
if (is_base64)
|
||
builder.append(";base64"sv);
|
||
builder.append(',');
|
||
builder.append(payload);
|
||
url.set_paths({ builder.to_byte_string() });
|
||
return url;
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#special-scheme
|
||
bool is_special_scheme(StringView scheme)
|
||
{
|
||
return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#url-path-serializer
|
||
ByteString URL::serialize_path(ApplyPercentDecoding apply_percent_decoding) const
|
||
{
|
||
// 1. If url has an opaque path, then return url’s path.
|
||
// FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
|
||
if (cannot_be_a_base_url())
|
||
return m_paths[0].to_byte_string();
|
||
|
||
// 2. Let output be the empty string.
|
||
StringBuilder output;
|
||
|
||
// 3. For each segment of url’s path: append U+002F (/) followed by segment to output.
|
||
for (auto const& segment : m_paths) {
|
||
output.append('/');
|
||
output.append(apply_percent_decoding == ApplyPercentDecoding::Yes ? percent_decode(segment) : segment.to_byte_string());
|
||
}
|
||
|
||
// 4. Return output.
|
||
return output.to_byte_string();
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#concept-url-serializer
|
||
ByteString URL::serialize(ExcludeFragment exclude_fragment) const
|
||
{
|
||
// 1. Let output be url’s scheme and U+003A (:) concatenated.
|
||
StringBuilder output;
|
||
output.append(m_scheme);
|
||
output.append(':');
|
||
|
||
// 2. If url’s host is non-null:
|
||
if (!m_host.has<Empty>()) {
|
||
// 1. Append "//" to output.
|
||
output.append("//"sv);
|
||
|
||
// 2. If url includes credentials, then:
|
||
if (includes_credentials()) {
|
||
// 1. Append url’s username to output.
|
||
output.append(m_username);
|
||
|
||
// 2. If url’s password is not the empty string, then append U+003A (:), followed by url’s password, to output.
|
||
if (!m_password.is_empty()) {
|
||
output.append(':');
|
||
output.append(m_password);
|
||
}
|
||
|
||
// 3. Append U+0040 (@) to output.
|
||
output.append('@');
|
||
}
|
||
|
||
// 3. Append url’s host, serialized, to output.
|
||
output.append(serialized_host().release_value_but_fixme_should_propagate_errors());
|
||
|
||
// 4. If url’s port is non-null, append U+003A (:) followed by url’s port, serialized, to output.
|
||
if (m_port.has_value())
|
||
output.appendff(":{}", *m_port);
|
||
}
|
||
|
||
// 3. If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
|
||
// 4. Append the result of URL path serializing url to output.
|
||
// FIXME: Implement this closer to spec steps.
|
||
if (cannot_be_a_base_url()) {
|
||
output.append(m_paths[0]);
|
||
} else {
|
||
if (m_host.has<Empty>() && m_paths.size() > 1 && m_paths[0].is_empty())
|
||
output.append("/."sv);
|
||
for (auto& segment : m_paths) {
|
||
output.append('/');
|
||
output.append(segment);
|
||
}
|
||
}
|
||
|
||
// 5. If url’s query is non-null, append U+003F (?), followed by url’s query, to output.
|
||
if (m_query.has_value()) {
|
||
output.append('?');
|
||
output.append(*m_query);
|
||
}
|
||
|
||
// 6. If exclude fragment is false and url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output.
|
||
if (exclude_fragment == ExcludeFragment::No && m_fragment.has_value()) {
|
||
output.append('#');
|
||
output.append(*m_fragment);
|
||
}
|
||
|
||
// 7. Return output.
|
||
return output.to_byte_string();
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#url-rendering
|
||
// NOTE: This does e.g. not display credentials.
|
||
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
|
||
// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
|
||
ByteString URL::serialize_for_display() const
|
||
{
|
||
VERIFY(m_valid);
|
||
|
||
StringBuilder builder;
|
||
builder.append(m_scheme);
|
||
builder.append(':');
|
||
|
||
if (!m_host.has<Empty>()) {
|
||
builder.append("//"sv);
|
||
builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
|
||
if (m_port.has_value())
|
||
builder.appendff(":{}", *m_port);
|
||
}
|
||
|
||
if (cannot_be_a_base_url()) {
|
||
builder.append(m_paths[0]);
|
||
} else {
|
||
if (m_host.has<Empty>() && m_paths.size() > 1 && m_paths[0].is_empty())
|
||
builder.append("/."sv);
|
||
for (auto& segment : m_paths) {
|
||
builder.append('/');
|
||
builder.append(segment);
|
||
}
|
||
}
|
||
|
||
if (m_query.has_value()) {
|
||
builder.append('?');
|
||
builder.append(*m_query);
|
||
}
|
||
|
||
if (m_fragment.has_value()) {
|
||
builder.append('#');
|
||
builder.append(*m_fragment);
|
||
}
|
||
|
||
return builder.to_byte_string();
|
||
}
|
||
|
||
ErrorOr<String> URL::to_string() const
|
||
{
|
||
return String::from_byte_string(serialize());
|
||
}
|
||
|
||
// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
|
||
// https://url.spec.whatwg.org/#concept-url-origin
|
||
ByteString URL::serialize_origin() const
|
||
{
|
||
VERIFY(m_valid);
|
||
|
||
if (m_scheme == "blob"sv) {
|
||
// TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin.
|
||
// 2. Let url be the result of parsing URL’s path[0].
|
||
VERIFY(!m_paths.is_empty());
|
||
URL url = m_paths[0];
|
||
// 3. Return a new opaque origin, if url is failure, and url’s origin otherwise.
|
||
if (!url.is_valid())
|
||
return "null";
|
||
return url.serialize_origin();
|
||
} else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
|
||
return "null";
|
||
}
|
||
|
||
StringBuilder builder;
|
||
builder.append(m_scheme);
|
||
builder.append("://"sv);
|
||
builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
|
||
if (m_port.has_value())
|
||
builder.appendff(":{}", *m_port);
|
||
return builder.to_byte_string();
|
||
}
|
||
|
||
bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
|
||
{
|
||
if (this == &other)
|
||
return true;
|
||
if (!m_valid || !other.m_valid)
|
||
return false;
|
||
return serialize(exclude_fragments) == other.serialize(exclude_fragments);
|
||
}
|
||
|
||
void append_percent_encoded(StringBuilder& builder, u32 code_point)
|
||
{
|
||
if (code_point <= 0x7f)
|
||
builder.appendff("%{:02X}", code_point);
|
||
else if (code_point <= 0x07ff)
|
||
builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
|
||
else if (code_point <= 0xffff)
|
||
builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
|
||
else if (code_point <= 0x10ffff)
|
||
builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
|
||
else
|
||
VERIFY_NOT_REACHED();
|
||
}
|
||
|
||
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
|
||
bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet set)
|
||
{
|
||
// NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
|
||
// a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
|
||
switch (set) {
|
||
case PercentEncodeSet::C0Control:
|
||
return code_point < 0x20 || code_point > 0x7E;
|
||
case PercentEncodeSet::Fragment:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"<>`"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::Query:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"#<>"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::SpecialQuery:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || code_point == '\'';
|
||
case PercentEncodeSet::Path:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || "?`{}"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::Userinfo:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::Component:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::ApplicationXWWWFormUrlencoded:
|
||
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Component) || "!'()~"sv.contains(static_cast<char>(code_point));
|
||
case PercentEncodeSet::EncodeURI:
|
||
// NOTE: This is the same percent encode set that JS encodeURI() uses.
|
||
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
|
||
return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
|
||
default:
|
||
VERIFY_NOT_REACHED();
|
||
}
|
||
}
|
||
|
||
void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
|
||
{
|
||
if (code_point_is_in_percent_encode_set(code_point, set))
|
||
append_percent_encoded(builder, code_point);
|
||
else
|
||
builder.append_code_point(code_point);
|
||
}
|
||
|
||
ByteString percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
|
||
{
|
||
StringBuilder builder;
|
||
for (auto code_point : Utf8View(input)) {
|
||
if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
|
||
builder.append('+');
|
||
else
|
||
append_percent_encoded_if_necessary(builder, code_point, set);
|
||
}
|
||
return builder.to_byte_string();
|
||
}
|
||
|
||
ByteString percent_decode(StringView input)
|
||
{
|
||
if (!input.contains('%'))
|
||
return input;
|
||
StringBuilder builder;
|
||
Utf8View utf8_view(input);
|
||
for (auto it = utf8_view.begin(); !it.done(); ++it) {
|
||
if (*it != '%') {
|
||
builder.append_code_point(*it);
|
||
} else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
|
||
builder.append_code_point(*it);
|
||
} else {
|
||
++it;
|
||
u8 byte = parse_ascii_hex_digit(*it) << 4;
|
||
++it;
|
||
byte += parse_ascii_hex_digit(*it);
|
||
builder.append(byte);
|
||
}
|
||
}
|
||
return builder.to_byte_string();
|
||
}
|
||
|
||
}
|