ladybird/Userland/Libraries/LibURL/URL.cpp

502 lines
17 KiB
C++
Raw Normal View History

/*
* Copyright (c) 2018-2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Base64.h>
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/LexicalPath.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibURL/Parser.h>
#include <LibURL/URL.h>
namespace URL {
// FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
URL::URL(StringView string)
: URL(Parser::basic_parse(string))
{
if constexpr (URL_PARSER_DEBUG) {
if (m_data->valid)
dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
else
dbgln("URL constructor: Parsed URL to be invalid.");
}
}
URL URL::complete_url(StringView relative_url) const
{
if (!is_valid())
return {};
return Parser::basic_parse(relative_url, *this);
}
ByteString URL::path_segment_at_index(size_t index) const
{
VERIFY(index < path_segment_count());
return percent_decode(m_data->paths[index]);
}
ByteString URL::basename() const
{
if (!m_data->valid)
return {};
if (m_data->paths.is_empty())
return {};
auto& last_segment = m_data->paths.last();
return percent_decode(last_segment);
}
void URL::set_scheme(String scheme)
{
m_data->scheme = move(scheme);
m_data->valid = compute_validity();
}
// https://url.spec.whatwg.org/#set-the-username
void URL::set_username(StringView username)
{
// To set the username given a url and username, set urls username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
m_data->username = percent_encode(username, PercentEncodeSet::Userinfo);
m_data->valid = compute_validity();
}
// https://url.spec.whatwg.org/#set-the-password
void URL::set_password(StringView password)
{
// To set the password given a url and password, set urls password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
m_data->password = percent_encode(password, PercentEncodeSet::Userinfo);
m_data->valid = compute_validity();
}
void URL::set_host(Host host)
{
m_data->host = move(host);
m_data->valid = compute_validity();
}
// https://url.spec.whatwg.org/#concept-host-serializer
ErrorOr<String> URL::serialized_host() const
{
return Parser::serialize_host(m_data->host);
}
void URL::set_port(Optional<u16> port)
{
if (port == default_port_for_scheme(m_data->scheme)) {
m_data->port = {};
return;
}
m_data->port = move(port);
m_data->valid = compute_validity();
}
void URL::set_paths(Vector<ByteString> const& paths)
{
m_data->paths.clear_with_capacity();
m_data->paths.ensure_capacity(paths.size());
for (auto const& segment : paths)
m_data->paths.unchecked_append(percent_encode(segment, PercentEncodeSet::Path));
m_data->valid = compute_validity();
}
void URL::append_path(StringView path)
{
m_data->paths.append(percent_encode(path, PercentEncodeSet::Path));
}
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
bool URL::cannot_have_a_username_or_password_or_port() const
{
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
return m_data->host.has<Empty>() || m_data->host == String {} || m_data->scheme == "file"sv;
}
// FIXME: This is by no means complete.
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
bool URL::compute_validity() const
{
if (m_data->scheme.is_empty())
return false;
if (m_data->cannot_be_a_base_url) {
if (m_data->paths.size() != 1)
return false;
if (m_data->paths[0].is_empty())
return false;
} else {
if (m_data->scheme.is_one_of("about", "mailto"))
return false;
// NOTE: Maybe it is allowed to have a zero-segment path.
if (m_data->paths.size() == 0)
return false;
}
// NOTE: A file URL's host should be the empty string for localhost, not null.
if (m_data->scheme == "file" && m_data->host.has<Empty>())
return false;
return true;
}
// https://url.spec.whatwg.org/#default-port
Optional<u16> default_port_for_scheme(StringView scheme)
{
// Spec defined mappings with port:
if (scheme == "ftp")
return 21;
if (scheme == "http")
return 80;
if (scheme == "https")
return 443;
if (scheme == "ws")
return 80;
if (scheme == "wss")
return 443;
// NOTE: not in spec, but we support these too
if (scheme == "irc")
return 6667;
if (scheme == "ircs")
return 6697;
return {};
}
URL create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
{
LexicalPath lexical_path(path);
if (!lexical_path.is_absolute())
return {};
URL url;
url.set_scheme("file"_string);
url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
url.set_paths(lexical_path.parts());
if (path.ends_with('/'))
url.append_slash();
if (!fragment.is_empty())
url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
return url;
}
URL create_with_url_or_path(ByteString const& url_or_path)
{
URL url = url_or_path;
if (url.is_valid())
return url;
ByteString path = LexicalPath::canonicalized_path(url_or_path);
return create_with_file_scheme(path);
}
URL create_with_data(StringView mime_type, StringView payload, bool is_base64)
{
URL url;
url.set_cannot_be_a_base_url(true);
url.set_scheme("data"_string);
StringBuilder builder;
builder.append(mime_type);
if (is_base64)
builder.append(";base64"sv);
builder.append(',');
builder.append(payload);
url.set_paths({ builder.to_byte_string() });
return url;
}
// https://url.spec.whatwg.org/#special-scheme
bool is_special_scheme(StringView scheme)
{
return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
}
// https://url.spec.whatwg.org/#url-path-serializer
String URL::serialize_path() const
{
// 1. If url has an opaque path, then return urls path.
// FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
if (cannot_be_a_base_url())
return m_data->paths[0];
// 2. Let output be the empty string.
StringBuilder output;
// 3. For each segment of urls path: append U+002F (/) followed by segment to output.
for (auto const& segment : m_data->paths) {
output.append('/');
output.append(segment);
}
// 4. Return output.
return output.to_string_without_validation();
}
// https://url.spec.whatwg.org/#concept-url-serializer
ByteString URL::serialize(ExcludeFragment exclude_fragment) const
{
// 1. Let output be urls scheme and U+003A (:) concatenated.
StringBuilder output;
output.append(m_data->scheme);
output.append(':');
// 2. If urls host is non-null:
if (!m_data->host.has<Empty>()) {
// 1. Append "//" to output.
output.append("//"sv);
// 2. If url includes credentials, then:
if (includes_credentials()) {
// 1. Append urls username to output.
output.append(m_data->username);
// 2. If urls password is not the empty string, then append U+003A (:), followed by urls password, to output.
if (!m_data->password.is_empty()) {
output.append(':');
output.append(m_data->password);
}
// 3. Append U+0040 (@) to output.
output.append('@');
}
// 3. Append urls host, serialized, to output.
output.append(serialized_host().release_value_but_fixme_should_propagate_errors());
// 4. If urls port is non-null, append U+003A (:) followed by urls port, serialized, to output.
if (m_data->port.has_value())
output.appendff(":{}", *m_data->port);
}
// 3. If urls host is null, url does not have an opaque path, urls paths size is greater than 1, and urls path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
// 4. Append the result of URL path serializing url to output.
// FIXME: Implement this closer to spec steps.
if (cannot_be_a_base_url()) {
output.append(m_data->paths[0]);
} else {
if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
output.append("/."sv);
for (auto& segment : m_data->paths) {
output.append('/');
output.append(segment);
}
}
// 5. If urls query is non-null, append U+003F (?), followed by urls query, to output.
if (m_data->query.has_value()) {
output.append('?');
output.append(*m_data->query);
}
// 6. If exclude fragment is false and urls fragment is non-null, then append U+0023 (#), followed by urls fragment, to output.
if (exclude_fragment == ExcludeFragment::No && m_data->fragment.has_value()) {
output.append('#');
output.append(*m_data->fragment);
}
// 7. Return output.
return output.to_byte_string();
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
ByteString URL::serialize_for_display() const
{
VERIFY(m_data->valid);
StringBuilder builder;
builder.append(m_data->scheme);
builder.append(':');
if (!m_data->host.has<Empty>()) {
builder.append("//"sv);
builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
if (m_data->port.has_value())
builder.appendff(":{}", *m_data->port);
}
if (cannot_be_a_base_url()) {
builder.append(m_data->paths[0]);
} else {
if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
builder.append("/."sv);
for (auto& segment : m_data->paths) {
builder.append('/');
builder.append(segment);
}
}
if (m_data->query.has_value()) {
builder.append('?');
builder.append(*m_data->query);
}
if (m_data->fragment.has_value()) {
builder.append('#');
builder.append(*m_data->fragment);
}
return builder.to_byte_string();
}
ErrorOr<String> URL::to_string() const
{
return String::from_byte_string(serialize());
}
// https://url.spec.whatwg.org/#concept-url-origin
Origin URL::origin() const
{
// The origin of a URL url is the origin returned by running these steps, switching on urls scheme:
// -> "blob"
if (scheme() == "blob"sv) {
auto url_string = to_string().release_value_but_fixme_should_propagate_errors();
// 1. If urls blob URL entry is non-null, then return urls blob URL entrys environments origin.
if (blob_url_entry().has_value())
return blob_url_entry()->environment_origin;
// 2. Let pathURL be the result of parsing the result of URL path serializing url.
auto path_url = Parser::basic_parse(serialize_path());
// 3. If pathURL is failure, then return a new opaque origin.
if (!path_url.is_valid())
return Origin {};
// 4. If pathURLs scheme is "http", "https", or "file", then return pathURLs origin.
if (path_url.scheme().is_one_of("http"sv, "https"sv, "file"sv))
return path_url.origin();
// 5. Return a new opaque origin.
return Origin {};
}
// -> "ftp"
// -> "http"
// -> "https"
// -> "ws"
// -> "wss"
if (scheme().is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) {
// Return the tuple origin (urls scheme, urls host, urls port, null).
return Origin(scheme().to_byte_string(), host(), port().value_or(0));
}
// -> "file"
// AD-HOC: Our resource:// is basically an alias to file://
if (scheme() == "file"sv || scheme() == "resource"sv) {
// Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin.
// Note: We must return an origin with the `file://' protocol for `file://' iframes to work from `file://' pages.
return Origin(scheme().to_byte_string(), String {}, 0);
}
// -> Otherwise
// Return a new opaque origin.
return Origin {};
}
bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
{
if (this == &other)
return true;
if (!m_data->valid || !other.m_data->valid)
return false;
return serialize(exclude_fragments) == other.serialize(exclude_fragments);
}
void append_percent_encoded(StringBuilder& builder, u32 code_point)
{
if (code_point <= 0x7f)
builder.appendff("%{:02X}", code_point);
else if (code_point <= 0x07ff)
builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
else if (code_point <= 0xffff)
builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
else if (code_point <= 0x10ffff)
builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
else
VERIFY_NOT_REACHED();
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet set)
{
// NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
// a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
switch (set) {
case PercentEncodeSet::C0Control:
return code_point < 0x20 || code_point > 0x7E;
case PercentEncodeSet::Fragment:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"<>`"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::Query:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"#<>"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::SpecialQuery:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || code_point == '\'';
case PercentEncodeSet::Path:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || "?`{}"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::Userinfo:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::Component:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::ApplicationXWWWFormUrlencoded:
return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Component) || "!'()~"sv.contains(static_cast<char>(code_point));
case PercentEncodeSet::EncodeURI:
// NOTE: This is the same percent encode set that JS encodeURI() uses.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
default:
VERIFY_NOT_REACHED();
}
}
void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
{
if (code_point_is_in_percent_encode_set(code_point, set))
append_percent_encoded(builder, code_point);
else
builder.append_code_point(code_point);
}
String percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
{
StringBuilder builder;
for (auto code_point : Utf8View(input)) {
if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
builder.append('+');
else
append_percent_encoded_if_necessary(builder, code_point, set);
}
return MUST(builder.to_string());
}
ByteString percent_decode(StringView input)
{
if (!input.contains('%'))
return input;
StringBuilder builder;
Utf8View utf8_view(input);
for (auto it = utf8_view.begin(); !it.done(); ++it) {
if (*it != '%') {
builder.append_code_point(*it);
} else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
builder.append_code_point(*it);
} else {
++it;
u8 byte = parse_ascii_hex_digit(*it) << 4;
++it;
byte += parse_ascii_hex_digit(*it);
builder.append(byte);
}
}
return builder.to_byte_string();
}
}