ladybird/AK/URLParser.cpp

/*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/DeprecatedString.h>
#include <AK/Optional.h>
#include <AK/SourceLocation.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/URLParser.h>
#include <AK/Utf8View.h>

namespace AK {

// NOTE: Sentinel standing in for "no more input", similar to the LibC macro EOF = -1.
//       The value is far above U+10FFFF, so it never collides with a valid Unicode code point.
constexpr u32 end_of_file = 0xFFFF'FFFF;

// https://url.spec.whatwg.org/#url-code-points
// Returns true if code_point is a URL code point: an ASCII alphanumeric, one of "!$&'()*+,-./:;=?@_~",
// or a code point in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
static bool is_url_code_point(u32 code_point)
{
    if (is_ascii_alphanumeric(code_point))
        return true;

    // Below U+00A0 only the explicitly listed ASCII punctuation qualifies.
    if (code_point < 0xA0)
        return code_point < 0x80 && "!$&'()*+,-./:;=?@_~"sv.contains(static_cast<char>(code_point));

    // Everything above U+10FFFD is either a noncharacter (U+10FFFE, U+10FFFF) or not a Unicode code point at all.
    // This also rejects the parser's end-of-file sentinel.
    if (code_point > 0x10FFFD)
        return false;

    // Surrogates: U+D800 to U+DFFF.
    if (code_point >= 0xD800 && code_point <= 0xDFFF)
        return false;

    // Noncharacters: U+FDD0 to U+FDEF, plus the last two code points of every plane (low 16 bits are 0xFFFE or 0xFFFF).
    if ((code_point >= 0xFDD0 && code_point <= 0xFDEF) || (code_point & 0xFFFE) == 0xFFFE)
        return false;

    return true;
}

// Logs a URL "validation error" when URL_PARSER_DEBUG is enabled; it never aborts parsing by itself.
// The location defaults to SourceLocation::current(), so callers normally pass nothing.
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
{
    if constexpr (URL_PARSER_DEBUG)
        dbgln("URLParser::parse: Validation error! {}", location);
}

// Parses the host of a non-special URL ("opaque host").
// Returns an empty Optional if the input contains a forbidden host character (other than '%'),
// otherwise the input percent-encoded with the C0 control percent-encode set.
static Optional<DeprecatedString> parse_opaque_host(StringView input)
{
    auto const forbidden_characters = "\0\t\n\r #/:<>?@[\\]^|"sv;
    for (auto character : input) {
        if (!forbidden_characters.contains(character))
            continue;
        report_validation_error();
        return {};
    }

    // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
    // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
    return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
}

// Placeholder for the IPv4 parser: currently hands the input back unchanged and never fails.
static Optional<DeprecatedString> parse_ipv4_address(StringView input)
{
    // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
    return DeprecatedString(input);
}

// https://url.spec.whatwg.org/#concept-host-parser
// NOTE: This is a very bare-bones implementation.
// Returns the parsed host, or an empty Optional if the input is rejected.
// Bracketed (IPv6) input is not supported yet and ends in TODO().
static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false)
{
    if (input.starts_with('[')) {
        if (!input.ends_with(']')) {
            report_validation_error();
            return {};
        }
        // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
        TODO();
    }

    // Hosts of non-special URLs are opaque and get their own, simpler treatment.
    if (is_not_special)
        return parse_opaque_host(input);
    VERIFY(!input.is_empty());

    // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
    auto const decoded_domain = URL::percent_decode(input);
    // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
    auto const& ascii_domain = decoded_domain;

    // Reject the domain as soon as one forbidden host character shows up.
    auto const forbidden_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv;
    for (auto character : ascii_domain.view()) {
        if (forbidden_characters.contains(character)) {
            report_validation_error();
            return {};
        }
    }

    return parse_ipv4_address(ascii_domain);
}

// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
// True if input begins with an ASCII letter followed by ':' or '|', and that pair is either
// the whole input or is followed by one of '/', '\', '?' or '#'.
constexpr bool starts_with_windows_drive_letter(StringView input)
{
    auto const length = input.length();
    if (length < 2)
        return false;

    bool const has_letter = is_ascii_alpha(input[0]);
    bool const has_separator = input[1] == ':' || input[1] == '|';
    if (!(has_letter && has_separator))
        return false;

    return length == 2 || "/\\?#"sv.contains(input[2]);
}

// True if input is exactly two characters: an ASCII letter followed by ':' or '|'.
constexpr bool is_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':' || input[1] == '|';
}

// True if input is exactly two characters: an ASCII letter followed by ':' (the '|' form is not normalized).
constexpr bool is_normalized_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    return input[1] == ':' && is_ascii_alpha(input[0]);
}

// True if input is "." or its percent-encoded spelling "%2e" (compared case-insensitively).
constexpr bool is_single_dot_path_segment(StringView input)
{
    if (input == "."sv)
        return true;
    return input.equals_ignoring_case("%2e"sv);
}

// True if input is ".." or any spelling of it where one or both dots are written as "%2e" (compared case-insensitively).
constexpr bool is_double_dot_path_segment(StringView input)
{
    if (input == ".."sv)
        return true;
    if (input.equals_ignoring_case(".%2e"sv))
        return true;
    if (input.equals_ignoring_case("%2e."sv))
        return true;
    return input.equals_ignoring_case("%2e%2e"sv);
}

// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
// Percent-encodes input byte by byte: bytes outside percent_encode_set are copied through, all others
// are written as "%XX" (uppercase hex). If space_as_plus is true, 0x20 (SP) is written as '+' instead.
static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false)
{
    // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.

    StringBuilder output;

    // 3. For each byte of encodeOutput converted to a byte sequence:
    for (auto character : input) {
        // NOTE: StringView iterates over (possibly signed) chars. Work on the unsigned byte value so that bytes >= 0x80
        //       neither sign-extend into the isomorph code point nor get formatted as a negative number.
        auto const byte = static_cast<u8>(character);

        // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
        if (space_as_plus && byte == ' ') {
            output.append('+');
            continue;
        }

        // 2. Let isomorph be a code point whose value is byte’s value.
        u32 isomorph = byte;

        // 3. Assert: percentEncodeSet includes all non-ASCII code points.

        // 4. If isomorph is not in percentEncodeSet, then append isomorph to output.
        if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
            output.append_code_point(isomorph);
        }

        // 5. Otherwise, percent-encode byte and append the result to output.
        else {
            output.appendff("%{:02X}", byte);
        }
    }

    // 6. Return output.
    return output.to_deprecated_string();
}

// https://fetch.spec.whatwg.org/#data-urls
// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
// Splits "data:<mime type>[;base64],<body>" into its parts. Returns an empty Optional if there is no comma.
// The caller must make sure raw_input starts with "data:".
Optional<URL> URLParser::parse_data_url(StringView raw_input)
{
    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
    VERIFY(raw_input.starts_with("data:"sv));

    // Everything after the five characters of "data:".
    auto const after_scheme = raw_input.substring_view(5);
    auto const comma_position = after_scheme.find(',');
    if (!comma_position.has_value())
        return {};

    auto mime_type = StringUtils::trim(after_scheme.substring_view(0, comma_position.value()), "\t\n\f\r "sv, TrimMode::Both);

    // A trailing "base64" (any case) only counts as the encoding marker if a ';' precedes it, optionally separated by spaces.
    bool is_base64_encoded = false;
    if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
        auto const without_marker = StringUtils::trim(mime_type.substring_view(0, mime_type.length() - 6), " "sv, TrimMode::Right);
        if (without_marker.ends_with(';')) {
            mime_type = without_marker.substring_view(0, without_marker.length() - 1);
            is_base64_encoded = true;
        }
    }

    // A missing MIME type (or one consisting only of parameters) defaults to text/plain.
    // NOTE: mime_type may point into this builder afterwards, so it has to outlive the URL construction below.
    StringBuilder defaulted_mime_type;
    if (mime_type.is_empty() || mime_type.starts_with(";"sv)) {
        defaulted_mime_type.append("text/plain"sv);
        defaulted_mime_type.append(mime_type);
        mime_type = defaulted_mime_type.string_view();
    }

    auto body = URL::percent_decode(after_scheme.substring_view(comma_position.value() + 1));

    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
    URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded };
    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
    return url;
}

// https://url.spec.whatwg.org/#concept-basic-url-parser
// Runs the basic URL parser state machine over raw_input and returns the resulting URL.
//   raw_input:      The string to parse. An empty string yields a copy of *base_url (or a default-constructed URL without a base);
//                   input starting with "data:" is handed to parse_data_url().
//   base_url:       Base URL that relative references are resolved against; may be null.
//   url:            URL to continue working on. If absent, a fresh URL is created and leading/trailing C0 control or space
//                   characters are stripped from the input.
//   state_override: State to start in instead of State::SchemeStart.
// On failure a default-constructed URL is returned; m_valid is only set to true once parsing finished successfully.
// NOTE: This parser assumes a UTF-8 encoding.
// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
//       validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
//       future for validation of URLs, which would then lead to infinite recursion.
//       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
//       variables' values here, not what the URL class presents to its users.
// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
//       some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
//       everything before setting the member variables.
URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
{
    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
    if (raw_input.is_empty())
        return base_url ? *base_url : URL {};

    if (raw_input.starts_with("data:"sv)) {
        auto maybe_url = parse_data_url(raw_input);
        if (!maybe_url.has_value())
            return {};
        return maybe_url.release_value();
    }

    size_t start_index = 0;
    size_t end_index = raw_input.length();
    if (!url.has_value()) {
        url = URL();

        // NOTE: This removes all leading and trailing C0 control or space characters.
        //       Bytes >= 0x80 are negative as i8 and therefore never count as C0 control or space.
        bool has_validation_error = false;
        for (size_t i = 0; i < raw_input.length(); ++i) {
            i8 ch = raw_input[i];
            if (0 <= ch && ch <= 0x20) {
                ++start_index;
                has_validation_error = true;
            } else {
                break;
            }
        }
        for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
            i8 ch = raw_input[i];
            if (0 <= ch && ch <= 0x20) {
                --end_index;
                has_validation_error = true;
            } else {
                break;
            }
        }
        if (has_validation_error)
            report_validation_error();
    }
    // NOTE: An input consisting only of C0 control or space characters leaves nothing to parse.
    if (start_index >= end_index)
        return {};

    DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index);

    // NOTE: This replaces all ASCII tab and newline characters (U+0009, U+000A and U+000D) with nothing.
    if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv) || processed_input.contains("\r"sv)) {
        report_validation_error();
        processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All).replace("\r"sv, ""sv, ReplaceMode::All);
    }

    State state = state_override.value_or(State::SchemeStart);
    StringBuilder buffer;
    bool at_sign_seen = false;
    bool inside_brackets = false;
    bool password_token_seen = false;

    Utf8View input(processed_input);
    Utf8CodePointIterator iterator = input.begin();

    // Returns the input following the current code point ("remaining" in the spec).
    auto get_remaining = [&input, &iterator] {
        return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
    };

    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
    //       ++iterator : "increase pointer by 1"
    //       continue   : "decrease pointer by 1"
    for (;;) {
        u32 code_point = end_of_file;
        if (!iterator.done())
            code_point = *iterator;

        if constexpr (URL_PARSER_DEBUG) {
            if (code_point == end_of_file)
                dbgln("URLParser::parse: {} state with EOF.", state_name(state));
            else if (is_ascii_printable(code_point))
                dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
            else
                dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
        }

        switch (state) {
        case State::SchemeStart:
            if (is_ascii_alpha(code_point)) {
                buffer.append_as_lowercase(code_point);
                state = State::Scheme;
            } else {
                state = State::NoScheme;
                continue;
            }
            break;
        case State::Scheme:
            if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
                buffer.append_as_lowercase(code_point);
            } else if (code_point == ':') {
                url->m_scheme = buffer.to_deprecated_string();
                buffer.clear();
                if (url->scheme() == "file") {
                    if (!get_remaining().starts_with("//"sv)) {
                        report_validation_error();
                    }
                    state = State::File;
                } else if (url->is_special()) {
                    if (base_url && base_url->m_scheme == url->m_scheme)
                        state = State::SpecialRelativeOrAuthority;
                    else
                        state = State::SpecialAuthoritySlashes;
                } else if (get_remaining().starts_with("/"sv)) {
                    state = State::PathOrAuthority;
                    ++iterator;
                } else {
                    url->m_cannot_be_a_base_url = true;
                    url->append_path("");
                    state = State::CannotBeABaseUrlPath;
                }
            } else {
                // NOTE: What looked like a scheme was not one after all; start over from the beginning without a scheme.
                buffer.clear();
                state = State::NoScheme;
                iterator = input.begin();
                continue;
            }
            break;
        case State::NoScheme:
            if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
                report_validation_error();
                return {};
            } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
                url->m_scheme = base_url->m_scheme;
                url->m_paths = base_url->m_paths;
                url->m_query = base_url->m_query;
                url->m_fragment = "";
                url->m_cannot_be_a_base_url = true;
                state = State::Fragment;
            } else if (base_url->m_scheme != "file") {
                state = State::Relative;
                continue;
            } else {
                state = State::File;
                continue;
            }
            break;
        case State::SpecialRelativeOrAuthority:
            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
                state = State::SpecialAuthorityIgnoreSlashes;
                ++iterator;
            } else {
                report_validation_error();
                state = State::Relative;
                continue;
            }
            break;
        case State::PathOrAuthority:
            if (code_point == '/') {
                state = State::Authority;
            } else {
                state = State::Path;
                continue;
            }
            break;
        case State::Relative:
            url->m_scheme = base_url->m_scheme;
            if (code_point == '/') {
                state = State::RelativeSlash;
            } else if (url->is_special() && code_point == '\\') {
                report_validation_error();
                state = State::RelativeSlash;
            } else {
                url->m_username = base_url->m_username;
                url->m_password = base_url->m_password;
                url->m_host = base_url->m_host;
                url->m_port = base_url->m_port;
                url->m_paths = base_url->m_paths;
                url->m_query = base_url->m_query;

                if (code_point == '?') {
                    url->m_query = "";
                    state = State::Query;
                } else if (code_point == '#') {
                    url->m_fragment = "";
                    state = State::Fragment;
                } else if (code_point != end_of_file) {
                    url->m_query = {};
                    if (url->m_paths.size())
                        url->m_paths.remove(url->m_paths.size() - 1);
                    state = State::Path;
                    continue;
                }
            }
            break;
        case State::RelativeSlash:
            if (url->is_special() && (code_point == '/' || code_point == '\\')) {
                if (code_point == '\\')
                    report_validation_error();
                state = State::SpecialAuthorityIgnoreSlashes;
            } else if (code_point == '/') {
                state = State::Authority;
            } else {
                url->m_username = base_url->m_username;
                url->m_password = base_url->m_password;
                url->m_host = base_url->m_host;
                url->m_port = base_url->m_port;
                state = State::Path;
                continue;
            }
            break;
        case State::SpecialAuthoritySlashes:
            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
                state = State::SpecialAuthorityIgnoreSlashes;
                ++iterator;
            } else {
                report_validation_error();
                state = State::SpecialAuthorityIgnoreSlashes;
                continue;
            }
            break;
        case State::SpecialAuthorityIgnoreSlashes:
            if (code_point != '/' && code_point != '\\') {
                state = State::Authority;
                continue;
            } else {
                report_validation_error();
            }
            break;
        case State::Authority:
            if (code_point == '@') {
                report_validation_error();
                if (at_sign_seen) {
                    auto content = buffer.to_deprecated_string();
                    buffer.clear();
                    buffer.append("%40"sv);
                    buffer.append(content);
                }
                at_sign_seen = true;
                // NOTE: The credentials collected so far live in buffer; builder is only scratch space used to
                //       re-encode them one code point at a time.
                StringBuilder builder;
                for (auto c : Utf8View(buffer.string_view())) {
                    if (c == ':' && !password_token_seen) {
                        password_token_seen = true;
                        continue;
                    }
                    builder.clear();
                    if (password_token_seen) {
                        builder.append(url->password());
                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
                        url->m_password = URL::percent_decode(builder.string_view());
                    } else {
                        builder.append(url->username());
                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
                        url->m_username = URL::percent_decode(builder.string_view());
                    }
                }
                buffer.clear();
            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
                if (at_sign_seen && buffer.is_empty()) {
                    report_validation_error();
                    return {};
                }
                // NOTE: This decreases the iterator by the number of code points in buffer plus one.
                iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
                buffer.clear();
                state = State::Host;
            } else {
                buffer.append_code_point(code_point);
            }
            break;
        case State::Host:
        case State::Hostname:
            if (code_point == ':' && !inside_brackets) {
                if (buffer.is_empty()) {
                    report_validation_error();
                    return {};
                }
                auto host = parse_host(buffer.string_view(), !url->is_special());
                if (!host.has_value())
                    return {};
                url->m_host = host.release_value();
                buffer.clear();
                state = State::Port;
            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
                if (url->is_special() && buffer.is_empty()) {
                    report_validation_error();
                    return {};
                }
                auto host = parse_host(buffer.string_view(), !url->is_special());
                if (!host.has_value())
                    return {};
                url->m_host = host.release_value();
                buffer.clear();
                state = State::Port;
                continue;
            } else if (code_point == '[') {
                // NOTE: Unlike the spec, the brackets themselves are not appended to buffer, so parse_host() never sees
                //       a bracketed (IPv6) host here; its IPv6 branch currently ends in TODO().
                inside_brackets = true;
            } else if (code_point == ']') {
                inside_brackets = false;
            } else {
                buffer.append_code_point(code_point);
            }
            break;
        case State::Port:
            if (is_ascii_digit(code_point)) {
                buffer.append_code_point(code_point);
            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
                if (!buffer.is_empty()) {
                    auto port = buffer.string_view().to_uint();
                    if (!port.has_value() || port.value() > 65535) {
                        report_validation_error();
                        return {};
                    }
                    // NOTE: The scheme's default port is stored as "no port".
                    if (port.value() == URL::default_port_for_scheme(url->scheme()))
                        url->m_port = {};
                    else
                        url->m_port = port.value();
                    buffer.clear();
                }
                state = State::PathStart;
                continue;
            } else {
                report_validation_error();
                return {};
            }
            break;
        case State::File:
            url->m_scheme = "file";
            url->m_host = "";
            if (code_point == '/' || code_point == '\\') {
                if (code_point == '\\')
                    report_validation_error();
                state = State::FileSlash;
            } else if (base_url && base_url->m_scheme == "file") {
                url->m_host = base_url->m_host;
                url->m_paths = base_url->m_paths;
                url->m_query = base_url->m_query;
                if (code_point == '?') {
                    url->m_query = "";
                    state = State::Query;
                } else if (code_point == '#') {
                    url->m_fragment = "";
                    state = State::Fragment;
                } else if (code_point != end_of_file) {
                    url->m_query = {};
                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
                        if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
                            url->m_paths.remove(url->m_paths.size() - 1);
                    } else {
                        report_validation_error();
                        url->m_paths.clear();
                    }
                    state = State::Path;
                    continue;
                }
            } else {
                // Neither a slash nor a file base URL to inherit from: the rest of the input is the path
                // (e.g. "file:foo" without a base).
                state = State::Path;
                continue;
            }
            break;
        case State::FileSlash:
            if (code_point == '/' || code_point == '\\') {
                if (code_point == '\\')
                    report_validation_error();
                state = State::FileHost;
            } else {
                if (base_url && base_url->m_scheme == "file") {
                    url->m_host = base_url->m_host;
                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
                    // NOTE: The base URL may have no path segments at all, so check before looking at the first one.
                    if (!starts_with_windows_drive_letter(substring_from_pointer) && !base_url->m_paths.is_empty() && is_normalized_windows_drive_letter(base_url->m_paths[0]))
                        url->append_path(base_url->m_paths[0]);
                }
                // The path state is entered regardless of whether there is a file base URL
                // (e.g. "file:/foo" without a base).
                state = State::Path;
                continue;
            }
            break;
        case State::FileHost:
            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
                if (is_windows_drive_letter(buffer.string_view())) {
                    // NOTE: buffer is intentionally not cleared here; the path state picks the drive letter up from it.
                    report_validation_error();
                    state = State::Path;
                } else if (buffer.is_empty()) {
                    url->m_host = "";
                    state = State::PathStart;
                } else {
                    // The spec passes "url is not special" here, which is false for file URLs.
                    auto host = parse_host(buffer.string_view(), !url->is_special());
                    if (!host.has_value())
                        return {};
                    if (host.value() == "localhost")
                        host = "";
                    url->m_host = host.release_value();
                    buffer.clear();
                    state = State::PathStart;
                }
                continue;
            } else {
                buffer.append_code_point(code_point);
            }
            break;
        case State::PathStart:
            if (url->is_special()) {
                if (code_point == '\\')
                    report_validation_error();
                state = State::Path;
                if (code_point != '/' && code_point != '\\')
                    continue;
            } else if (code_point == '?') {
                url->m_query = "";
                state = State::Query;
            } else if (code_point == '#') {
                url->m_fragment = "";
                state = State::Fragment;
            } else if (code_point != end_of_file) {
                state = State::Path;
                if (code_point != '/')
                    continue;
            }
            break;
        case State::Path:
            if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
                if (url->is_special() && code_point == '\\')
                    report_validation_error();
                if (is_double_dot_path_segment(buffer.string_view())) {
                    // NOTE: "Shorten" the path, but never remove a lone normalized Windows drive letter of a file URL.
                    if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
                        url->m_paths.remove(url->m_paths.size() - 1);
                    if (code_point != '/' && !(url->is_special() && code_point == '\\'))
                        url->append_path("");
                } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
                    url->append_path("");
                } else if (!is_single_dot_path_segment(buffer.string_view())) {
                    if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
                        // NOTE: Normalize the drive letter: "C|" becomes "C:".
                        auto drive_letter = buffer.string_view()[0];
                        buffer.clear();
                        buffer.append(drive_letter);
                        buffer.append(':');
                    }
                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
                    url->append_path(URL::percent_decode(buffer.string_view()));
                }
                buffer.clear();
                if (code_point == '?') {
                    url->m_query = "";
                    state = State::Query;
                } else if (code_point == '#') {
                    url->m_fragment = "";
                    state = State::Fragment;
                }
            } else {
                if (!is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
                URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
            }
            break;
        case State::CannotBeABaseUrlPath:
            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
            // NOTE: Verify that the assumptions required for this simplification are correct.
            VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
            if (code_point == '?') {
                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
                url->m_paths[0] = URL::percent_decode(buffer.string_view());
                url->m_query = "";
                state = State::Query;
            } else if (code_point == '#') {
                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
                url->m_paths[0] = URL::percent_decode(buffer.string_view());
                url->m_fragment = "";
                state = State::Fragment;
            } else {
                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
                if (code_point != end_of_file) {
                    URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
                } else {
                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
                    url->m_paths[0] = URL::percent_decode(buffer.string_view());
                }
            }
            break;
        case State::Query:
            // https://url.spec.whatwg.org/#query-state
            if (code_point == end_of_file || code_point == '#') {
                VERIFY(url->m_query == "");
                auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
                url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set);
                buffer.clear();
                if (code_point == '#') {
                    url->m_fragment = "";
                    state = State::Fragment;
                }
            } else if (code_point != end_of_file) {
                if (!is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
                buffer.append_code_point(code_point);
            }
            break;
        case State::Fragment:
            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
            if (code_point != end_of_file) {
                if (!is_url_code_point(code_point) && code_point != '%')
                    report_validation_error();
                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
                buffer.append_code_point(code_point);
            } else {
                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
                url->m_fragment = URL::percent_decode(buffer.string_view());
                buffer.clear();
            }
            break;
        default:
            VERIFY_NOT_REACHED();
        }

        // NOTE: The EOF code point has been handled by the current state at this point, so we are done.
        if (iterator.done())
            break;
        ++iterator;
    }

    url->m_valid = true;
    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
    return url.release_value();
}

}
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								/*
 								 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 								 *
 								 * SPDX-License-Identifier: BSD-2-Clause
 								 */
-												Everywhere: Replace ctype.h to avoid narrowing conversions

This replaces ctype.h with CharacterTypes.h everywhere I could find
issues with narrowing conversions. While using it will probably make
sense almost everywhere in the future, the most critical places should
have been addressed.

											
										
										
											2021-06-01 19:18:08 +00:00
+								#include <AK/CharacterTypes.h>
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								#include <AK/Debug.h>
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								#include <AK/DeprecatedString.h>
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								#include <AK/Optional.h>
 								#include <AK/SourceLocation.h>
 								#include <AK/StringBuilder.h>
 								#include <AK/StringUtils.h>
 								#include <AK/URLParser.h>
 								#include <AK/Utf8View.h>
 								namespace AK {
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								// NOTE: This is similar to the LibC macro EOF = -1.
 								constexpr u32 end_of_file = 0xFFFFFFFF;
-												AK: Fix 'constexpr' attribute on non-constexpr function

is_url_code_point invokes StringView::contains, which never was and
cannot become constexpr.

											
										
										
											2022-09-12 16:32:52 +00:00
+								static bool is_url_code_point(u32 code_point)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
 								    return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
 								}
-												AK: Update URLParser.{cpp,h} to use east const

											
										
										
											2021-06-03 10:03:56 +00:00
+								static void report_validation_error(SourceLocation const& location = SourceLocation::current())
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
 								}
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								static Optional<DeprecatedString> parse_opaque_host(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
-												AK+Everywhere: Fix data corruption due to code-point-to-char conversion

In particular, StringView::contains(char) is often used with a u32
code point. When this is done, the compiler will for some reason allow
data corruption to occur silently.

In fact, this is one of two reasons for the following OSS Fuzz issue:
https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=49184
This is probably a very old bug.

In the particular case of URLParser, AK::is_url_code_point got confused:
    return /* ... */ || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
If code_point is a large code point that happens to have the correct
lower bytes, AK::is_url_code_point is then convinced that the given
code point is okay, even if it is actually problematic.

This commit fixes *only* the silent data corruption due to the erroneous
conversion, and does not fully resolve OSS-Fuzz#49184.

											
										
										
											2022-09-12 14:31:16 +00:00
+								    auto forbidden_host_characters_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv;
 								    for (auto character : forbidden_host_characters_excluding_percent) {
 								        if (input.contains(character)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            report_validation_error();
 								            return {};
 								        }
 								    }
 								    // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
 								    // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
 								    return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
 								}
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								static Optional<DeprecatedString> parse_ipv4_address(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
 								    return input;
 								}
 								// https://url.spec.whatwg.org/#concept-host-parser
 								// NOTE: This is a very bare-bones implementation.
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    if (input.starts_with('[')) {
 								        if (!input.ends_with(']')) {
 								            report_validation_error();
 								            return {};
 								        }
 								        // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
 								        TODO();
 								    }
 								    if (is_not_special)
 								        return parse_opaque_host(input);
 								    VERIFY(!input.is_empty());
 								    // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
 								    auto domain = URL::percent_decode(input);
 								    // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
 								    auto& ascii_domain = domain;
-												AK+Everywhere: Fix data corruption due to code-point-to-char conversion

In particular, StringView::contains(char) is often used with a u32
code point. When this is done, the compiler will for some reason allow
data corruption to occur silently.

In fact, this is one of two reasons for the following OSS Fuzz issue:
https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=49184
This is probably a very old bug.

In the particular case of URLParser, AK::is_url_code_point got confused:
    return /* ... */ || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
If code_point is a large code point that happens to have the correct
lower bytes, AK::is_url_code_point is then convinced that the given
code point is okay, even if it is actually problematic.

This commit fixes *only* the silent data corruption due to the erroneous
conversion, and does not fully resolve OSS-Fuzz#49184.

											
										
										
											2022-09-12 14:31:16 +00:00
+								    auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv;
 								    for (auto character : forbidden_host_characters) {
 								        if (ascii_domain.view().contains(character)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            report_validation_error();
 								            return {};
 								        }
 								    }
 								    auto ipv4_host = parse_ipv4_address(ascii_domain);
 								    return ipv4_host;
 								}
-												AK: Fix bad parsing of some file:/// URLs with base URL

We were dropping the base URL path components in the resulting URL due
to mistakenly determining the input URL to start with a Windows drive
letter. Fix this, add a spec link, and a test.

											
										
										
											2022-09-20 13:38:53 +00:00
+								// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								constexpr bool starts_with_windows_drive_letter(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    if (input.length() < 2)
 								        return false;
-												AK: Fix bad parsing of some file:/// URLs with base URL

We were dropping the base URL path components in the resulting URL due
to mistakenly determining the input URL to start with a Windows drive
letter. Fix this, add a spec link, and a test.

											
										
										
											2022-09-20 13:38:53 +00:00
+								    if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        return false;
 								    if (input.length() == 2)
 								        return true;
 								    return "/\\?#"sv.contains(input[2]);
 								}
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								constexpr bool is_windows_drive_letter(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
 								}
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								constexpr bool is_normalized_windows_drive_letter(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
 								}
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								constexpr bool is_single_dot_path_segment(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    return input == "."sv || input.equals_ignoring_case("%2e"sv);
 								}
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								constexpr bool is_double_dot_path_segment(StringView input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    return input == ".."sv || input.equals_ignoring_case(".%2e"sv) || input.equals_ignoring_case("%2e."sv) || input.equals_ignoring_case("%2e%2e"sv);
 								}
-												AK: Don't destructively re-encode query strings in the URL parser

We were decoding and then re-encoding the query string in URLs.
This round-trip caused us to lose information about plus ('+')
ASCII characters encoded as "%2B".

											
										
										
											2022-04-09 22:48:15 +00:00
+								// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false)
-												AK: Don't destructively re-encode query strings in the URL parser

We were decoding and then re-encoding the query string in URLs.
This round-trip caused us to lose information about plus ('+')
ASCII characters encoded as "%2B".

											
										
										
											2022-04-09 22:48:15 +00:00
+								{
 								    // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
 								    StringBuilder output;
 								    // 3. For each byte of encodeOutput converted to a byte sequence:
 								    for (auto byte : input) {
 								        // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
 								        if (space_as_plus && byte == ' ') {
 								            output.append('+');
 								            continue;
 								        }
 								        // 2. Let isomorph be a code point whose value is byte’s value.
 								        u32 isomorph = byte;
 								        // 3. Assert: percentEncodeSet includes all non-ASCII code points.
 								        // 4. If isomorph is not in percentEncodeSet, then append isomorph to output.
 								        if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
 								            output.append_code_point(isomorph);
 								        }
 								        // 5. Otherwise, percent-encode byte and append the result to output.
 								        else {
 								            output.appendff("%{:02X}", byte);
 								        }
 								    }
 								    // 6. Return output.
-												Everywhere: Rename to_{string => deprecated_string}() where applicable

This will make it easier to support both string types at the same time
while we convert code, and to track down remaining uses.

One big exception is Value::to_string() in LibJS, where the name is
dictated by the ToString AO.

											
										
										
											2022-12-06 01:12:49 +00:00
+								    return output.to_deprecated_string();
-												AK: Don't destructively re-encode query strings in the URL parser

We were decoding and then re-encoding the query string in URLs.
This round-trip caused us to lose information about plus ('+')
ASCII characters encoded as "%2B".

											
										
										
											2022-04-09 22:48:15 +00:00
+								}
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								// https://fetch.spec.whatwg.org/#data-urls
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								Optional<URL> URLParser::parse_data_url(StringView raw_input)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    VERIFY(raw_input.starts_with("data:"sv));
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    auto input = raw_input.substring_view(5);
 								    auto comma_offset = input.find(',');
 								    if (!comma_offset.has_value())
 								        return {};
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r "sv, TrimMode::Both);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    auto encoded_body = input.substring_view(comma_offset.value() + 1);
 								    auto body = URL::percent_decode(encoded_body);
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								    bool is_base64_encoded = false;
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								        auto substring_view = mime_type.substring_view(0, mime_type.length() - 6);
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								        auto trimmed_substring_view = StringUtils::trim(substring_view, " "sv, TrimMode::Right);
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								        if (trimmed_substring_view.ends_with(';')) {
 								            is_base64_encoded = true;
 								            mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
 								        }
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    }
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								    StringBuilder builder;
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    if (mime_type.starts_with(";"sv) || mime_type.is_empty()) {
 								        builder.append("text/plain"sv);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        builder.append(mime_type);
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								        mime_type = builder.string_view();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    }
-												AK: Improve the parsing of data urls

Improve the parsing of data urls in URLParser to bring it more up-to-
spec. At the moment, we cannot parse the components of the MIME type
since it is represented as a string, but the spec requires it to be
parsed as a "MIME type record".

											
										
										
											2021-08-04 16:29:06 +00:00
+								    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded };
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
 								    return url;
 								}
 								// https://url.spec.whatwg.org/#concept-basic-url-parser
 								// NOTE: This parser assumes a UTF-8 encoding.
 								// NOTE: Refrain from using the URL class's setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
 								//       validation, which is strictly unnecessary since we set m_valid=true at the end anyway. Furthermore, this algorithm may be used in the
 								//       future for validation of URLs, which would then lead to infinite recursion.
 								//       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
 								//       variables' values here, not what the URL class presents to its users.
 								// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
 								//       some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
 								//       everything before setting the member variables.
-												Everywhere: Pass AK::StringView by value

											
										
										
											2021-11-10 23:55:02 +00:00
+								URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								{
 								    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
 								    if (raw_input.is_empty())
-												AK: Use base URL when the specified URL is empty

											
										
										
											2022-12-27 19:17:30 +00:00
+								        return base_url ? *base_url : URL {};
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    if (raw_input.starts_with("data:"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        auto maybe_url = parse_data_url(raw_input);
 								        if (!maybe_url.has_value())
 								            return {};
 								        return maybe_url.release_value();
 								    }
 								    size_t start_index = 0;
 								    size_t end_index = raw_input.length();
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								    if (!url.has_value()) {
 								        url = URL();
 								        // NOTE: This removes all leading and trailing C0 control or space characters.
 								        bool has_validation_error = false;
 								        for (size_t i = 0; i < raw_input.length(); ++i) {
 								            i8 ch = raw_input[i];
 								            if (0 <= ch && ch <= 0x20) {
 								                ++start_index;
 								                has_validation_error = true;
 								            } else {
 								                break;
 								            }
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								        for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
 								            i8 ch = raw_input[i];
 								            if (0 <= ch && ch <= 0x20) {
 								                --end_index;
 								                has_validation_error = true;
 								            } else {
 								                break;
 								            }
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								        if (has_validation_error)
 								            report_validation_error();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    }
 								    if (start_index >= end_index)
 								        return {};
-												AK+Everywhere: Rename String to DeprecatedString

We have a new, improved string type coming up in AK (OOM aware, no null
state), and while it's going to use UTF-8, the name UTF8String is a
mouthful - so let's free up the String name by renaming the existing
class.
Making the old one have an annoying name will hopefully also help with
quick adoption :^)

											
										
										
											2022-12-04 18:02:33 +00:00
+								    DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
 								    // NOTE: This replaces all tab and newline characters with nothing.
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								    if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        report_validation_error();
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								        processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								    State state = state_override.value_or(State::SchemeStart);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    StringBuilder buffer;
 								    bool at_sign_seen = false;
 								    bool inside_brackets = false;
 								    bool password_token_seen = false;
 								    Utf8View input(processed_input);
-												AK: Rename Utf8CodepointIterator => Utf8CodePointIterator

											
										
										
											2021-06-01 07:45:52 +00:00
+								    Utf8CodePointIterator iterator = input.begin();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
 								    auto get_remaining = [&input, &iterator] {
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								    };
 								    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
 								    //       ++iterator : "increase pointer by 1"
 								    //       continue   : "decrease pointer by 1"
 								    for (;;) {
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								        u32 code_point = end_of_file;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        if (!iterator.done())
 								            code_point = *iterator;
 								        if constexpr (URL_PARSER_DEBUG) {
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            if (code_point == end_of_file)
-												AK: Make debugging URLParser easier

This patch adds a state_name method to URLParser to convert a state to a
string. With this, the debugging statements now display the state names.

Furthermore, this fixes a bug where non-ASCII code points were
formatted as characters, which fails an assertion in the formatting
system.

											
										
										
											2021-06-03 10:40:04 +00:00
+								                dbgln("URLParser::parse: {} state with EOF.", state_name(state));
 								            else if (is_ascii_printable(code_point))
 								                dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            else
-												AK: Make debugging URLParser easier

This patch adds a state_name method to URLParser to convert a state to a
string. With this, the debugging statements now display the state names.

Furthermore, this fixes a bug where non-ASCII code points were
formatted as characters, which fails an assertion in the formatting
system.

											
										
										
											2021-06-03 10:40:04 +00:00
+								                dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								        }
 								        switch (state) {
 								        case State::SchemeStart:
 								            if (is_ascii_alpha(code_point)) {
 								                buffer.append_as_lowercase(code_point);
 								                state = State::Scheme;
 								            } else {
 								                state = State::NoScheme;
 								                continue;
 								            }
 								            break;
 								        case State::Scheme:
 								            if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
 								                buffer.append_as_lowercase(code_point);
 								            } else if (code_point == ':') {
-												Everywhere: Rename to_{string => deprecated_string}() where applicable

This will make it easier to support both string types at the same time
while we convert code, and tracking down remaining uses.

One big exception is Value::to_string() in LibJS, where the name is
dictated by the ToString AO.

											
										
										
											2022-12-06 01:12:49 +00:00
+								                url->m_scheme = buffer.to_deprecated_string();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                buffer.clear();
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                if (url->scheme() == "file") {
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								                    if (!get_remaining().starts_with("//"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                        report_validation_error();
 								                    }
 								                    state = State::File;
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                } else if (url->is_special()) {
 								                    if (base_url && base_url->m_scheme == url->m_scheme)
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                        state = State::SpecialRelativeOrAuthority;
 								                    else
 								                        state = State::SpecialAuthoritySlashes;
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								                } else if (get_remaining().starts_with("/"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::PathOrAuthority;
 								                    ++iterator;
 								                } else {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_cannot_be_a_base_url = true;
 								                    url->append_path("");
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::CannotBeABaseUrlPath;
 								                }
 								            } else {
 								                buffer.clear();
 								                state = State::NoScheme;
 								                iterator = input.begin();
 								                continue;
 								            }
 								            break;
 								        case State::NoScheme:
 								            if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
 								                report_validation_error();
 								                return {};
 								            } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_scheme = base_url->m_scheme;
 								                url->m_paths = base_url->m_paths;
 								                url->m_query = base_url->m_query;
 								                url->m_fragment = "";
 								                url->m_cannot_be_a_base_url = true;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Fragment;
 								            } else if (base_url->m_scheme != "file") {
 								                state = State::Relative;
 								                continue;
 								            } else {
 								                state = State::File;
 								                continue;
 								            }
 								            break;
 								        case State::SpecialRelativeOrAuthority:
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::SpecialAuthorityIgnoreSlashes;
 								                ++iterator;
 								            } else {
 								                report_validation_error();
 								                state = State::Relative;
 								                continue;
 								            }
 								            break;
 								        case State::PathOrAuthority:
 								            if (code_point == '/') {
 								                state = State::Authority;
 								            } else {
 								                state = State::Path;
 								                continue;
 								            }
 								            break;
 								        case State::Relative:
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            url->m_scheme = base_url->m_scheme;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            if (code_point == '/') {
 								                state = State::RelativeSlash;
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            } else if (url->is_special() && code_point == '\\') {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                report_validation_error();
 								                state = State::RelativeSlash;
 								            } else {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_username = base_url->m_username;
 								                url->m_password = base_url->m_password;
 								                url->m_host = base_url->m_host;
 								                url->m_port = base_url->m_port;
 								                url->m_paths = base_url->m_paths;
 								                url->m_query = base_url->m_query;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
 								                if (code_point == '?') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_query = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Query;
 								                } else if (code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Fragment;
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								                } else if (code_point != end_of_file) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_query = {};
 								                    if (url->m_paths.size())
 								                        url->m_paths.remove(url->m_paths.size() - 1);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Path;
 								                    continue;
 								                }
 								            }
 								            break;
 								        case State::RelativeSlash:
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            if (url->is_special() && (code_point == '/' || code_point == '\\')) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (code_point == '\\')
 								                    report_validation_error();
 								                state = State::SpecialAuthorityIgnoreSlashes;
 								            } else if (code_point == '/') {
 								                state = State::Authority;
 								            } else {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_username = base_url->m_username;
 								                url->m_password = base_url->m_password;
 								                url->m_host = base_url->m_host;
 								                url->m_port = base_url->m_port;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Path;
 								                continue;
 								            }
 								            break;
 								        case State::SpecialAuthoritySlashes:
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::SpecialAuthorityIgnoreSlashes;
 								                ++iterator;
 								            } else {
 								                report_validation_error();
 								                state = State::SpecialAuthorityIgnoreSlashes;
 								                continue;
 								            }
 								            break;
 								        case State::SpecialAuthorityIgnoreSlashes:
 								            if (code_point != '/' && code_point != '\\') {
 								                state = State::Authority;
 								                continue;
 								            } else {
 								                report_validation_error();
 								            }
 								            break;
 								        case State::Authority:
 								            if (code_point == '@') {
 								                report_validation_error();
 								                if (at_sign_seen) {
-												Everywhere: Rename to_{string => deprecated_string}() where applicable

This will make it easier to support both string types at the same time
while we convert code, and tracking down remaining uses.

One big exception is Value::to_string() in LibJS, where the name is
dictated by the ToString AO.

											
										
										
											2022-12-06 01:12:49 +00:00
+								                    auto content = buffer.to_deprecated_string();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    buffer.clear();
-												Everywhere: Add sv suffix to strings relying on StringView(char const*)

Each of these strings would previously rely on StringView's char const*
constructor overload, which would call __builtin_strlen on the string.
Since we now have operator ""sv, we can replace these with much simpler
versions. This opens the door to being able to remove
StringView(char const*).

No functional changes.

											
										
										
											2022-07-11 17:32:29 +00:00
+								                    buffer.append("%40"sv);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    buffer.append(content);
 								                }
 								                at_sign_seen = true;
 								                StringBuilder builder;
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                for (auto c : Utf8View(builder.string_view())) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    if (c == ':' && !password_token_seen) {
 								                        password_token_seen = true;
 								                        continue;
 								                    }
 								                    builder.clear();
 								                    if (password_token_seen) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        builder.append(url->password());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
 								                        // NOTE: This has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        url->m_password = URL::percent_decode(builder.string_view());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    } else {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        builder.append(url->username());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
 								                        // NOTE: This has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        url->m_username = URL::percent_decode(builder.string_view());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    }
 								                }
 								                buffer.clear();
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (at_sign_seen && buffer.is_empty()) {
 								                    report_validation_error();
 								                    return {};
 								                }
 								                // NOTE: This decreases the iterator by the number of code points in buffer plus one.
 								                iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
 								                buffer.clear();
 								                state = State::Host;
 								            } else {
 								                buffer.append_code_point(code_point);
 								            }
 								            break;
 								        case State::Host:
 								        case State::Hostname:
 								            if (code_point == ':' && !inside_brackets) {
 								                if (buffer.is_empty()) {
 								                    report_validation_error();
 								                    return {};
 								                }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                auto host = parse_host(buffer.string_view(), !url->is_special());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (!host.has_value())
 								                    return {};
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_host = host.release_value();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                buffer.clear();
 								                state = State::Port;
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
 								                if (url->is_special() && buffer.is_empty()) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    report_validation_error();
 								                    return {};
 								                }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                auto host = parse_host(buffer.string_view(), !url->is_special());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (!host.has_value())
 								                    return {};
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_host = host.value();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                buffer.clear();
 								                state = State::Port;
 								                continue;
 								            } else if (code_point == '[') {
 								                inside_brackets = true;
 								            } else if (code_point == ']') {
 								                inside_brackets = false;
 								            } else {
 								                buffer.append_code_point(code_point);
 								            }
 								            break;
 								        case State::Port:
 								            if (is_ascii_digit(code_point)) {
 								                buffer.append_code_point(code_point);
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (!buffer.is_empty()) {
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                    auto port = buffer.string_view().to_uint();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    if (!port.has_value() || port.value() > 65535) {
 								                        report_validation_error();
 								                        return {};
 								                    }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    if (port.value() == URL::default_port_for_scheme(url->scheme()))
-												AK: Make URL::m_port an Optional<u16>, Expose raw port getter

Our current way of signalling a missing port with m_port == 0 was
lacking, as 0 is a valid port number in URLs.

											
										
										
											2021-09-13 20:12:16 +00:00
+								                        url->m_port = {};
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    else
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        url->m_port = port.value();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    buffer.clear();
 								                }
 								                state = State::PathStart;
 								                continue;
 								            } else {
 								                report_validation_error();
 								                return {};
 								            }
 								            break;
 								        case State::File:
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            url->m_scheme = "file";
 								            url->m_host = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            if (code_point == '/' || code_point == '\\') {
 								                if (code_point == '\\')
 								                    report_validation_error();
 								                state = State::FileSlash;
 								            } else if (base_url && base_url->m_scheme == "file") {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_host = base_url->m_host;
 								                url->m_paths = base_url->m_paths;
 								                url->m_query = base_url->m_query;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (code_point == '?') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_query = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Query;
 								                } else if (code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Fragment;
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								                } else if (code_point != end_of_file) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_query = {};
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
 								                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
 								                            url->m_paths.remove(url->m_paths.size() - 1);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    } else {
 								                        report_validation_error();
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                        url->m_paths.clear();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    }
 								                    state = State::Path;
 								                    continue;
 								                }
 								            }
 								            break;
 								        case State::FileSlash:
 								            if (code_point == '/' || code_point == '\\') {
 								                if (code_point == '\\')
 								                    report_validation_error();
 								                state = State::FileHost;
 								            } else if (base_url && base_url->m_scheme == "file") {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_host = base_url->m_host;
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
 								                if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->append_path(base_url->m_paths[0]);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Path;
 								                continue;
 								            }
 								            break;
 								        case State::FileHost:
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                if (is_windows_drive_letter(buffer.string_view())) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    report_validation_error();
 								                    state = State::Path;
 								                } else if (buffer.is_empty()) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_host = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::PathStart;
 								                } else {
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                    auto host = parse_host(buffer.string_view(), true);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    if (!host.has_value())
 								                        return {};
 								                    if (host.value() == "localhost")
 								                        host = "";
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_host = host.release_value();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    buffer.clear();
 								                    state = State::PathStart;
 								                }
 								                continue;
 								            } else {
 								                buffer.append_code_point(code_point);
 								            }
 								            break;
 								        case State::PathStart:
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            if (url->is_special()) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (code_point == '\\')
 								                    report_validation_error();
 								                state = State::Path;
 								                if (code_point != '/' && code_point != '\\')
 								                    continue;
 								            } else if (code_point == '?') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_query = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Query;
 								            } else if (code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Fragment;
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            } else if (code_point != end_of_file) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Path;
 								                if (code_point != '/')
 								                    continue;
 								            }
 								            break;
 								        case State::Path:
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
 								                if (url->is_special() && code_point == '\\')
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    report_validation_error();
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                if (is_double_dot_path_segment(buffer.string_view())) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
 								                        url->m_paths.remove(url->m_paths.size() - 1);
 								                    if (code_point != '/' && !(url->is_special() && code_point == '\\'))
 								                        url->append_path("");
 								                } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
 								                    url->append_path("");
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                } else if (!is_single_dot_path_segment(buffer.string_view())) {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
-												AK: Don't create Utf8View from temporary String in URLParser

This fixes a bug where a Utf8View was created with data from a temporary
string, which was immediately deleted. This led to a use-after-free
issue. This also changes most occurrences of StringBuilder::to_string in
URLParser to use ::string_view(), as the value is passed as StringView
const& most of the time anyway.

This fixes oss-fuzz issue 34973.

											
										
										
											2021-06-08 13:22:02 +00:00
+								                        auto drive_letter = buffer.string_view()[0];
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                        buffer.clear();
 								                        buffer.append(drive_letter);
 								                        buffer.append(':');
 								                    }
 								                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->append_path(URL::percent_decode(buffer.string_view()));
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                }
 								                buffer.clear();
 								                if (code_point == '?') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_query = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Query;
 								                } else if (code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Fragment;
 								                }
 								            } else {
 								                if (!is_url_code_point(code_point) && code_point != '%')
 								                    report_validation_error();
 								                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
 								                URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
 								            }
 								            break;
 								        case State::CannotBeABaseUrlPath:
 								            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
 								            // NOTE: Verify that the assumptions required for this simplification are correct.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								            VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								            if (code_point == '?') {
 								                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_paths[0] = URL::percent_decode(buffer.string_view());
 								                url->m_query = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Query;
 								            } else if (code_point == '#') {
 								                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_paths[0] = URL::percent_decode(buffer.string_view());
 								                url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                state = State::Fragment;
 								            } else {
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    report_validation_error();
 								                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								                if (code_point != end_of_file) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
 								                } else {
 								                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_paths[0] = URL::percent_decode(buffer.string_view());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                }
 								            }
 								            break;
 								        case State::Query:
-												AK: Don't destructively re-encode query strings in the URL parser

We were decoding and then re-encoding the query string in URLs.
This round-trip caused us to lose information about plus ('+')
ASCII characters encoded as "%2B".

											
										
										
											2022-04-09 22:48:15 +00:00
+								            // https://url.spec.whatwg.org/#query-state
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            if (code_point == end_of_file || code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                VERIFY(url->m_query == "");
 								                auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
-												AK: Don't destructively re-encode query strings in the URL parser

We were decoding and then re-encoding the query string in URLs.
This round-trip caused us to lose information about plus ('+')
ASCII characters encoded as "%2B".

											
										
										
											2022-04-09 22:48:15 +00:00
+								                url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set);
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                buffer.clear();
 								                if (code_point == '#') {
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                    url->m_fragment = "";
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                    state = State::Fragment;
 								                }
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            } else if (code_point != end_of_file) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (!is_url_code_point(code_point) && code_point != '%')
 								                    report_validation_error();
 								                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
 								                buffer.append_code_point(code_point);
 								            }
 								            break;
 								        case State::Fragment:
 								            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
-												AK: Stop using U+0000 as end of file code point in URL parser

This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.

											
										
										
											2021-06-03 10:43:08 +00:00
+								            if (code_point != end_of_file) {
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                if (!is_url_code_point(code_point) && code_point != '%')
 								                    report_validation_error();
 								                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
 								                buffer.append_code_point(code_point);
 								            } else {
 								                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								                url->m_fragment = URL::percent_decode(buffer.string_view());
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								                buffer.clear();
 								            }
 								            break;
 								        default:
 								            VERIFY_NOT_REACHED();
 								        }
 								        if (iterator.done())
 								            break;
 								        ++iterator;
 								    }
-												AK: Accept optional url and state override parameters in URLParser

These are required in the specification and used by the web's URL
built-in, this commit also removes the Badge<AK::URL> from URLParser
to allow other classes that need to call the parser directly like the
web's URL built-in to do so.

											
										
										
											2021-09-13 19:34:14 +00:00
+								    url->m_valid = true;
 								    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
 								    return url.release_value();
-												AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.

											
										
										
											2021-05-25 20:13:15 +00:00
+								}
 								}