diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp index 5e90b17b8d1..4204cbcd1f9 100644 --- a/AK/URLParser.cpp +++ b/AK/URLParser.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2021, Max Wipfli + * Copyright (c) 2023, Shannon Booth * * SPDX-License-Identifier: BSD-2-Clause */ @@ -7,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -44,10 +46,184 @@ static Optional parse_opaque_host(StringView input) return URL::percent_encode(input, URL::PercentEncodeSet::C0Control); } -static Optional parse_ipv4_address(StringView input) +struct ParsedIPv4Number { + u32 number { 0 }; + bool validation_error { false }; +}; + +// https://url.spec.whatwg.org/#ipv4-number-parser +static Optional parse_ipv4_number(StringView input) { - // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser. - return input; + // 1. If input is the empty string, then return failure. + if (input.is_empty()) + return {}; + + // 2. Let validationError be false. + bool validation_error = false; + + // 3. Let R be 10. + u8 radix = 10; + + // 4. If input contains at least two code points and the first two code points are either "0X" or "0x", then: + if (input.length() >= 2 && (input.starts_with("0X"sv) || input.starts_with("0x"sv))) { + // 1. Set validationError to true. + validation_error = true; + + // 2. Remove the first two code points from input. + input = input.substring_view(2); + + // 3. Set R to 16. + radix = 16; + } + // 5. Otherwise, if input contains at least two code points and the first code point is U+0030 (0), then: + else if (input.length() >= 2 && input[0] == '0') { + // 1. Set validationError to true. + validation_error = true; + + // 2. Remove the first code point from input. + input = input.substring_view(1); + + // 3. Set R to 8. + radix = 8; + } + + // 6. If input is the empty string, then return (0, true). + if (input.is_empty()) + return ParsedIPv4Number { 0, true }; + + // 7. If input contains a code point that is not a radix-R digit, then return failure. + if (radix == 8) { + if (!all_of(input, [](auto character) { return is_ascii_octal_digit(character); })) + return {}; + } else if (radix == 10) { + if (!all_of(input, [](auto character) { return is_ascii_digit(character); })) + return {}; + } else if (radix == 16) { + if (!all_of(input, [](auto character) { return is_ascii_hex_digit(character); })) + return {}; + } else { + VERIFY_NOT_REACHED(); + } + + // 8. Let output be the mathematical integer value that is represented by input in radix-R notation, using ASCII hex digits for digits with values 0 through 15. + u32 output; + if (radix == 8) + output = StringUtils::convert_to_uint_from_octal(input).release_value(); + else if (radix == 10) + output = input.to_uint().release_value(); + else if (radix == 16) + output = StringUtils::convert_to_uint_from_hex(input).release_value(); + else + VERIFY_NOT_REACHED(); + + // 9. Return (output, validationError). + return ParsedIPv4Number { output, validation_error }; +} + +// https://url.spec.whatwg.org/#concept-ipv4-parser +static Optional parse_ipv4_address(StringView input) +{ + // 1. Let parts be the result of strictly splitting input on U+002E (.). + auto parts = input.split_view("."sv, SplitBehavior::KeepEmpty); + + // 2. If the last item in parts is the empty string, then: + if (parts.last().is_empty()) { + // 1. IPv4-empty-part validation error. + report_validation_error(); + + // 2. If parts’s size is greater than 1, then remove the last item from parts. + if (parts.size() > 1) + parts.take_last(); + } + + // 3. If parts’s size is greater than 4, IPv4-too-many-parts validation error, return failure. + if (parts.size() > 4) { + report_validation_error(); + return {}; + } + + // 4. Let numbers be an empty list. + Vector numbers; + + // 5. For each part of parts: + for (auto const& part : parts) { + // 1. Let result be the result of parsing part. + auto const result = parse_ipv4_number(part); + + // 2. If result is failure, IPv4-non-numeric-part validation error, return failure. + if (!result.has_value()) { + report_validation_error(); + return {}; + } + + // 3. If result[1] is true, IPv4-non-decimal-part validation error. + if (result->validation_error) + report_validation_error(); + + // 4. Append result[0] to numbers. + numbers.append(result->number); + } + + // 6. If any item in numbers is greater than 255, IPv4-out-of-range-part validation error. + // 7. If any but the last item in numbers is greater than 255, then return failure. + for (size_t i = 0; i < numbers.size(); ++i) { + if (numbers[i] > 255) { + report_validation_error(); + if (i != numbers.size() - 1) + return {}; + } + } + + // 8. If the last item in numbers is greater than or equal to 256^(5 − numbers’s size), then return failure. + if (numbers.last() >= pow(256, 5 - numbers.size())) + return {}; + + // 9. Let ipv4 be the last item in numbers. + auto ipv4 = numbers.last(); + + // 10. Remove the last item from numbers. + numbers.take_last(); + + // 11. Let counter be 0. + u8 counter = 0; + + // 12. For each n of numbers: + for (u32 n : numbers) { + // 1. Increment ipv4 by n × 256^(3 − counter). + ipv4 += n * pow(256, 3 - counter); + + // 2. Increment counter by 1. + ++counter; + } + + // 13. Return ipv4. + return ipv4; +} + +// https://url.spec.whatwg.org/#concept-ipv4-serializer +static ErrorOr serialize_ipv4_address(u32 address) +{ + // 1. Let output be the empty string. + // NOTE: Array to avoid prepend. + Array output; + + // 2. Let n be the value of address. + u32 n = address; + + // 3. For each i in the range 1 to 4, inclusive: + for (size_t i = 0; i <= 3; ++i) { + // 1. Prepend n % 256, serialized, to output. + output[3 - i] = n % 256; + + // 2. If i is not 4, then prepend U+002E (.) to output. + // NOTE: done at end + + // 3. Set n to floor(n / 256). + n /= 256; + } + + // 4. Return output. + return String::formatted("{}.{}.{}.{}", output[0], output[1], output[2], output[3]); } // https://url.spec.whatwg.org/#concept-ipv6-serializer @@ -365,15 +541,21 @@ static Optional parse_host(StringView input, bool is_not_speci return result.release_value().to_deprecated_string(); } + // 2. If isNotSpecial is true, then return the result of opaque-host parsing input. if (is_not_special) return parse_opaque_host(input); + + // 3. Assert: input is not the empty string. VERIFY(!input.is_empty()); - // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input. + // FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input. auto domain = URL::percent_decode(input); - // FIXME: Let asciiDomain be the result of running domain to ASCII on domain. + + // FIXME: 5. Let asciiDomain be the result of running domain to ASCII on domain. + // FIXME: 6. If asciiDomain is failure, then return failure. auto& ascii_domain = domain; + // 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure. auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv; for (auto character : forbidden_host_characters) { if (ascii_domain.view().contains(character)) { @@ -382,8 +564,21 @@ static Optional parse_host(StringView input, bool is_not_speci } } - auto ipv4_host = parse_ipv4_address(ascii_domain); - return ipv4_host; + // 8. If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain. + if (is_ascii_digit(ascii_domain[ascii_domain.length() - 1])) { + auto ipv4_host = parse_ipv4_address(ascii_domain); + if (!ipv4_host.has_value()) + return {}; + + auto result = serialize_ipv4_address(*ipv4_host); + if (result.is_error()) + return {}; + + return result.release_value().to_deprecated_string(); + } + + // 9. Return asciiDomain. + return ascii_domain; } // https://url.spec.whatwg.org/#start-with-a-windows-drive-letter diff --git a/Tests/AK/TestURL.cpp b/Tests/AK/TestURL.cpp index 12cda81a51b..cca71b31b81 100644 --- a/Tests/AK/TestURL.cpp +++ b/Tests/AK/TestURL.cpp @@ -458,3 +458,40 @@ TEST_CASE(ipv6_address) EXPECT_EQ(url.is_valid(), false); } } + +TEST_CASE(ipv4_address) +{ + { + constexpr auto ipv4_url = "http://127.0.0.1/index.html"sv; + URL url(ipv4_url); + EXPECT(url.is_valid()); + EXPECT_EQ(url.host(), "127.0.0.1"sv); + } + + { + constexpr auto ipv4_url = "http://0x.0x.0"sv; + URL url(ipv4_url); + EXPECT(url.is_valid()); + EXPECT_EQ(url.host(), "0.0.0.0"sv); + } + + { + constexpr auto bad_ipv4_url = "https://127..0.0.1"sv; + URL url(bad_ipv4_url); + EXPECT(!url.is_valid()); + } + + { + constexpr auto ipv4_url = "http://256"sv; + URL url(ipv4_url); + EXPECT(url.is_valid()); + EXPECT_EQ(url.host(), "0.0.1.0"sv); + } + + { + constexpr auto ipv4_url = "http://888888888"sv; + URL url(ipv4_url); + EXPECT(url.is_valid()); + EXPECT_EQ(url.host(), "52.251.94.56"sv); + } +}