From db3f1180464eefc841d2eccc5a6b441398dd164d Mon Sep 17 00:00:00 2001 From: Shannon Booth Date: Wed, 7 Aug 2024 02:47:46 +1200 Subject: [PATCH] LibURL: Fix heuristic for URL domain parsing IDNA fast path Our heuristic was a bit too simplistic and would not run through the Unicode ToASCII algorithm, which performs some extra validation. This would cause invalid URLs that should fail to parse to be mistakenly accepted. This fixes 8 tests in: https://wpt.live/url/url-constructor.any.html --- .../LibWeb/Text/expected/URL/invalid-urls.txt | 2 ++ Tests/LibWeb/Text/input/URL/invalid-urls.html | 21 +++++++++++++++++++ Userland/Libraries/LibURL/Parser.cpp | 20 ++++++++++++++---- 3 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 Tests/LibWeb/Text/expected/URL/invalid-urls.txt create mode 100644 Tests/LibWeb/Text/input/URL/invalid-urls.html diff --git a/Tests/LibWeb/Text/expected/URL/invalid-urls.txt b/Tests/LibWeb/Text/expected/URL/invalid-urls.txt new file mode 100644 index 00000000000..7ffbe3d1b7b --- /dev/null +++ b/Tests/LibWeb/Text/expected/URL/invalid-urls.txt @@ -0,0 +1,2 @@ +new URL('file://xn--/p', undefined) +error creating URL: 'TypeError: Invalid URL' diff --git a/Tests/LibWeb/Text/input/URL/invalid-urls.html b/Tests/LibWeb/Text/input/URL/invalid-urls.html new file mode 100644 index 00000000000..b8f73c9fece --- /dev/null +++ b/Tests/LibWeb/Text/input/URL/invalid-urls.html @@ -0,0 +1,21 @@ + + diff --git a/Userland/Libraries/LibURL/Parser.cpp b/Userland/Libraries/LibURL/Parser.cpp index 36a67998c39..6a9aa801b86 100644 --- a/Userland/Libraries/LibURL/Parser.cpp +++ b/Userland/Libraries/LibURL/Parser.cpp @@ -596,14 +596,26 @@ static ErrorOr domain_to_ascii(StringView domain, bool be_strict) // 1. 
Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46] // 2. If result is a failure value, domain-to-ASCII validation error, return failure. - // OPTIMIZATION: Fast path for all-ASCII domain strings. - if (all_of(domain, is_ascii)) { + // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.) + // does not produce any item that starts with an ASCII case-insensitive match for "xn--", this + // step is equivalent to ASCII lowercasing domain. + if (!be_strict && all_of(domain, is_ascii)) { // 3. If result is the empty string, domain-to-ASCII validation error, return failure. if (domain.is_empty()) return Error::from_string_literal("Empty domain"); - auto lowercase_domain = domain.to_lowercase_string(); - return String::from_utf8_without_validation(lowercase_domain.bytes()); + bool slow_path = false; + for (auto part : domain.split_view('.')) { + if (part.starts_with("xn--"sv, CaseSensitivity::CaseInsensitive)) { + slow_path = true; + break; + } + } + + if (!slow_path) { + auto lowercase_domain = domain.to_lowercase_string(); + return String::from_utf8_without_validation(lowercase_domain.bytes()); + } } Unicode::IDNA::ToAsciiOptions const options {