LibURL: Fix heuristic for URL domain parsing IDNA fast path

Our heuristic was a bit too simplistic and would not run through the Unicode ToASCII algorithm, which performs some extra validation. This would cause invalid URLs that should fail to parse to be mistakenly accepted. This fixes 8 tests in: https://wpt.live/url/url-constructor.any.html
Author: https://github.com/shannonbooth Commit: https://github.com/LadybirdBrowser/ladybird/commit/db3f1180464 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/987 Reviewed-by: https://github.com/tcl3 ✅
2024-11-22 07:30:19 +00:00 · 2024-08-07 02:47:46 +12:00 · 2024-08-07 02:47:46 +12:00 · db3f118046 · 2024-08-06 22:09:08 +00:00
commit db3f118046
parent fd4e943e12
3 changed files with 39 additions and 4 deletions
--- a/Tests/LibWeb/Text/expected/URL/invalid-urls.txt
+++ b/Tests/LibWeb/Text/expected/URL/invalid-urls.txt
@ -0,0 +1,2 @@
 new URL('file://xn--/p', undefined)
 error creating URL: 'TypeError: Invalid URL'
--- a/Tests/LibWeb/Text/input/URL/invalid-urls.html
+++ b/Tests/LibWeb/Text/input/URL/invalid-urls.html
@ -0,0 +1,21 @@
 <script src="../include.js"></script>
 <script>
    test(() => {
        const urls = [
            { input: 'file://xn--/p' },
        ];
        for (url of urls) {
            if (url.base === undefined)
                println(`new URL('${url.input}', ${url.base})`);
            else
                println(`new URL('${url.input}', '${url.base}')`);
            try {
                new URL(url.input, url.base);
            } catch (e) {
                println(`error creating URL: '${e}'`);
            }
        }
    });
 </script>
--- a/Userland/Libraries/LibURL/Parser.cpp
+++ b/Userland/Libraries/LibURL/Parser.cpp
@ -596,14 +596,26 @@ static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
-    // OPTIMIZATION: Fast path for all-ASCII domain strings.
+    // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
-    if (all_of(domain, is_ascii)) {
+    //               does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
    //               step is equivalent to ASCII lowercasing domain.
    if (!be_strict && all_of(domain, is_ascii)) {
        // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
        if (domain.is_empty())
            return Error::from_string_literal("Empty domain");
-        auto lowercase_domain = domain.to_lowercase_string();
+        bool slow_path = false;
-        return String::from_utf8_without_validation(lowercase_domain.bytes());
+        for (auto part : domain.split_view('.')) {
            if (part.starts_with("xn--"sv, CaseSensitivity::CaseInsensitive)) {
                slow_path = true;
                break;
            }
        }
        if (!slow_path) {
            auto lowercase_domain = domain.to_lowercase_string();
            return String::from_utf8_without_validation(lowercase_domain.bytes());
        }
    }
    Unicode::IDNA::ToAsciiOptions const options {
		`@ -0,0 +1,2 @@`
							`new URL('file://xn--/p', undefined)`
							`error creating URL: 'TypeError: Invalid URL'`