LibURL: Fix heuristic for URL domain parsing IDNA fast path

Our heuristic was a bit too simplistic and would not run through the
ToASCII unicode algorithm which performs some extra validation. This
would cause invalid URLs, which should fail to parse, to be mistakenly
accepted.

This fixes 8 tests in: https://wpt.live/url/url-constructor.any.html
This commit is contained in:
Shannon Booth 2024-08-07 02:47:46 +12:00 committed by Tim Ledbetter
parent fd4e943e12
commit db3f118046
Notes: github-actions[bot] 2024-08-06 22:09:08 +00:00
3 changed files with 39 additions and 4 deletions

View file

@ -0,0 +1,2 @@
new URL('file://xn--/p', undefined)
error creating URL: 'TypeError: Invalid URL'

View file

@ -0,0 +1,21 @@
<script src="../include.js"></script>
<script>
    // Regression test for URL host parsing: each entry is passed to the URL
    // constructor, and the call plus any resulting error is printed so the
    // output can be compared against the expected-output file.
    test(() => {
        // Each case has an `input` and an optional `base`; a missing `base`
        // is passed through to the constructor as `undefined`.
        const urls = [
            // Host label consisting only of the "xn--" prefix; expected to be
            // rejected as an invalid URL.
            { input: 'file://xn--/p' },
        ];

        // `const` keeps the loop variable block-scoped instead of leaking an
        // implicit global (which would also throw in strict/module code).
        for (const url of urls) {
            // An absent base is printed as the bare word `undefined`; a
            // present base is printed in single quotes.
            const printedBase = url.base === undefined ? 'undefined' : `'${url.base}'`;
            println(`new URL('${url.input}', ${printedBase})`);

            try {
                new URL(url.input, url.base);
            } catch (e) {
                // Only failures are reported; a successful parse prints nothing more.
                println(`error creating URL: '${e}'`);
            }
        }
    });
</script>

View file

@ -596,14 +596,26 @@ static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
// 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46] // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
// 2. If result is a failure value, domain-to-ASCII validation error, return failure. // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
// OPTIMIZATION: Fast path for all-ASCII domain strings. // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
if (all_of(domain, is_ascii)) { // does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
// step is equivalent to ASCII lowercasing domain.
if (!be_strict && all_of(domain, is_ascii)) {
// 3. If result is the empty string, domain-to-ASCII validation error, return failure. // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
if (domain.is_empty()) if (domain.is_empty())
return Error::from_string_literal("Empty domain"); return Error::from_string_literal("Empty domain");
auto lowercase_domain = domain.to_lowercase_string(); bool slow_path = false;
return String::from_utf8_without_validation(lowercase_domain.bytes()); for (auto part : domain.split_view('.')) {
if (part.starts_with("xn--"sv, CaseSensitivity::CaseInsensitive)) {
slow_path = true;
break;
}
}
if (!slow_path) {
auto lowercase_domain = domain.to_lowercase_string();
return String::from_utf8_without_validation(lowercase_domain.bytes());
}
} }
Unicode::IDNA::ToAsciiOptions const options { Unicode::IDNA::ToAsciiOptions const options {