LibURL+LibUnicode+LibWebView: Handle punycode directly in LibURL

We had defined punycode handling in LibUnicode when LibURL (AK at the
time) was unable to depend on LibUnicode. This is no longer the case.
This commit is contained in:
Timothy Flynn 2024-03-25 10:57:36 -04:00 committed by Tim Flynn
parent 5ddfcfd07e
commit 576c2f4f4d
Notes: sideshowbarker 2024-07-17 01:53:23 +09:00
9 changed files with 37 additions and 106 deletions

View file

@ -357,6 +357,7 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "BSD$" OR HAIKU)
target_link_libraries(AK PRIVATE execinfo)
endif()
add_serenity_subdirectory(Userland/Libraries/LibUnicode)
add_serenity_subdirectory(Userland/Libraries/LibURL)
# LibCore
@ -481,7 +482,6 @@ if (BUILD_LAGOM)
TextCodec
Threading
TLS
Unicode
Video
Wasm
WebSocket

View file

@ -8,5 +8,8 @@ shared_library("LibURL") {
"URL.cpp",
"URL.h",
]
deps = [ "//AK" ]
deps = [
"//AK",
"//Userland/Libraries/LibUnicode",
]
}

View file

@ -177,13 +177,9 @@ source_set("LibUnicode") {
"Punycode.cpp",
"Segmentation.cpp",
"String.cpp",
"URL.cpp",
"UnicodeUtils.cpp",
]
deps = [
"//AK",
"//Userland/Libraries/LibURL",
]
deps = [ "//AK" ]
if (enable_unicode_database_download) {
deps += [

View file

@ -4,3 +4,4 @@ set(SOURCES
)
serenity_lib(LibURL url)
target_link_libraries(LibURL PRIVATE LibUnicode)

View file

@ -15,6 +15,7 @@
#include <AK/StringUtils.h>
#include <AK/Utf8View.h>
#include <LibURL/Parser.h>
#include <LibUnicode/IDNA.h>
namespace URL {
@ -574,6 +575,29 @@ static bool ends_in_a_number_checker(StringView input)
return false;
}
// https://url.spec.whatwg.org/#concept-domain-to-ascii
// Runs Unicode ToASCII on `domain`. `be_strict` switches both UseSTD3ASCIIRules and VerifyDnsLength.
// Returns an error if ToASCII fails or if it yields an empty string.
static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
{
    // The two options below are the only ones that follow beStrict; everything else is fixed by the spec.
    auto const std3_ascii_rules = be_strict ? Unicode::IDNA::UseStd3AsciiRules::Yes : Unicode::IDNA::UseStd3AsciiRules::No;
    auto const dns_length_check = be_strict ? Unicode::IDNA::VerifyDnsLength::Yes : Unicode::IDNA::VerifyDnsLength::No;

    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
    Unicode::IDNA::ToAsciiOptions const options {
        Unicode::IDNA::CheckHyphens::No,
        Unicode::IDNA::CheckBidi::Yes,
        Unicode::IDNA::CheckJoiners::Yes,
        std3_ascii_rules,
        Unicode::IDNA::TransitionalProcessing::No,
        dns_length_check
    };

    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
    auto ascii_domain = TRY(Unicode::IDNA::to_ascii(Utf8View(domain), options));

    // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
    if (ascii_domain.is_empty())
        return Error::from_string_literal("Empty domain");

    // 4. Return result.
    return ascii_domain;
}
// https://url.spec.whatwg.org/#concept-host-parser
// NOTE: This is a very bare-bones implementation.
static Optional<Host> parse_host(StringView input, bool is_opaque = false)
@ -603,10 +627,10 @@ static Optional<Host> parse_host(StringView input, bool is_opaque = false)
// FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
auto domain = percent_decode(input);
// NOTE: This is handled in Unicode::create_unicode_url, to work around the fact that we can't call into LibUnicode here
// FIXME: 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
// FIXME: 6. If asciiDomain is failure, then return failure.
auto ascii_domain_or_error = String::from_byte_string(domain);
// 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
auto ascii_domain_or_error = domain_to_ascii(domain, false);
// 6. If asciiDomain is failure, then return failure.
if (ascii_domain_or_error.is_error())
return {};

View file

@ -10,12 +10,10 @@ set(SOURCES
Segmentation.cpp
String.cpp
UnicodeUtils.cpp
URL.cpp
${UNICODE_DATA_SOURCES}
)
set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
serenity_lib(LibUnicode unicode)
target_link_libraries(LibUnicode PRIVATE LibURL)
target_compile_definitions(LibUnicode PRIVATE ENABLE_UNICODE_DATA=$<BOOL:${ENABLE_UNICODE_DATABASE_DOWNLOAD}>)

View file

@ -1,57 +0,0 @@
/*
* Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibUnicode/IDNA.h>
#include <LibUnicode/URL.h>
namespace Unicode {
// https://url.spec.whatwg.org/#concept-domain-to-ascii
// Maps `domain` to ASCII through IDNA ToASCII. `be_strict` controls UseSTD3ASCIIRules and VerifyDnsLength.
// A ToASCII failure or an empty result is reported as an error.
static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
{
    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
    IDNA::ToAsciiOptions const to_ascii_options {
        IDNA::CheckHyphens::No,
        IDNA::CheckBidi::Yes,
        IDNA::CheckJoiners::Yes,
        be_strict ? IDNA::UseStd3AsciiRules::Yes : IDNA::UseStd3AsciiRules::No,
        IDNA::TransitionalProcessing::No,
        be_strict ? IDNA::VerifyDnsLength::Yes : IDNA::VerifyDnsLength::No
    };
    Utf8View const domain_view(domain);
    auto converted = TRY(IDNA::to_ascii(domain_view, to_ascii_options));

    // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
    if (converted.is_empty())
        return Error::from_string_literal("Empty domain");

    // 4. Return result.
    return converted;
}
// https://url.spec.whatwg.org/#concept-host-parser
// Builds a URL from `url_string` and, when its host is a non-empty domain string, replaces that
// host with its domain-to-ASCII form. Any other URL (invalid, non-string host, empty host) is
// returned exactly as parsed. A failed ASCII conversion is propagated as an error.
ErrorOr<URL::URL> create_unicode_url(String const& url_string)
{
    // NOTE: 1.-4. are implemented in URL::Parser::parse_host
    URL::URL url = url_string;

    // Only valid URLs whose host is a domain string are candidates for conversion.
    auto const has_string_host = url.is_valid() && url.host().has<String>();
    if (!has_string_host)
        return url;

    auto& host_name = url.host().get<String>();
    if (!host_name.is_empty()) {
        // 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
        // 6. If asciiDomain is failure, then return failure.
        auto ascii_host = TRY(domain_to_ascii(host_name.bytes_as_string_view(), false));

        // FIXME: Reimplement 7. or call into URL::Parser::parse_host using ascii_domain (8. & 9. do not apply)
        url.set_host(ascii_host);
    }

    return url;
}
}

View file

@ -1,16 +0,0 @@
/*
* Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/String.h>
#include <LibURL/URL.h>
namespace Unicode {
// Creates a URL::URL from the given string. If the resulting URL is valid and its host is a
// non-empty domain string, that host is replaced with its domain-to-ASCII form
// (https://url.spec.whatwg.org/#concept-domain-to-ascii); otherwise the URL is returned as parsed.
// Returns an error when the domain-to-ASCII conversion fails.
ErrorOr<URL::URL> create_unicode_url(String const&);
}

View file

@ -5,11 +5,9 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/LexicalPath.h>
#include <AK/String.h>
#include <LibCore/System.h>
#include <LibFileSystem/FileSystem.h>
#include <LibUnicode/URL.h>
#include <LibWebView/URL.h>
#if defined(ENABLE_PUBLIC_SUFFIX)
@ -18,32 +16,16 @@
namespace WebView {
// Interprets `url_or_path` first as a URL (with its domain converted to ASCII), and falls back to
// treating it as a filesystem path turned into a file-scheme URL. Yields an empty Optional when
// neither interpretation produces a valid URL.
static Optional<URL::URL> create_url_with_url_or_path(String const& url_or_path)
{
    // First attempt: the input is already a URL.
    if (auto parsed_url = Unicode::create_unicode_url(url_or_path); !parsed_url.is_error() && parsed_url.value().is_valid())
        return parsed_url.release_value();

    // Second attempt: canonicalize the input as a path and wrap it in a file-scheme URL.
    auto canonical_path = LexicalPath::canonicalized_path(url_or_path.to_byte_string());
    auto file_url = URL::create_with_file_scheme(canonical_path);
    if (!file_url.is_valid())
        return {};

    return file_url;
}
static Optional<URL::URL> query_public_suffix_list(StringView url_string)
{
auto out = MUST(String::from_utf8(url_string));
if (!out.starts_with_bytes("about:"sv) && !out.contains("://"sv))
out = MUST(String::formatted("https://{}"sv, out));
auto maybe_url = create_url_with_url_or_path(out);
if (!maybe_url.has_value())
auto url = URL::create_with_url_or_path(out.to_byte_string());
if (!url.is_valid())
return {};
auto url = maybe_url.release_value();
if (url.host().has<URL::IPv4Address>() || url.host().has<URL::IPv6Address>())
return url;