mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 00:50:22 +00:00
LibUnicode: Replace code point bidirectional classes with ICU
This commit is contained in:
parent
e77dafc987
commit
aa3a30870b
Notes:
sideshowbarker
2024-07-17 02:37:08 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/aa3a30870b Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/239
8 changed files with 102 additions and 348 deletions
|
@ -10,9 +10,6 @@ set(UCD_VERSION_FILE "${UCD_PATH}/version.txt")
|
||||||
set(UCD_ZIP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/UCD.zip")
|
set(UCD_ZIP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/UCD.zip")
|
||||||
set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip")
|
set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip")
|
||||||
|
|
||||||
set(UNICODE_DATA_SOURCE "UnicodeData.txt")
|
|
||||||
set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}")
|
|
||||||
|
|
||||||
string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
|
string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}")
|
||||||
set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
|
set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt")
|
||||||
set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
|
set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt")
|
||||||
|
@ -26,18 +23,12 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
|
|
||||||
if (ENABLE_NETWORK_DOWNLOADS)
|
if (ENABLE_NETWORK_DOWNLOADS)
|
||||||
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
|
download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}")
|
||||||
extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}")
|
|
||||||
|
|
||||||
download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
|
download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Skipping download of ${UCD_ZIP_URL}, expecting the archive to have been extracted to ${UCD_ZIP_PATH}")
|
message(STATUS "Skipping download of ${UCD_ZIP_URL}, expecting the archive to have been extracted to ${UCD_ZIP_PATH}")
|
||||||
message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the file to be at ${EMOJI_TEST_PATH}")
|
message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the file to be at ${EMOJI_TEST_PATH}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
set(UNICODE_DATA_HEADER UnicodeData.h)
|
|
||||||
set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp)
|
|
||||||
|
|
||||||
set(EMOJI_DATA_HEADER EmojiData.h)
|
set(EMOJI_DATA_HEADER EmojiData.h)
|
||||||
set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp)
|
set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp)
|
||||||
|
|
||||||
|
@ -45,14 +36,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}")
|
set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
invoke_generator(
|
|
||||||
"UnicodeData"
|
|
||||||
Lagom::GenerateUnicodeData
|
|
||||||
"${UCD_VERSION_FILE}"
|
|
||||||
"${UNICODE_DATA_HEADER}"
|
|
||||||
"${UNICODE_DATA_IMPLEMENTATION}"
|
|
||||||
arguments -u "${UNICODE_DATA_PATH}"
|
|
||||||
)
|
|
||||||
invoke_generator(
|
invoke_generator(
|
||||||
"EmojiData"
|
"EmojiData"
|
||||||
Lagom::GenerateEmojiData
|
Lagom::GenerateEmojiData
|
||||||
|
@ -69,8 +52,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
)
|
)
|
||||||
|
|
||||||
set(UNICODE_DATA_SOURCES
|
set(UNICODE_DATA_SOURCES
|
||||||
${UNICODE_DATA_HEADER}
|
|
||||||
${UNICODE_DATA_IMPLEMENTATION}
|
|
||||||
${EMOJI_DATA_HEADER}
|
${EMOJI_DATA_HEADER}
|
||||||
${EMOJI_DATA_IMPLEMENTATION}
|
${EMOJI_DATA_IMPLEMENTATION}
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,2 +1 @@
|
||||||
lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain)
|
|
||||||
lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain)
|
lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain)
|
||||||
|
|
|
@ -1,291 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "GeneratorUtil.h"
|
|
||||||
#include <AK/AllOf.h>
|
|
||||||
#include <AK/Array.h>
|
|
||||||
#include <AK/ByteString.h>
|
|
||||||
#include <AK/CharacterTypes.h>
|
|
||||||
#include <AK/Error.h>
|
|
||||||
#include <AK/Find.h>
|
|
||||||
#include <AK/HashMap.h>
|
|
||||||
#include <AK/Optional.h>
|
|
||||||
#include <AK/QuickSort.h>
|
|
||||||
#include <AK/SourceGenerator.h>
|
|
||||||
#include <AK/StringUtils.h>
|
|
||||||
#include <AK/Types.h>
|
|
||||||
#include <AK/Vector.h>
|
|
||||||
#include <LibCore/ArgsParser.h>
|
|
||||||
#include <LibUnicode/CharacterTypes.h>
|
|
||||||
|
|
||||||
// https://www.unicode.org/reports/tr44/#PropList.txt
|
|
||||||
using PropList = HashMap<ByteString, Vector<Unicode::CodePointRange>>;
|
|
||||||
|
|
||||||
// https://www.unicode.org/reports/tr44/#UnicodeData.txt
|
|
||||||
struct CodePointData {
|
|
||||||
u32 code_point { 0 };
|
|
||||||
ByteString name;
|
|
||||||
ByteString bidi_class;
|
|
||||||
Optional<i8> numeric_value_decimal;
|
|
||||||
Optional<i8> numeric_value_digit;
|
|
||||||
Optional<i8> numeric_value_numeric;
|
|
||||||
bool bidi_mirrored { false };
|
|
||||||
ByteString unicode_1_name;
|
|
||||||
ByteString iso_comment;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct CodePointBidiClass {
|
|
||||||
Unicode::CodePointRange code_point_range;
|
|
||||||
ByteString bidi_class;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct UnicodeData {
|
|
||||||
Vector<CodePointData> code_point_data;
|
|
||||||
|
|
||||||
HashTable<ByteString> bidirectional_classes;
|
|
||||||
Vector<CodePointBidiClass> code_point_bidirectional_classes;
|
|
||||||
};
|
|
||||||
|
|
||||||
static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
|
||||||
{
|
|
||||||
Optional<u32> code_point_range_start;
|
|
||||||
Array<u8, 1024> buffer;
|
|
||||||
|
|
||||||
while (TRY(file.can_read_line())) {
|
|
||||||
auto line = TRY(file.read_line(buffer));
|
|
||||||
|
|
||||||
if (line.is_empty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
|
|
||||||
VERIFY(segments.size() == 15);
|
|
||||||
|
|
||||||
CodePointData data {};
|
|
||||||
data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
|
|
||||||
data.name = segments[1];
|
|
||||||
data.bidi_class = segments[4];
|
|
||||||
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
|
|
||||||
data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
|
|
||||||
data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
|
|
||||||
data.bidi_mirrored = segments[9] == "Y"sv;
|
|
||||||
data.unicode_1_name = segments[10];
|
|
||||||
data.iso_comment = segments[11];
|
|
||||||
|
|
||||||
if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) {
|
|
||||||
VERIFY(!code_point_range_start.has_value());
|
|
||||||
code_point_range_start = data.code_point;
|
|
||||||
|
|
||||||
data.name = data.name.substring(1, data.name.length() - 9);
|
|
||||||
} else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) {
|
|
||||||
VERIFY(code_point_range_start.has_value());
|
|
||||||
|
|
||||||
Unicode::CodePointRange code_point_range { *code_point_range_start, data.code_point };
|
|
||||||
|
|
||||||
data.name = data.name.substring(1, data.name.length() - 8);
|
|
||||||
code_point_range_start.clear();
|
|
||||||
|
|
||||||
unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class });
|
|
||||||
} else {
|
|
||||||
unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class });
|
|
||||||
}
|
|
||||||
|
|
||||||
unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep);
|
|
||||||
unicode_data.code_point_data.append(move(data));
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
static ErrorOr<void> generate_unicode_data_header(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
|
||||||
{
|
|
||||||
StringBuilder builder;
|
|
||||||
SourceGenerator generator { builder };
|
|
||||||
|
|
||||||
auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
|
|
||||||
quick_sort(values);
|
|
||||||
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
|
|
||||||
|
|
||||||
generator.set("name", name);
|
|
||||||
generator.set("underlying", ByteString::formatted("{}UnderlyingType", name));
|
|
||||||
generator.set("type", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv);
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
using @underlying@ = @type@;
|
|
||||||
|
|
||||||
enum class @name@ : @underlying@ {)~~~");
|
|
||||||
|
|
||||||
if (!default_.is_empty()) {
|
|
||||||
generator.set("default", default_);
|
|
||||||
generator.append(R"~~~(
|
|
||||||
@default@,)~~~");
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto const& value : values) {
|
|
||||||
generator.set("value", value);
|
|
||||||
generator.append(R"~~~(
|
|
||||||
@value@,)~~~");
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto const& alias : aliases) {
|
|
||||||
generator.set("alias", alias.alias);
|
|
||||||
generator.set("value", alias.name);
|
|
||||||
generator.append(R"~~~(
|
|
||||||
@alias@ = @value@,)~~~");
|
|
||||||
}
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
};
|
|
||||||
)~~~");
|
|
||||||
};
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <AK/Types.h>
|
|
||||||
#include <LibUnicode/Forward.h>
|
|
||||||
|
|
||||||
namespace Unicode {
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values());
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
}
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
TRY(file.write_until_depleted(generator.as_string_view().bytes()));
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
static ErrorOr<void> generate_unicode_data_implementation(Core::InputBufferedFile& file, UnicodeData const& unicode_data)
|
|
||||||
{
|
|
||||||
StringBuilder builder;
|
|
||||||
SourceGenerator generator { builder };
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
#include <AK/Array.h>
|
|
||||||
#include <AK/BinarySearch.h>
|
|
||||||
#include <AK/CharacterTypes.h>
|
|
||||||
#include <AK/Optional.h>
|
|
||||||
#include <AK/Span.h>
|
|
||||||
#include <AK/ByteString.h>
|
|
||||||
#include <AK/StringView.h>
|
|
||||||
#include <LibUnicode/CharacterTypes.h>
|
|
||||||
#include <LibUnicode/UnicodeData.h>
|
|
||||||
|
|
||||||
namespace Unicode {
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
struct BidiClassData {
|
|
||||||
CodePointRange code_point_range {};
|
|
||||||
BidirectionalClass bidi_class {};
|
|
||||||
};
|
|
||||||
|
|
||||||
struct CodePointBidiClassComparator : public CodePointRangeComparator {
|
|
||||||
constexpr int operator()(u32 code_point, BidiClassData const& bidi_class)
|
|
||||||
{
|
|
||||||
return CodePointRangeComparator::operator()(code_point, bidi_class.code_point_range);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
{
|
|
||||||
constexpr size_t max_bidi_classes_per_row = 20;
|
|
||||||
size_t bidi_classes_in_current_row = 0;
|
|
||||||
|
|
||||||
generator.set("size"sv, ByteString::number(unicode_data.code_point_bidirectional_classes.size()));
|
|
||||||
generator.append(R"~~~(
|
|
||||||
static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
|
|
||||||
)~~~");
|
|
||||||
for (auto const& data : unicode_data.code_point_bidirectional_classes) {
|
|
||||||
if (bidi_classes_in_current_row++ > 0)
|
|
||||||
generator.append(", ");
|
|
||||||
|
|
||||||
generator.set("first", ByteString::formatted("{:#x}", data.code_point_range.first));
|
|
||||||
generator.set("last", ByteString::formatted("{:#x}", data.code_point_range.last));
|
|
||||||
generator.set("bidi_class", data.bidi_class);
|
|
||||||
generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }");
|
|
||||||
|
|
||||||
if (bidi_classes_in_current_row == max_bidi_classes_per_row) {
|
|
||||||
bidi_classes_in_current_row = 0;
|
|
||||||
generator.append(",\n ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
generator.append(R"~~~(
|
|
||||||
} };
|
|
||||||
)~~~");
|
|
||||||
}
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
|
|
||||||
{
|
|
||||||
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
|
|
||||||
return entry->bidi_class;
|
|
||||||
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector<Alias> const& aliases) -> ErrorOr<void> {
|
|
||||||
HashValueMap<StringView> hashes;
|
|
||||||
TRY(hashes.try_ensure_capacity(prop_list.size() + aliases.size()));
|
|
||||||
|
|
||||||
ValueFromStringOptions options {};
|
|
||||||
|
|
||||||
for (auto const& prop : prop_list) {
|
|
||||||
if constexpr (IsSame<RemoveCVReference<decltype(prop)>, ByteString>) {
|
|
||||||
hashes.set(CaseInsensitiveASCIIStringViewTraits::hash(prop), prop);
|
|
||||||
options.sensitivity = CaseSensitivity::CaseInsensitive;
|
|
||||||
} else {
|
|
||||||
hashes.set(prop.key.hash(), prop.key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto const& alias : aliases)
|
|
||||||
hashes.set(alias.alias.hash(), alias.alias);
|
|
||||||
|
|
||||||
generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes), options);
|
|
||||||
|
|
||||||
return {};
|
|
||||||
};
|
|
||||||
|
|
||||||
TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
|
||||||
}
|
|
||||||
)~~~");
|
|
||||||
|
|
||||||
TRY(file.write_until_depleted(generator.as_string_view().bytes()));
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|
||||||
{
|
|
||||||
StringView generated_header_path;
|
|
||||||
StringView generated_implementation_path;
|
|
||||||
StringView unicode_data_path;
|
|
||||||
|
|
||||||
Core::ArgsParser args_parser;
|
|
||||||
args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
|
|
||||||
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
|
|
||||||
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
|
|
||||||
args_parser.parse(arguments);
|
|
||||||
|
|
||||||
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
|
|
||||||
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
|
|
||||||
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
|
|
||||||
|
|
||||||
UnicodeData unicode_data {};
|
|
||||||
|
|
||||||
TRY(parse_unicode_data(*unicode_data_file, unicode_data));
|
|
||||||
|
|
||||||
TRY(generate_unicode_data_header(*generated_header_file, unicode_data));
|
|
||||||
TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data));
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
|
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -307,25 +307,13 @@ TEST_CASE(script_extension)
|
||||||
|
|
||||||
TEST_CASE(code_point_bidirectional_character_type)
|
TEST_CASE(code_point_bidirectional_character_type)
|
||||||
{
|
{
|
||||||
auto code_point_bidi_class = [](u32 code_point) {
|
|
||||||
auto bidi_class = Unicode::bidirectional_class(code_point);
|
|
||||||
VERIFY(bidi_class.has_value());
|
|
||||||
return bidi_class.release_value();
|
|
||||||
};
|
|
||||||
|
|
||||||
auto bidi_class_from_string = [](StringView name) {
|
|
||||||
auto result = Unicode::bidirectional_class_from_string(name);
|
|
||||||
VERIFY(result.has_value());
|
|
||||||
return result.release_value();
|
|
||||||
};
|
|
||||||
|
|
||||||
// Left-to-right
|
// Left-to-right
|
||||||
EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv));
|
EXPECT_EQ(Unicode::bidirectional_class('A'), Unicode::BidiClass::LeftToRight);
|
||||||
EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv));
|
EXPECT_EQ(Unicode::bidirectional_class('z'), Unicode::BidiClass::LeftToRight);
|
||||||
// European number
|
// European number
|
||||||
EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv));
|
EXPECT_EQ(Unicode::bidirectional_class('7'), Unicode::BidiClass::EuropeanNumber);
|
||||||
// Whitespace
|
// Whitespace
|
||||||
EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv));
|
EXPECT_EQ(Unicode::bidirectional_class(' '), Unicode::BidiClass::WhiteSpaceNeutral);
|
||||||
// Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM)
|
// Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM)
|
||||||
EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv));
|
EXPECT_EQ(Unicode::bidirectional_class(0xFEB4), Unicode::BidiClass::RightToLeftArabic);
|
||||||
}
|
}
|
||||||
|
|
|
@ -310,7 +310,67 @@ bool code_point_has_script_extension(u32 code_point, Script script)
|
||||||
return static_cast<bool>(uscript_hasScript(icu_code_point, icu_script));
|
return static_cast<bool>(uscript_hasScript(icu_code_point, icu_script));
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
|
static constexpr BidiClass char_direction_to_bidi_class(UCharDirection direction)
|
||||||
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
|
{
|
||||||
|
switch (direction) {
|
||||||
|
case U_ARABIC_NUMBER:
|
||||||
|
return BidiClass::ArabicNumber;
|
||||||
|
case U_BLOCK_SEPARATOR:
|
||||||
|
return BidiClass::BlockSeparator;
|
||||||
|
case U_BOUNDARY_NEUTRAL:
|
||||||
|
return BidiClass::BoundaryNeutral;
|
||||||
|
case U_COMMON_NUMBER_SEPARATOR:
|
||||||
|
return BidiClass::CommonNumberSeparator;
|
||||||
|
case U_DIR_NON_SPACING_MARK:
|
||||||
|
return BidiClass::DirNonSpacingMark;
|
||||||
|
case U_EUROPEAN_NUMBER:
|
||||||
|
return BidiClass::EuropeanNumber;
|
||||||
|
case U_EUROPEAN_NUMBER_SEPARATOR:
|
||||||
|
return BidiClass::EuropeanNumberSeparator;
|
||||||
|
case U_EUROPEAN_NUMBER_TERMINATOR:
|
||||||
|
return BidiClass::EuropeanNumberTerminator;
|
||||||
|
case U_FIRST_STRONG_ISOLATE:
|
||||||
|
return BidiClass::FirstStrongIsolate;
|
||||||
|
case U_LEFT_TO_RIGHT:
|
||||||
|
return BidiClass::LeftToRight;
|
||||||
|
case U_LEFT_TO_RIGHT_EMBEDDING:
|
||||||
|
return BidiClass::LeftToRightEmbedding;
|
||||||
|
case U_LEFT_TO_RIGHT_ISOLATE:
|
||||||
|
return BidiClass::LeftToRightIsolate;
|
||||||
|
case U_LEFT_TO_RIGHT_OVERRIDE:
|
||||||
|
return BidiClass::LeftToRightOverride;
|
||||||
|
case U_OTHER_NEUTRAL:
|
||||||
|
return BidiClass::OtherNeutral;
|
||||||
|
case U_POP_DIRECTIONAL_FORMAT:
|
||||||
|
return BidiClass::PopDirectionalFormat;
|
||||||
|
case U_POP_DIRECTIONAL_ISOLATE:
|
||||||
|
return BidiClass::PopDirectionalIsolate;
|
||||||
|
case U_RIGHT_TO_LEFT:
|
||||||
|
return BidiClass::RightToLeft;
|
||||||
|
case U_RIGHT_TO_LEFT_ARABIC:
|
||||||
|
return BidiClass::RightToLeftArabic;
|
||||||
|
case U_RIGHT_TO_LEFT_EMBEDDING:
|
||||||
|
return BidiClass::RightToLeftEmbedding;
|
||||||
|
case U_RIGHT_TO_LEFT_ISOLATE:
|
||||||
|
return BidiClass::RightToLeftIsolate;
|
||||||
|
case U_RIGHT_TO_LEFT_OVERRIDE:
|
||||||
|
return BidiClass::RightToLeftOverride;
|
||||||
|
case U_SEGMENT_SEPARATOR:
|
||||||
|
return BidiClass::SegmentSeparator;
|
||||||
|
case U_WHITE_SPACE_NEUTRAL:
|
||||||
|
return BidiClass::WhiteSpaceNeutral;
|
||||||
|
case U_CHAR_DIRECTION_COUNT:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
|
BidiClass bidirectional_class(u32 code_point)
|
||||||
|
{
|
||||||
|
auto icu_code_point = static_cast<UChar32>(code_point);
|
||||||
|
|
||||||
|
auto direction = u_charDirection(icu_code_point);
|
||||||
|
return char_direction_to_bidi_class(direction);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,7 +49,32 @@ Optional<Script> script_from_string(StringView);
|
||||||
bool code_point_has_script(u32 code_point, Script script);
|
bool code_point_has_script(u32 code_point, Script script);
|
||||||
bool code_point_has_script_extension(u32 code_point, Script script);
|
bool code_point_has_script_extension(u32 code_point, Script script);
|
||||||
|
|
||||||
Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
|
enum class BidiClass {
|
||||||
Optional<BidirectionalClass> bidirectional_class(u32 code_point);
|
ArabicNumber, // AN
|
||||||
|
BlockSeparator, // B
|
||||||
|
BoundaryNeutral, // BN
|
||||||
|
CommonNumberSeparator, // CS
|
||||||
|
DirNonSpacingMark, // NSM
|
||||||
|
EuropeanNumber, // EN
|
||||||
|
EuropeanNumberSeparator, // ES
|
||||||
|
EuropeanNumberTerminator, // ET
|
||||||
|
FirstStrongIsolate, // FSI
|
||||||
|
LeftToRight, // L
|
||||||
|
LeftToRightEmbedding, // LRE
|
||||||
|
LeftToRightIsolate, // LRI
|
||||||
|
LeftToRightOverride, // LRO
|
||||||
|
OtherNeutral, // ON
|
||||||
|
PopDirectionalFormat, // PDF
|
||||||
|
PopDirectionalIsolate, // PDI
|
||||||
|
RightToLeft, // R
|
||||||
|
RightToLeftArabic, // AL
|
||||||
|
RightToLeftEmbedding, // RLE
|
||||||
|
RightToLeftIsolate, // RLI
|
||||||
|
RightToLeftOverride, // RLO
|
||||||
|
SegmentSeparator, // S
|
||||||
|
WhiteSpaceNeutral, // WS
|
||||||
|
};
|
||||||
|
|
||||||
|
BidiClass bidirectional_class(u32 code_point);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
|
|
||||||
namespace Unicode {
|
namespace Unicode {
|
||||||
|
|
||||||
enum class BidirectionalClass : u8;
|
enum class BidiClass;
|
||||||
enum class EmojiGroup : u8;
|
enum class EmojiGroup : u8;
|
||||||
|
|
||||||
struct CurrencyCode;
|
struct CurrencyCode;
|
||||||
|
|
|
@ -2338,22 +2338,14 @@ bool Element::is_auto_directionality_form_associated_element() const
|
||||||
// https://html.spec.whatwg.org/multipage/dom.html#auto-directionality
|
// https://html.spec.whatwg.org/multipage/dom.html#auto-directionality
|
||||||
Optional<Element::Directionality> Element::auto_directionality() const
|
Optional<Element::Directionality> Element::auto_directionality() const
|
||||||
{
|
{
|
||||||
static auto bidirectional_class_L = Unicode::bidirectional_class_from_string("L"sv);
|
|
||||||
static auto bidirectional_class_AL = Unicode::bidirectional_class_from_string("AL"sv);
|
|
||||||
static auto bidirectional_class_R = Unicode::bidirectional_class_from_string("R"sv);
|
|
||||||
|
|
||||||
// AD-HOC: Assume 'ltr' if Unicode data generation is disabled.
|
|
||||||
if (!bidirectional_class_L.has_value())
|
|
||||||
return Directionality::Ltr;
|
|
||||||
|
|
||||||
// https://html.spec.whatwg.org/multipage/dom.html#text-node-directionality
|
// https://html.spec.whatwg.org/multipage/dom.html#text-node-directionality
|
||||||
auto text_node_directionality = [](Text const& text_node) -> Optional<Directionality> {
|
auto text_node_directionality = [](Text const& text_node) -> Optional<Directionality> {
|
||||||
// 1. If text's data does not contain a code point whose bidirectional character type is L, AL, or R, then return null.
|
// 1. If text's data does not contain a code point whose bidirectional character type is L, AL, or R, then return null.
|
||||||
// 2. Let codePoint be the first code point in text's data whose bidirectional character type is L, AL, or R.
|
// 2. Let codePoint be the first code point in text's data whose bidirectional character type is L, AL, or R.
|
||||||
Optional<Unicode::BidirectionalClass> found_character_bidi_class;
|
Optional<Unicode::BidiClass> found_character_bidi_class;
|
||||||
for (auto code_point : Utf8View(text_node.data())) {
|
for (auto code_point : Utf8View(text_node.data())) {
|
||||||
auto bidi_class = Unicode::bidirectional_class(code_point);
|
auto bidi_class = Unicode::bidirectional_class(code_point);
|
||||||
if (first_is_one_of(bidi_class, bidirectional_class_L, bidirectional_class_AL, bidirectional_class_R)) {
|
if (first_is_one_of(bidi_class, Unicode::BidiClass::LeftToRight, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft)) {
|
||||||
found_character_bidi_class = bidi_class;
|
found_character_bidi_class = bidi_class;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2362,12 +2354,12 @@ Optional<Element::Directionality> Element::auto_directionality() const
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
// 3. If codePoint is of bidirectional character type AL or R, then return 'rtl'.
|
// 3. If codePoint is of bidirectional character type AL or R, then return 'rtl'.
|
||||||
if (first_is_one_of(*found_character_bidi_class, bidirectional_class_AL, bidirectional_class_R))
|
if (first_is_one_of(*found_character_bidi_class, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft))
|
||||||
return Directionality::Rtl;
|
return Directionality::Rtl;
|
||||||
|
|
||||||
// 4. If codePoint is of bidirectional character type L, then return 'ltr'.
|
// 4. If codePoint is of bidirectional character type L, then return 'ltr'.
|
||||||
// NOTE: codePoint should always be of bidirectional character type L by this point, so we can just return 'ltr' here.
|
// NOTE: codePoint should always be of bidirectional character type L by this point, so we can just return 'ltr' here.
|
||||||
VERIFY(*found_character_bidi_class == bidirectional_class_L);
|
VERIFY(*found_character_bidi_class == Unicode::BidiClass::LeftToRight);
|
||||||
return Directionality::Ltr;
|
return Directionality::Ltr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2381,9 +2373,9 @@ Optional<Element::Directionality> Element::auto_directionality() const
|
||||||
// and there is no character of bidirectional character type L anywhere before it in the element's value, then return 'rtl'.
|
// and there is no character of bidirectional character type L anywhere before it in the element's value, then return 'rtl'.
|
||||||
for (auto code_point : Utf8View(value)) {
|
for (auto code_point : Utf8View(value)) {
|
||||||
auto bidi_class = Unicode::bidirectional_class(code_point);
|
auto bidi_class = Unicode::bidirectional_class(code_point);
|
||||||
if (bidi_class == bidirectional_class_L)
|
if (bidi_class == Unicode::BidiClass::LeftToRight)
|
||||||
break;
|
break;
|
||||||
if (bidi_class == bidirectional_class_AL || bidi_class == bidirectional_class_R)
|
if (bidi_class == Unicode::BidiClass::RightToLeftArabic || bidi_class == Unicode::BidiClass::RightToLeft)
|
||||||
return Directionality::Rtl;
|
return Directionality::Rtl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue